This commit is contained in:
Whatcookie 2026-05-12 19:24:24 +02:00 committed by GitHub
commit 80970d092a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 553 additions and 18 deletions

View File

@ -543,9 +543,15 @@ public:
// Add module (path to obj cache dir)
void add(std::unique_ptr<llvm::Module> _module, const std::string& path);
// Returns false after LLVM fatal recovery. The compiler must be discarded.
bool try_add(std::unique_ptr<llvm::Module> _module, const std::string& path, std::string& error);
// Add module (not cached)
void add(std::unique_ptr<llvm::Module> _module);
// Returns false after LLVM fatal recovery. The compiler must be discarded.
bool try_add(std::unique_ptr<llvm::Module> _module, std::string& error);
// Add object (path to obj file)
bool add(const std::string& path);
@ -558,6 +564,9 @@ public:
// Finalize
void fin();
// Returns false after LLVM fatal recovery. The compiler must be discarded.
bool try_fin(std::string& error);
// Get compiled function address
u64 get(const std::string& name);

View File

@ -12,6 +12,10 @@
#include <charconv>
#if defined(__APPLE__)
#include <pthread.h>
#endif
LOG_CHANNEL(jit_log, "JIT");
#ifdef LLVM_AVAILABLE
@ -50,6 +54,44 @@ LOG_CHANNEL(jit_log, "JIT");
#include "Emu/CPU/Backends/AArch64/AArch64Common.h"
#endif
namespace
{
// Destination for a message captured by the LLVM fatal error handler.
// Non-null only while run_recoverable_llvm() executes on its helper thread.
thread_local std::string* g_llvm_fatal_message = nullptr;

// Runs 'func' on a disposable helper thread so that an LLVM fatal error
// (which terminates only the invoking thread via thread_ctrl::silent_exit)
// cannot take down the caller. Returns true if the helper thread finished
// normally; on failure 'error' holds the captured fatal message, or a
// generic note if the handler did not provide one.
template <typename F>
bool run_recoverable_llvm(F&& func, std::string& error)
{
error.clear();

// Run LLVM codegen in a disposable thread. If LLVM invokes the fatal
// handler, only this helper thread exits.
named_thread worker("LLVM JIT", [&]()
{
#if defined(__APPLE__)
// JIT write protection is per-thread state; allow writes on this new thread.
pthread_jit_write_protect_np(false);
#endif
g_llvm_fatal_message = &error;
std::forward<F>(func)();
g_llvm_fatal_message = nullptr;
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
});

// NOTE(review): presumably waits for the worker to complete — confirm
// named_thread::operator() semantics.
worker();

// Any state other than 'finished' means the fatal handler fired and the
// worker was terminated early (thread_state::errored via silent_exit).
const bool result = static_cast<thread_state>(worker) == thread_state::finished;

if (!result && error.empty())
{
error = "LLVM crash recovery invoked";
}

return result;
}
}
const bool jit_initialize = []() -> bool
{
llvm::InitializeNativeTarget();
@ -649,6 +691,13 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
llvm::install_fatal_error_handler([](void*, const char* msg, bool)
{
const std::string_view out = msg ? msg : "";
if (g_llvm_fatal_message)
{
*g_llvm_fatal_message = out;
thread_ctrl::silent_exit();
}
fmt::throw_exception("LLVM Emergency Exit Invoked: '%s'", out);
}, nullptr);
@ -788,6 +837,33 @@ void jit_compiler::add(std::unique_ptr<llvm::Module> _module, const std::string&
}
}
// Add module with object caching (path = obj cache dir), running codegen in a
// recoverable helper thread. Returns false after LLVM fatal recovery; 'error'
// then holds the fatal message and the compiler must be discarded.
bool jit_compiler::try_add(std::unique_ptr<llvm::Module> _module, const std::string& path, std::string& error)
{
	ObjectCache cache{path, this};
	m_engine->setObjectCache(&cache);

	// Keep a raw handle: ownership moves to the engine below.
	const auto ptr = _module.get();
	m_engine->addModule(std::move(_module));

	const bool compiled = run_recoverable_llvm([&]()
	{
		m_engine->generateCodeForModule(ptr);
	}, error);

	// Always detach the stack-allocated cache, including on the failure path:
	// leaving the engine with a dangling ObjectCache pointer would be unsafe
	// even though a failed compiler is expected to be discarded.
	m_engine->setObjectCache(nullptr);

	if (!compiled)
	{
		return false;
	}

	for (auto& func : ptr->functions())
	{
		// Delete IR to lower memory consumption
		func.deleteBody();
	}

	return true;
}
void jit_compiler::add(std::unique_ptr<llvm::Module> _module)
{
const auto ptr = _module.get();
@ -801,6 +877,28 @@ void jit_compiler::add(std::unique_ptr<llvm::Module> _module)
}
}
// Add module (not cached), running codegen in a recoverable helper thread.
// Returns false after LLVM fatal recovery; the compiler must then be discarded.
bool jit_compiler::try_add(std::unique_ptr<llvm::Module> _module, std::string& error)
{
	// Keep a raw handle before ownership is transferred to the engine.
	llvm::Module* const mod = _module.get();
	m_engine->addModule(std::move(_module));

	const bool compiled = run_recoverable_llvm([&]()
	{
		m_engine->generateCodeForModule(mod);
	}, error);

	if (compiled)
	{
		// Delete IR to lower memory consumption
		for (auto& func : mod->functions())
		{
			func.deleteBody();
		}
	}

	return compiled;
}
bool jit_compiler::add(const std::string& path)
{
auto cache = ObjectCache::load(path);
@ -852,6 +950,14 @@ void jit_compiler::fin()
m_engine->finalizeObject();
}
// Finalize the engine in a recoverable helper thread. Returns false after
// LLVM fatal recovery; 'error' then holds the fatal message and the compiler
// must be discarded.
bool jit_compiler::try_fin(std::string& error)
{
	const auto finalize = [&]()
	{
		m_engine->finalizeObject();
	};

	return run_recoverable_llvm(finalize, error);
}
u64 jit_compiler::get(const std::string& name)
{
return m_engine->getGlobalValueAddress(name);

View File

@ -2988,6 +2988,32 @@ void thread_ctrl::set_name(std::string name)
report_fatal_error(reason);
}
// Exit the current named thread as 'errored' without reporting a fatal error.
// Used by the LLVM fatal handler to abandon a disposable codegen thread.
void thread_ctrl::silent_exit() noexcept
{
if (const auto _this = g_tls_this_thread)
{
// Run the thread-local error callback before tearing the thread down.
g_tls_error_callback();

// Mark the thread as errored (not 'finished'), so callers waiting on the
// thread state can distinguish this from a normal exit.
u64 _self = _this->finalize(thread_state::errored);

if (_self == umax)
{
// Unused, detached thread support remnant
delete _this;
}

thread_base::finalize(umax);
}

// Terminate only this OS thread; never return to the (now unwound) caller.
#ifdef _WIN32
_endthreadex(0);
#else
pthread_exit(nullptr);
#endif

// Unreachable: both exits above do not return.
std::abort();
}
void thread_ctrl::detect_cpu_layout()
{
if (!g_native_core_layout.compare_and_swap_test(native_core_arrangement::undefined, native_core_arrangement::generic))

View File

@ -315,6 +315,9 @@ public:
// Exit.
[[noreturn]] static void emergency_exit(std::string_view reason);
// Exit the current named thread as errored without reporting a fatal error.
[[noreturn]] static void silent_exit() noexcept;
// Get current thread (may be nullptr)
static thread_base* get_current()
{

View File

@ -3120,6 +3120,9 @@ protected:
// ARMv8 SDOT/UDOT
bool m_use_dotprod = false;
// Allow direct TBL2/TBX2 emission.
bool m_use_tbl2 = true;
#else
// Allow FMA
bool m_use_fma = false;
@ -4030,6 +4033,131 @@ public:
});
}
#ifdef ARCH_ARM64
// Single-register AArch64 TBL: result[i] = a[b[i]] for b[i] < 16, else 0.
// When the index vector is a compile-time constant, emits a shufflevector
// instead of the intrinsic so LLVM can fold/combine it further.
template <typename T1, typename T2>
value_t<u8[16]> tbl(T1 a, T2 b)
{
value_t<u8[16]> result;
const auto data0 = a.eval(m_ir);
const auto index = b.eval(m_ir);
const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());

if (auto c = llvm::dyn_cast<llvm::Constant>(index))
{
v128 mask{};
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);

if (cv)
{
for (u32 i = 0; i < 16; i++)
{
const u64 b_val = cv->getElementAsInteger(i);
// Out-of-range lanes select index 16, i.e. the first lane of the
// all-zero second shuffle operand — matching TBL's zero result.
mask._u8[i] = (b_val < 16) ? static_cast<u8>(b_val) : static_cast<u8>(16);
}
}

// An all-zero constant index (ConstantAggregateZero) uses the zeroed mask
// as-is: every lane selects element 0 of data0, which matches TBL.
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
{
result.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
// Shuffle masks are i32 vectors; widen the byte mask before use.
result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
return result;
}
}

// Non-constant index: fall back to the NEON tbl1 intrinsic.
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
return result;
}
// Two-register AArch64 TBL: result[i] = {a,b}[idx[i]] for idx[i] < 32, else 0.
// Constant indices fold to a shufflevector + AND; otherwise uses the tbl2
// intrinsic when allowed (m_use_tbl2), or two chained tbl1 lookups.
template <typename T1, typename T2, typename T3>
value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
{
value_t<u8[16]> result;
const auto data0 = a.eval(m_ir);
const auto data1 = b.eval(m_ir);
const auto index = indices.eval(m_ir);

if (m_use_tbl2)
{
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
{
v128 mask{};
v128 bitmask{};
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);

if (cv)
{
for (u32 i = 0; i < 16; i++)
{
const u64 b_val = cv->getElementAsInteger(i);
// In-range lanes keep their index; out-of-range lanes read lane 0
// and are zeroed afterwards via the bitmask AND.
mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
}
}
else if (llvm::isa<llvm::ConstantAggregateZero>(c))
{
// All-zero index: every lane is in range, nothing gets masked out.
bitmask = v128::from8p(0xFF);
}

if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
{
auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
// Shuffle masks are i32 vectors; widen the byte mask before use.
auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
// Zero the out-of-range lanes to match TBL semantics.
auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
result.value = m_ir->CreateAnd(lookup, z_mask);
return result;
}
}

result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
return result;
}

// TBL2 disallowed: emulate with two tbl1 lookups ORed together. For a lane,
// at most one of the lookups is non-zero: tbl1(data0) zeroes indices >= 16,
// and (index - 16) wraps below 0 / stays >= 16 for lanes that should not hit
// data1, so tbl1(data1) zeroes those.
const auto data0_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type<u8[16]>(), 16));
const auto data1_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data1, data1_index });
result.value = m_ir->CreateOr(data0_lookup, data1_lookup);
return result;
}
// Single-register AArch64 TBX: in-range indices read from 'a', out-of-range
// lanes keep the corresponding byte of 'fallback'.
template <typename T1, typename T2, typename T3>
value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
{
	const auto fb = fallback.eval(m_ir);
	const auto data = a.eval(m_ir);
	const auto idx = indices.eval(m_ir);

	value_t<u8[16]> result;
	result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { fb, data, idx });
	return result;
}
// Two-register AArch64 TBX: in-range indices (< 32) read from {a,b}, others
// keep the fallback bytes. Uses the tbx2 intrinsic when allowed, otherwise
// chains two tbx1 lookups.
template <typename T1, typename T2, typename T3, typename T4>
value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
{
value_t<u8[16]> result;
const auto v_fallback = fallback.eval(m_ir);
const auto data0 = a.eval(m_ir);
const auto data1 = b.eval(m_ir);
const auto index = indices.eval(m_ir);

if (m_use_tbl2)
{
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
return result;
}

// TBX2 disallowed: chain two tbx1 ops. The first resolves indices [0,16)
// against data0 (other lanes keep the fallback). The second uses its result
// as the new fallback and resolves [16,32) against data1 via index - 16;
// all remaining lanes pass through unchanged.
const auto first_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type<u8[16]>(), 16));
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { first_lookup, data1, data1_index });
return result;
}
#endif
// (m << 3) >= 0 ? a : b
template <typename T, typename U, typename V>
static auto select_by_bit4(T&& m, U&& a, V&& b)

View File

@ -1642,6 +1642,16 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
{
const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
#ifdef ARCH_ARM64
if (op.ra == op.rb)
{
set_vr(op.vd, tbl(a, (~c & 0xf)));
return;
}
set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
return;
#else
if (op.ra == op.rb)
{
set_vr(op.vd, pshufb(a, ~c & 0xf));
@ -1657,6 +1667,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
const auto i = eval(~c & 0x1f);
set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
#endif
}
void PPUTranslator::VPKPX(ppu_opcode_t op)

View File

@ -82,6 +82,94 @@ void fmt_class_string<spu_recompiler_base::compare_direction>::format(std::strin
});
}
#ifdef ARCH_ARM64
constexpr const char s_spu_llvm_reg_scavenge_error[] = "Cannot scavenge register without an emergency spill slot";
// RAII guard: resets and publishes a compile context for the current thread,
// and clears the registration on scope exit. Non-copyable/non-movable so the
// registered pointer can never outlive (or be double-cleared by) a copy.
class spu_llvm_compile_scope
{
public:
	spu_llvm_compile_scope(spu_llvm_compile_context& context, bool use_tbl2) noexcept
	{
		// Wipe any stale error/state from a previous attempt before publishing.
		context = {};
		context.use_tbl2 = use_tbl2;
		spu_llvm_set_compile_context(&context);
	}

	spu_llvm_compile_scope(const spu_llvm_compile_scope&) = delete;
	spu_llvm_compile_scope& operator=(const spu_llvm_compile_scope&) = delete;

	~spu_llvm_compile_scope() noexcept
	{
		spu_llvm_set_compile_context(nullptr);
	}
};
// Rebuild a big-endian LS image from the program's instruction words and
// re-run the analyser from the original entry point.
static spu_program analyse_spu_llvm_program(spu_recompiler_base& compiler, const spu_program& program)
{
	std::vector<be_t<u32>> ls(SPU_LS_SIZE / sizeof(be_t<u32>));

	u32 pos = program.lower_bound;

	for (const u32 inst : program.data)
	{
		ls[pos / 4] = std::bit_cast<be_t<u32>>(inst);
		pos += 4;
	}

	return compiler.analyse(ls.data(), program.entry_point);
}
// Compile an SPU program with TBL2/TBX2 enabled; if LLVM fatally fails with
// the register-scavenging error, rebuild the JIT from scratch and retry once
// with TBL2/TBX2 disabled. Returns nullptr on any unrecoverable failure.
// Note: 'compiler' may be replaced with a fresh instance as a side effect.
static spu_function_t compile_spu_llvm_with_retry(std::unique_ptr<spu_recompiler_base>& compiler, const spu_program& program)
{
spu_llvm_compile_context context;

{
// First attempt: TBL2/TBX2 allowed.
spu_llvm_compile_scope scope(context, true);

if (const auto result = compiler->compile(spu_program{program}))
{
return result;
}
}

// Only the specific scavenging failure is retried without TBL2/TBX2.
if (context.llvm_error.find(s_spu_llvm_reg_scavenge_error) == std::string::npos)
{
if (!context.llvm_error.empty())
{
spu_log.error("LLVM failed to compile SPU block 0x%x: %s", program.entry_point, context.llvm_error);
}

return nullptr;
}

spu_log.warning("LLVM failed to compile SPU block 0x%x with TBL2/TBX2: %s. Retrying without TBL2/TBX2.", program.entry_point, context.llvm_error);

// LLVM fatal recovery does not unwind MCJIT state. Abandon the failed
// compiler and retry from a fresh analysis/JIT instance.
// (Deliberate leak via release(): destroying the old compiler is unsafe.)
static_cast<void>(compiler.release());
compiler = spu_recompiler_base::make_llvm_recompiler();
compiler->init();

// Re-analyse from scratch; the result must match the original program or
// the retry would compile something different from what was requested.
const auto retry_program = analyse_spu_llvm_program(*compiler, program);

if (retry_program != program)
{
spu_log.error("[0x%05x] SPU analyser failed during TBL2/TBX2 retry, %u vs %u", retry_program.entry_point, retry_program.data.size(), program.data.size());
return nullptr;
}

// Second attempt: TBL2/TBX2 disabled.
spu_llvm_compile_context retry_context;
spu_llvm_compile_scope scope(retry_context, false);
const auto result = compiler->compile(spu_program{retry_program});

if (result)
{
spu_log.notice("SPU LLVM block 0x%x compiled successfully without TBL2/TBX2.", program.entry_point);
}
else if (!retry_context.llvm_error.empty())
{
spu_log.error("LLVM failed to compile SPU block 0x%x without TBL2/TBX2: %s", program.entry_point, retry_context.llvm_error);
}

return result;
}
#endif
// Move 4 args for calling native function from a GHC calling convention function
#if defined(ARCH_X64)
static u8* move_args_ghc_to_native(u8* raw)
@ -906,6 +994,15 @@ void spu_cache::initialize(bool build_existing_cache)
compiler->init();
auto compile_program = [&](spu_program&& program) -> spu_function_t
{
#ifdef ARCH_ARM64
return compile_spu_llvm_with_retry(compiler, program);
#else
return compiler->compile(std::move(program));
#endif
};
// Counter for error reporting
u32 logged_error = 0;
@ -977,7 +1074,7 @@ void spu_cache::initialize(bool build_existing_cache)
logged_error++;
}
}
else if (!compiler->compile(std::move(func2)))
else if (!compile_program(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
@ -1075,7 +1172,7 @@ void spu_cache::initialize(bool build_existing_cache)
const u32 last_inst = std::bit_cast<be_t<u32>>(func2.data.back());
const u32 prog_size = ::size32(func2.data);
if (!compiler->compile(std::move(func2)))
if (!compile_program(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
@ -2096,7 +2193,12 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
return;
}
const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr<u32>(0), spu.pc));
auto program = spu.jit->analyse(spu._ptr<u32>(0), spu.pc);
#ifdef ARCH_ARM64
const auto func = compile_spu_llvm_with_retry(spu.jit, program);
#else
const auto func = spu.jit->compile(std::move(program));
#endif
if (!func)
{
@ -8903,8 +9005,20 @@ struct spu_llvm_worker
{
spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
}
else if (const auto target = compiler->compile(std::move(func2)))
else
{
#ifdef ARCH_ARM64
const auto target = compile_spu_llvm_with_retry(compiler, func2);
#else
const auto target = compiler->compile(std::move(func2));
#endif
if (!target)
{
spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
break;
}
// Redirect old function (TODO: patch in multiple places)
const s64 rel = reinterpret_cast<u64>(target) - prog->first - 5;
@ -8922,11 +9036,6 @@ struct spu_llvm_worker
atomic_storage<u64>::release(*reinterpret_cast<u64*>(prog->first), result);
}
else
{
spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
break;
}
// Clear fake LS
std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));

View File

@ -62,6 +62,16 @@ const extern spu_decoder<spu_iflag> g_spu_iflag;
#ifdef ARCH_ARM64
#include "Emu/CPU/Backends/AArch64/AArch64JIT.h"
namespace
{
// Per-thread compile context registered for the duration of one SPU LLVM
// compilation (see spu_llvm_compile_scope). Null when no compile is active.
thread_local spu_llvm_compile_context* g_spu_llvm_compile_context = nullptr;
}

// Register (or clear, with nullptr) the calling thread's compile context.
void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept
{
g_spu_llvm_compile_context = context;
}
#endif
class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
@ -1669,6 +1679,15 @@ public:
m_hash_start = hash_start;
}
#ifdef ARCH_ARM64
m_use_tbl2 = !g_spu_llvm_compile_context || g_spu_llvm_compile_context->use_tbl2;
if (g_spu_llvm_compile_context)
{
g_spu_llvm_compile_context->llvm_error.clear();
}
#endif
spu_log.notice("Building function 0x%x... (size %u, %s)", func.entry_point, func.data.size(), m_hash);
m_pos = func.lower_bound;
@ -3478,17 +3497,63 @@ public:
} _jit_guard;
#endif
if (g_cfg.core.spu_debug)
{
// Testing only
m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
}
else
{
m_jit.add(std::move(_module));
}
#ifdef ARCH_ARM64
const bool recoverable = !!g_spu_llvm_compile_context;
m_jit.fin();
if (recoverable)
{
bool added = false;
std::string& llvm_error = g_spu_llvm_compile_context->llvm_error;
if (g_cfg.core.spu_debug)
{
// Testing only
added = m_jit.try_add(std::move(_module), m_spurt->get_cache_path() + "llvm/", llvm_error);
}
else
{
added = m_jit.try_add(std::move(_module), llvm_error);
}
if (!added || !m_jit.try_fin(llvm_error))
{
if (add_to_file)
{
add_loc->cached = 0;
}
return nullptr;
}
}
else
{
if (g_cfg.core.spu_debug)
{
// Testing only
m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
}
else
{
m_jit.add(std::move(_module));
}
m_jit.fin();
}
#else
if (g_cfg.core.spu_debug)
{
// Testing only
m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
}
else
{
m_jit.add(std::move(_module));
}
m_jit.fin();
#endif
}
// Register function pointer
const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
@ -6757,6 +6822,73 @@ public:
const auto a = get_vr<u8[16]>(op.ra);
const auto b = get_vr<u8[16]>(op.rb);
#ifdef ARCH_ARM64
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
{
if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
{
if (op.ra == op.rb)
{
if (perm_only)
{
const auto cm = eval(c & 0x0f);
set_vr(op.rt4, tbl(as, cm));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto cm = eval(c & 0x8f);
set_vr(op.rt4, tbx(x, as, cm));
return;
}
if (perm_only)
{
const auto cm = eval(c & 0x1f);
set_vr(op.rt4, tbl2(as, bs, cm));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto cm = eval(c & 0x9f);
set_vr(op.rt4, tbx2(x, as, bs, cm));
return;
}
}
if (op.ra == op.rb && !m_interp_magn)
{
if (perm_only)
{
const auto cm = eval(c & 0x0f);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbl(a, cr));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto cm = eval(c & 0x8f);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbx(x, a, cr));
return;
}
if (perm_only)
{
const auto cm = eval(c & 0x9f);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbl2(a, b, cr));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
// AND should be before XOR so that llvm can combine them into BCAX
// Though for some reason it doesn't seem to be doing that.
const auto cm = eval(c & ~0x60);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbx2(x, a, b, cr));
return;
#else
// Data with swapped endian from a load instruction
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
{
@ -6900,6 +7032,7 @@ public:
set_vr(op.rt4, select_by_bit4(cr, ax, bx));
else
set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
#endif
}
void MPYA(spu_opcode_t op)

View File

@ -77,6 +77,16 @@ struct spu_program
bool operator<(const spu_program& rhs) const noexcept;
};
#ifdef ARCH_ARM64
// Per-compilation state shared between the retry driver and the recompiler.
struct spu_llvm_compile_context
{
	// Whether TBL2/TBX2 emission is allowed for this compilation attempt.
	bool use_tbl2 = true;
	// Fatal error message captured from LLVM recovery; empty on success.
	std::string llvm_error;
};

// Register (or clear, with nullptr) the calling thread's compile context.
void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept;
#endif
class spu_item
{
public: