From 44d4d6b05644bf47a096553801ba4cec4f4942c7 Mon Sep 17 00:00:00 2001
From: Malcolm <malcolmjestadt@gmail.com>
Date: Sat, 2 May 2026 16:09:26 -0400
Subject: [PATCH 1/2] PPU/SPU LLVM: Use arm shuffles in recompilers instead of
 emulating x86 pshufb

> - SHUFB from 9 instructions down to 5
> - Though it should be 4 if LLVM would just emit BCAX...
---
 rpcs3/Emu/CPU/CPUTranslator.h        | 106 +++++++++++++++++++++++++++
 rpcs3/Emu/Cell/PPUTranslator.cpp     |  11 +++
 rpcs3/Emu/Cell/SPULLVMRecompiler.cpp |  68 +++++++++++++++++
 3 files changed, 185 insertions(+)
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index ab2aed8156..d047be63bb 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -4030,6 +4030,112 @@ public:
 		});
 	}
 
+#ifdef ARCH_ARM64
+	template <typename T1, typename T2>
+	value_t<u8[16]> tbl(T1 a, T2 b)
+	{
+		// One-register byte table lookup. A plain constant index vector is lowered to a generic
+		// shufflevector against zero; anything else goes through the aarch64 TBL1 intrinsic.
+		value_t<u8[16]> out;
+		const auto table = a.eval(m_ir);
+		const auto lanes = b.eval(m_ir);
+		const auto known = llvm::dyn_cast<llvm::ConstantDataVector>(lanes);
+
+		if (!known && !llvm::isa<llvm::ConstantAggregateZero>(lanes))
+		{
+			// Indices are not a plain constant vector
+			out.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { table, lanes });
+			return out;
+		}
+
+		// Shuffle mask; it stays all-zero (every lane reads table[0]) for a zeroinitializer index
+		v128 sel{};
+
+		for (u32 i = 0; known && i < 16; i++)
+		{
+			// Indices of 16 and above are redirected to lane 16, the first lane of the zero operand
+			const u64 idx = known->getElementAsInteger(i);
+			sel._u8[i] = (idx < 16) ? static_cast<u8>(idx) : static_cast<u8>(16);
+		}
+
+		const auto zero_vec = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
+
+		out.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&sel), 16));
+		out.value = m_ir->CreateZExt(out.value, get_type<u32[16]>());
+		out.value = m_ir->CreateShuffleVector(table, zero_vec, out.value);
+		return out;
+	}
+
+	template <typename T1, typename T2, typename T3>
+	value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices) // Two-register byte table lookup: constant indices below 32 become a shuffle of a and b, other constant indices yield 0; non-constant indices use the aarch64 TBL2 intrinsic
+	{
+		value_t<u8[16]> result;
+		const auto data0 = a.eval(m_ir);
+		const auto data1 = b.eval(m_ir);
+		const auto index = indices.eval(m_ir);
+
+		if (auto c = llvm::dyn_cast<llvm::Constant>(index))
+		{
+			v128 mask{};
+			v128 bitmask{};
+			const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
+
+			if (cv)
+			{
+				for (u32 i = 0; i < 16; i++)
+				{
+					const u64 b_val = cv->getElementAsInteger(i);
+					mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
+					bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
+				}
+			}
+			else if (llvm::isa<llvm::ConstantAggregateZero>(c))
+			{
+				bitmask = v128::from8p(0xFF);
+			}
+
+			if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
+			{
+				auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
+				auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
+				auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
+
+				auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
+				result.value = m_ir->CreateAnd(lookup, z_mask);
+				return result;
+			}
+		}
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
+		return result;
+	}
+
+	template <typename T1, typename T2, typename T3>
+	value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
+	{
+		// Operands are evaluated in declaration order, then handed unchanged to the aarch64 TBX1 intrinsic
+		const auto keep = fallback.eval(m_ir);
+		const auto table = a.eval(m_ir);
+		const auto lanes = indices.eval(m_ir);
+		value_t<u8[16]> out;
+		out.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { keep, table, lanes });
+		return out;
+	}
+
+	template <typename T1, typename T2, typename T3, typename T4>
+	value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices) // Emits the aarch64 TBX2 intrinsic with (fallback, a, b, indices) as its operands
+	{
+		value_t<u8[16]> result;
+		const auto v_fallback = fallback.eval(m_ir);
+		const auto data0 = a.eval(m_ir);
+		const auto data1 = b.eval(m_ir);
+		const auto index = indices.eval(m_ir);
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
+		return result;
+	}
+#endif
+
 	// (m << 3) >= 0 ? a : b
 	template <typename T, typename U, typename V>
 	static auto select_by_bit4(T&& m, U&& a, V&& b)
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index 0205715328..9125aa5eeb 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -1642,6 +1642,16 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 {
 	const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
 
+#ifdef ARCH_ARM64
+	if (op.ra == op.rb)
+	{
+		set_vr(op.vd, tbl(a, (~c & 0xf)));
+		return;
+	}
+
+	set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
+	return;
+#else
 	if (op.ra == op.rb)
 	{
 		set_vr(op.vd, pshufb(a, ~c & 0xf));
@@ -1657,6 +1667,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 
 	const auto i = eval(~c & 0x1f);
 	set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
+#endif
 }
 
 void PPUTranslator::VPKPX(ppu_opcode_t op)
diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
index 927d7ac187..a99762593e 100644
--- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@@ -6757,6 +6757,73 @@ public:
 		const auto a = get_vr<u8[16]>(op.ra);
 		const auto b = get_vr<u8[16]>(op.rb);
 
+#ifdef ARCH_ARM64
+		if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
+		{
+			if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
+			{
+				if (op.ra == op.rb)
+				{
+					if (perm_only)
+					{
+						const auto cm = eval(c & 0x0f);
+						set_vr(op.rt4, tbl(as, cm));
+						return;
+					}
+
+					const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+					const auto cm = eval(c & 0x8f);
+					set_vr(op.rt4, tbx(x, as, cm));
+					return;
+				}
+
+				if (perm_only)
+				{
+					const auto cm = eval(c & 0x1f);
+					set_vr(op.rt4, tbl2(as, bs, cm));
+					return;
+				}
+
+				const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+				const auto cm = eval(c & 0x9f);
+				set_vr(op.rt4, tbx2(x, as, bs, cm));
+				return;
+			}
+		}
+
+		if (op.ra == op.rb && !m_interp_magn)
+		{
+			if (perm_only)
+			{
+				const auto cm = eval(c & 0x0f);
+				const auto cr = eval(cm ^ 0x0f);
+				set_vr(op.rt4, tbl(a, cr));
+				return;
+			}
+
+			const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+			const auto cm = eval(c & 0x8f);
+			const auto cr = eval(cm ^ 0x0f);
+			set_vr(op.rt4, tbx(x, a, cr));
+			return;
+		}
+
+		if (perm_only)
+		{
+			const auto cm = eval(c & 0x9f);
+			const auto cr = eval(cm ^ 0x0f);
+			set_vr(op.rt4, tbl2(a, b, cr));
+			return;
+		}
+
+		const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+		// AND should be before XOR so that llvm can combine them into BCAX
+		// Though for some reason it doesn't seem to be doing that.
+		const auto cm = eval(c & ~0x60);
+		const auto cr = eval(cm ^ 0x0f);
+		set_vr(op.rt4, tbx2(x, a, b, cr));
+		return;
+#else
 		// Data with swapped endian from a load instruction
 		if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
 		{
@@ -6900,6 +6967,7 @@ public:
 			set_vr(op.rt4, select_by_bit4(cr, ax, bx));
 		else
 			set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
+#endif
 	}
 
 	void MPYA(spu_opcode_t op)

From 9bd65faa85e7d11706f4c4091709785427d3c53b Mon Sep 17 00:00:00 2001
From: Malcolm <malcolmjestadt@gmail.com>
Date: Sat, 2 May 2026 16:10:24 -0400
Subject: [PATCH 2/2] SPU LLVM: Retry ARM64 TBL2 register scavenger failures

- Some SPU programs fail to compile when TBL2/TBX2 are used: LLVM aborts with "Cannot scavenge register without an emergency spill slot".
- As a workaround, first try to compile with TBL2/TBX2; if LLVM hits that fatal error, discard the compiler and compile the same program again without TBL2/TBX2.
---
 Utilities/JIT.h                        |   9 ++
 Utilities/JITLLVM.cpp                  | 106 +++++++++++++++++++++
 Utilities/Thread.cpp                   |  26 +++++
 Utilities/Thread.h                     |   3 +
 rpcs3/Emu/CPU/CPUTranslator.h          |  74 +++++++++-----
 rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 127 +++++++++++++++++++++++--
 rpcs3/Emu/Cell/SPULLVMRecompiler.cpp   |  83 ++++++++++++++--
 rpcs3/Emu/Cell/SPURecompiler.h         |  10 ++
 8 files changed, 394 insertions(+), 44 deletions(-)

diff --git a/Utilities/JIT.h b/Utilities/JIT.h
index 86fc72ed55..d18f795563 100644
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@@ -543,9 +543,15 @@ public:
 	// Add module (path to obj cache dir)
 	void add(std::unique_ptr<llvm::Module> _module, const std::string& path);
 
+	// Recoverable add (path to obj cache dir): returns false and sets error after an LLVM fatal error. The compiler must then be discarded.
+	bool try_add(std::unique_ptr<llvm::Module> _module, const std::string& path, std::string& error);
+
 	// Add module (not cached)
 	void add(std::unique_ptr<llvm::Module> _module);
 
+	// Recoverable add (not cached): returns false and sets error after an LLVM fatal error. The compiler must then be discarded.
+	bool try_add(std::unique_ptr<llvm::Module> _module, std::string& error);
+
 	// Add object (path to obj file)
 	bool add(const std::string& path);
 
@@ -558,6 +564,9 @@ public:
 	// Finalize
 	void fin();
 
+	// Recoverable finalize: returns false and sets error after an LLVM fatal error. The compiler must then be discarded.
+	bool try_fin(std::string& error);
+
 	// Get compiled function address
 	u64 get(const std::string& name);
 
diff --git a/Utilities/JITLLVM.cpp b/Utilities/JITLLVM.cpp
index 34e1572185..31fb769c5a 100644
--- a/Utilities/JITLLVM.cpp
+++ b/Utilities/JITLLVM.cpp
@@ -12,6 +12,10 @@
 
 #include <charconv>
 
+#if defined(__APPLE__)
+#include <pthread.h>
+#endif
+
 LOG_CHANNEL(jit_log, "JIT");
 
 #ifdef LLVM_AVAILABLE
@@ -50,6 +54,44 @@ LOG_CHANNEL(jit_log, "JIT");
 #include "Emu/CPU/Backends/AArch64/AArch64Common.h"
 #endif
 
+namespace
+{
+	// Non-null only on a helper thread running recoverable codegen; the fatal error handler writes its message here
+	thread_local std::string* g_llvm_fatal_message = nullptr;
+
+	// Run func on a disposable helper thread and join it.
+	// Returns false, with a non-empty error, if that thread did not finish normally.
+	template <typename F>
+	bool run_recoverable_llvm(F&& func, std::string& error)
+	{
+		error.clear();
+
+		// If LLVM invokes the fatal handler, only this helper thread exits
+		named_thread helper("LLVM JIT", [&]()
+		{
+#if defined(__APPLE__)
+			pthread_jit_write_protect_np(false);
+#endif
+			g_llvm_fatal_message = &error;
+			std::forward<F>(func)();
+			g_llvm_fatal_message = nullptr;
+#if defined(__APPLE__)
+			pthread_jit_write_protect_np(true);
+#endif
+		});
+
+		helper();
+		const bool finished = static_cast<thread_state>(helper) == thread_state::finished;
+
+		if (error.empty() && !finished)
+		{
+			error = "LLVM crash recovery invoked";
+		}
+
+		return finished;
+	}
+}
+
 const bool jit_initialize = []() -> bool
 {
 	llvm::InitializeNativeTarget();
@@ -649,6 +691,13 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
 		llvm::install_fatal_error_handler([](void*, const char* msg, bool)
 		{
 			const std::string_view out = msg ? msg : "";
+
+			if (g_llvm_fatal_message)
+			{
+				*g_llvm_fatal_message = out;
+				thread_ctrl::silent_exit();
+			}
+
 			fmt::throw_exception("LLVM Emergency Exit Invoked: '%s'", out);
 		}, nullptr);
 
@@ -788,6 +837,33 @@ void jit_compiler::add(std::unique_ptr<llvm::Module> _module, const std::string&
 	}
 }
 
+bool jit_compiler::try_add(std::unique_ptr<llvm::Module> _module, const std::string& path, std::string& error)
+{
+	// The object cache is installed only around code generation
+	ObjectCache obj_cache{path, this};
+	m_engine->setObjectCache(&obj_cache);
+
+	llvm::Module* const mod = _module.get();
+	m_engine->addModule(std::move(_module));
+
+	const auto codegen = [&]() { m_engine->generateCodeForModule(mod); };
+	if (!run_recoverable_llvm(codegen, error))
+	{
+		// NOTE: the engine still refers to obj_cache here; the caller must not reuse this compiler
+		return false;
+	}
+
+	m_engine->setObjectCache(nullptr);
+
+	// Delete IR to lower memory consumption
+	for (auto& fn : mod->functions())
+	{
+		fn.deleteBody();
+	}
+
+	return true;
+}
+
 void jit_compiler::add(std::unique_ptr<llvm::Module> _module)
 {
 	const auto ptr = _module.get();
@@ -801,6 +877,28 @@ void jit_compiler::add(std::unique_ptr<llvm::Module> _module)
 	}
 }
 
+bool jit_compiler::try_add(std::unique_ptr<llvm::Module> _module, std::string& error)
+{
+	// Keep a raw pointer: ownership of the module moves to the engine
+	llvm::Module* const mod = _module.get();
+	m_engine->addModule(std::move(_module));
+
+	const auto codegen = [&]() { m_engine->generateCodeForModule(mod); };
+	const bool generated = run_recoverable_llvm(codegen, error);
+
+	if (generated)
+	{
+		// Delete IR to lower memory consumption
+		for (auto& fn : mod->functions())
+		{
+			fn.deleteBody();
+		}
+	}
+
+	// False means code generation did not finish and `error` is non-empty
+	return generated;
+}
+
 bool jit_compiler::add(const std::string& path)
 {
 	auto cache = ObjectCache::load(path);
@@ -852,6 +950,14 @@ void jit_compiler::fin()
 	m_engine->finalizeObject();
 }
 
+bool jit_compiler::try_fin(std::string& error)
+{
+	// Finalize on a helper thread so that an LLVM fatal error only ends that thread and is reported through `error`
+	const auto finalize = [this]() { m_engine->finalizeObject(); };
+
+	return run_recoverable_llvm(finalize, error);
+}
+
 u64 jit_compiler::get(const std::string& name)
 {
 	return m_engine->getGlobalValueAddress(name);
diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp
index 57d7446daf..fee498cc0e 100644
--- a/Utilities/Thread.cpp
+++ b/Utilities/Thread.cpp
@@ -2988,6 +2988,32 @@ void thread_ctrl::set_name(std::string name)
 	report_fatal_error(reason);
 }
 
+void thread_ctrl::silent_exit() noexcept // Finalizes the current named thread (if any) as errored, then ends the calling OS thread; never returns
+{
+	if (const auto _this = g_tls_this_thread)
+	{
+		g_tls_error_callback();
+
+		u64 _self = _this->finalize(thread_state::errored); // Presumably publishes the errored state that joiners observe - confirm against thread_base::finalize
+
+		if (_self == umax)
+		{
+			// Unused, detached thread support remnant
+			delete _this;
+		}
+
+		thread_base::finalize(umax);
+	}
+
+#ifdef _WIN32
+	_endthreadex(0);
+#else
+	pthread_exit(nullptr); // NOTE(review): glibc implements pthread_exit with forced stack unwinding - confirm this is safe from a noexcept function
+#endif
+
+	std::abort(); // Only reached if the thread exit call above returned
+}
+
 void thread_ctrl::detect_cpu_layout()
 {
 	if (!g_native_core_layout.compare_and_swap_test(native_core_arrangement::undefined, native_core_arrangement::generic))
diff --git a/Utilities/Thread.h b/Utilities/Thread.h
index bafcea0b9f..b08f05fe82 100644
--- a/Utilities/Thread.h
+++ b/Utilities/Thread.h
@@ -315,6 +315,9 @@ public:
 	// Exit.
 	[[noreturn]] static void emergency_exit(std::string_view reason);
 
+	// Exit the current thread without reporting a fatal error; a named thread is first finalized as errored.
+	[[noreturn]] static void silent_exit() noexcept;
+
 	// Get current thread (may be nullptr)
 	static thread_base* get_current()
 	{
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index d047be63bb..7a037b2a00 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -3120,6 +3120,9 @@ protected:
 
 	// ARMv8 SDOT/UDOT
 	bool m_use_dotprod = false;
+
+	// Allow direct TBL2/TBX2 emission.
+	bool m_use_tbl2 = true;
 #else
 	// Allow FMA
 	bool m_use_fma = false;
@@ -4074,39 +4077,49 @@ public:
 		const auto data1 = b.eval(m_ir);
 		const auto index = indices.eval(m_ir);
 
-		if (auto c = llvm::dyn_cast<llvm::Constant>(index))
+		if (m_use_tbl2)
 		{
-			v128 mask{};
-			v128 bitmask{};
-			const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
-
-			if (cv)
+			if (auto c = llvm::dyn_cast<llvm::Constant>(index))
 			{
-				for (u32 i = 0; i < 16; i++)
+				v128 mask{};
+				v128 bitmask{};
+				const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
+
+				if (cv)
 				{
-					const u64 b_val = cv->getElementAsInteger(i);
-					mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
-					bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
+					for (u32 i = 0; i < 16; i++)
+					{
+						const u64 b_val = cv->getElementAsInteger(i);
+						mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
+						bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
+					}
+				}
+				else if (llvm::isa<llvm::ConstantAggregateZero>(c))
+				{
+					bitmask = v128::from8p(0xFF);
+				}
+
+				if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
+				{
+					auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
+					auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
+					auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
+
+					auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
+					result.value = m_ir->CreateAnd(lookup, z_mask);
+					return result;
 				}
 			}
-			else if (llvm::isa<llvm::ConstantAggregateZero>(c))
-			{
-				bitmask = v128::from8p(0xFF);
-			}
 
-			if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
-			{
-				auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
-				auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
-				auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
-
-				auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
-				result.value = m_ir->CreateAnd(lookup, z_mask);
-				return result;
-			}
+			result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
+			return result;
 		}
 
-		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
+		const auto data0_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
+		const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type<u8[16]>(), 16));
+		const auto data1_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data1, data1_index });
+
+		result.value = m_ir->CreateOr(data0_lookup, data1_lookup);
 		return result;
 	}
 
@@ -4131,7 +4144,16 @@ public:
 		const auto data1 = b.eval(m_ir);
 		const auto index = indices.eval(m_ir);
 
-		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
+		if (m_use_tbl2)
+		{
+			result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
+			return result;
+		}
+
+		const auto first_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
+		const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type<u8[16]>(), 16));
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { first_lookup, data1, data1_index });
 		return result;
 	}
 #endif
diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
index dffca21cae..1f02d6291b 100644
--- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
@@ -82,6 +82,94 @@ void fmt_class_string<spu_recompiler_base::compare_direction>::format(std::strin
 	});
 }
 
+#ifdef ARCH_ARM64
+constexpr const char s_spu_llvm_reg_scavenge_error[] = "Cannot scavenge register without an emergency spill slot";
+
+class spu_llvm_compile_scope
+{
+public:
+	// Resets `context` to defaults with the requested TBL2/TBX2 mode and installs it via spu_llvm_set_compile_context
+	spu_llvm_compile_scope(spu_llvm_compile_context& context, bool use_tbl2) noexcept
+	{
+		context = spu_llvm_compile_context{.use_tbl2 = use_tbl2};
+		spu_llvm_set_compile_context(&context);
+	}
+
+	~spu_llvm_compile_scope() noexcept
+	{
+		spu_llvm_set_compile_context(nullptr);
+	}
+};
+
+static spu_program analyse_spu_llvm_program(spu_recompiler_base& compiler, const spu_program& program)
+{
+	// Rebuild a local storage image with this program's words placed at lower_bound, then re-run the analyser on it
+	std::vector<be_t<u32>> image(SPU_LS_SIZE / sizeof(be_t<u32>));
+	u32 slot = program.lower_bound / 4;
+	for (const auto& word : program.data)
+	{
+		image[slot++] = std::bit_cast<be_t<u32>>(word);
+	}
+	return compiler.analyse(image.data(), program.entry_point);
+}
+
+static spu_function_t compile_spu_llvm_with_retry(std::unique_ptr<spu_recompiler_base>& compiler, const spu_program& program)
+{
+	// Compile with TBL2/TBX2 first; on the register scavenger fatal error, compile once more without them
+	spu_llvm_compile_context context;
+
+	{
+		spu_llvm_compile_scope scope(context, true);
+		const auto result = compiler->compile(spu_program{program});
+
+		if (result || context.llvm_error.empty())
+		{
+			// Success, or an ordinary failure that did not go through LLVM fatal recovery
+			return result;
+		}
+	}
+
+	// LLVM fatal recovery does not unwind MCJIT state: always abandon (leak) the failed compiler, retry or not
+	static_cast<void>(compiler.release());
+	compiler = spu_recompiler_base::make_llvm_recompiler();
+	compiler->init();
+
+	if (context.llvm_error.find(s_spu_llvm_reg_scavenge_error) == std::string::npos)
+	{
+		spu_log.error("LLVM failed to compile SPU block 0x%x: %s", program.entry_point, context.llvm_error);
+		return nullptr;
+	}
+
+	spu_log.warning("LLVM failed to compile SPU block 0x%x with TBL2/TBX2: %s. Retrying without TBL2/TBX2.", program.entry_point, context.llvm_error);
+	const auto retry_program = analyse_spu_llvm_program(*compiler, program);
+
+	if (retry_program != program)
+	{
+		spu_log.error("[0x%05x] SPU analyser failed during TBL2/TBX2 retry, %u vs %u", retry_program.entry_point, retry_program.data.size(), program.data.size());
+		return nullptr;
+	}
+
+	spu_llvm_compile_context retry_context;
+	spu_llvm_compile_scope scope(retry_context, false);
+	const auto result = compiler->compile(spu_program{retry_program});
+
+	if (result)
+	{
+		spu_log.notice("SPU LLVM block 0x%x compiled successfully without TBL2/TBX2.", program.entry_point);
+	}
+	else if (!retry_context.llvm_error.empty())
+	{
+		spu_log.error("LLVM failed to compile SPU block 0x%x without TBL2/TBX2: %s", program.entry_point, retry_context.llvm_error);
+		// The retry compiler went through fatal recovery as well, so it must not be reused either
+		static_cast<void>(compiler.release());
+		compiler = spu_recompiler_base::make_llvm_recompiler();
+		compiler->init();
+	}
+
+	return result;
+}
+#endif
+
 // Move 4 args for calling native function from a GHC calling convention function
 #if defined(ARCH_X64)
 static u8* move_args_ghc_to_native(u8* raw)
@@ -906,6 +994,15 @@ void spu_cache::initialize(bool build_existing_cache)
 
 		compiler->init();
 
+		auto compile_program = [&](spu_program&& program) -> spu_function_t
+		{
+#ifdef ARCH_ARM64
+			return compile_spu_llvm_with_retry(compiler, program);
+#else
+			return compiler->compile(std::move(program));
+#endif
+		};
+
 		// Counter for error reporting
 		u32 logged_error = 0;
 
@@ -977,7 +1074,7 @@ void spu_cache::initialize(bool build_existing_cache)
 					logged_error++;
 				}
 			}
-			else if (!compiler->compile(std::move(func2)))
+			else if (!compile_program(std::move(func2)))
 			{
 				// Likely, out of JIT memory. Signal to prevent further building.
 				fail_flag |= 1;
@@ -1075,7 +1172,7 @@ void spu_cache::initialize(bool build_existing_cache)
 				const u32 last_inst = std::bit_cast<be_t<u32>>(func2.data.back());
 				const u32 prog_size = ::size32(func2.data);
 
-				if (!compiler->compile(std::move(func2)))
+				if (!compile_program(std::move(func2)))
 				{
 					// Likely, out of JIT memory. Signal to prevent further building.
 					fail_flag |= 1;
@@ -2096,7 +2193,12 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 		return;
 	}
 
-	const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr<u32>(0), spu.pc));
+	auto program = spu.jit->analyse(spu._ptr<u32>(0), spu.pc);
+#ifdef ARCH_ARM64
+	const auto func = compile_spu_llvm_with_retry(spu.jit, program);
+#else
+	const auto func = spu.jit->compile(std::move(program));
+#endif
 
 	if (!func)
 	{
@@ -8903,8 +9005,20 @@ struct spu_llvm_worker
 			{
 				spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
 			}
-			else if (const auto target = compiler->compile(std::move(func2)))
+			else
 			{
+#ifdef ARCH_ARM64
+				const auto target = compile_spu_llvm_with_retry(compiler, func2);
+#else
+				const auto target = compiler->compile(std::move(func2));
+#endif
+
+				if (!target)
+				{
+					spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
+					break;
+				}
+
 				// Redirect old function (TODO: patch in multiple places)
 				const s64 rel = reinterpret_cast<u64>(target) - prog->first - 5;
 
@@ -8922,11 +9036,6 @@ struct spu_llvm_worker
 
 				atomic_storage<u64>::release(*reinterpret_cast<u64*>(prog->first), result);
 			}
-			else
-			{
-				spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
-				break;
-			}
 
 			// Clear fake LS
 			std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));
diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
index a99762593e..806c53e6d0 100644
--- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@@ -62,6 +62,16 @@ const extern spu_decoder<spu_iflag> g_spu_iflag;
 
 #ifdef ARCH_ARM64
 #include "Emu/CPU/Backends/AArch64/AArch64JIT.h"
+
+namespace
+{
+	thread_local spu_llvm_compile_context* g_spu_llvm_compile_context = nullptr; // Per-thread compile context; nullptr when none is installed
+}
+
+void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept // Installs (or, with nullptr, removes) the calling thread's compile context
+{
+	g_spu_llvm_compile_context = context;
+}
 #endif
 
 class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
@@ -1669,6 +1679,15 @@ public:
 			m_hash_start = hash_start;
 		}
 
+#ifdef ARCH_ARM64
+		m_use_tbl2 = !g_spu_llvm_compile_context || g_spu_llvm_compile_context->use_tbl2;
+
+		if (g_spu_llvm_compile_context)
+		{
+			g_spu_llvm_compile_context->llvm_error.clear();
+		}
+#endif
+
 		spu_log.notice("Building function 0x%x... (size %u, %s)", func.entry_point, func.data.size(), m_hash);
 
 		m_pos = func.lower_bound;
@@ -3478,17 +3497,63 @@ public:
 		} _jit_guard;
 #endif
 
-		if (g_cfg.core.spu_debug)
 		{
-			// Testing only
-			m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
-		}
-		else
-		{
-			m_jit.add(std::move(_module));
-		}
+#ifdef ARCH_ARM64
+			const bool recoverable = !!g_spu_llvm_compile_context;
 
-		m_jit.fin();
+			if (recoverable)
+			{
+				bool added = false;
+				std::string& llvm_error = g_spu_llvm_compile_context->llvm_error;
+
+				if (g_cfg.core.spu_debug)
+				{
+					// Testing only
+					added = m_jit.try_add(std::move(_module), m_spurt->get_cache_path() + "llvm/", llvm_error);
+				}
+				else
+				{
+					added = m_jit.try_add(std::move(_module), llvm_error);
+				}
+
+				if (!added || !m_jit.try_fin(llvm_error))
+				{
+					if (add_to_file)
+					{
+						add_loc->cached = 0;
+					}
+
+					return nullptr;
+				}
+			}
+			else
+			{
+				if (g_cfg.core.spu_debug)
+				{
+					// Testing only
+					m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
+				}
+				else
+				{
+					m_jit.add(std::move(_module));
+				}
+
+				m_jit.fin();
+			}
+#else
+			if (g_cfg.core.spu_debug)
+			{
+				// Testing only
+				m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
+			}
+			else
+			{
+				m_jit.add(std::move(_module));
+			}
+
+			m_jit.fin();
+#endif
+		}
 
 		// Register function pointer
 		const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index fc74bcec90..9432ccf1fc 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -77,6 +77,16 @@ struct spu_program
 	bool operator<(const spu_program& rhs) const noexcept;
 };
 
+#ifdef ARCH_ARM64
+struct spu_llvm_compile_context
+{
+	bool use_tbl2 = true; // Copied into the recompiler's m_use_tbl2 when a function is compiled
+	std::string llvm_error; // Receives the LLVM fatal error message when recoverable compilation fails
+};
+
+void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept;
+#endif
+
 class spu_item
 {
 public: