SPU LLVM: Retry ARM64 TBL2 register scavenger failures

- Some SPU programs fail to compile on ARM64 when TBL2/TBX2 instructions are emitted: LLVM's register scavenger aborts with "Cannot scavenge register without an emergency spill slot".
- As a workaround, first attempt compilation with TBL2/TBX2 enabled; if LLVM invokes its fatal error handler during codegen, recover and recompile the same program with the two-table lookups lowered to TBL1/TBX1 pairs instead.
This commit is contained in:
Malcolm 2026-05-02 16:10:24 -04:00
parent 44d4d6b056
commit 9bd65faa85
8 changed files with 394 additions and 44 deletions

View File

@ -543,9 +543,15 @@ public:
// Add module (path to obj cache dir)
void add(std::unique_ptr<llvm::Module> _module, const std::string& path);
// Returns false after LLVM fatal recovery. The compiler must be discarded.
bool try_add(std::unique_ptr<llvm::Module> _module, const std::string& path, std::string& error);
// Add module (not cached)
void add(std::unique_ptr<llvm::Module> _module);
// Returns false after LLVM fatal recovery. The compiler must be discarded.
bool try_add(std::unique_ptr<llvm::Module> _module, std::string& error);
// Add object (path to obj file)
bool add(const std::string& path);
@ -558,6 +564,9 @@ public:
// Finalize
void fin();
// Returns false after LLVM fatal recovery. The compiler must be discarded.
bool try_fin(std::string& error);
// Get compiled function address
u64 get(const std::string& name);

View File

@ -12,6 +12,10 @@
#include <charconv>
#if defined(__APPLE__)
#include <pthread.h>
#endif
LOG_CHANNEL(jit_log, "JIT");
#ifdef LLVM_AVAILABLE
@ -50,6 +54,44 @@ LOG_CHANNEL(jit_log, "JIT");
#include "Emu/CPU/Backends/AArch64/AArch64Common.h"
#endif
namespace
{
	// When non-null, points at the error string that the installed LLVM fatal
	// error handler should fill in before silently exiting the worker thread,
	// instead of throwing/aborting the whole process.
	thread_local std::string* g_llvm_fatal_message = nullptr;

	// Runs 'func' on a disposable helper thread so that an LLVM fatal error
	// only terminates that thread. Returns true if 'func' completed normally;
	// returns false with 'error' populated after LLVM fatal recovery.
	template <typename F>
	bool run_recoverable_llvm(F&& func, std::string& error)
	{
		error.clear();

		// Run LLVM codegen in a disposable thread. If LLVM invokes the fatal
		// handler, only this helper thread exits.
		named_thread worker("LLVM JIT", [&]()
		{
#if defined(__APPLE__)
			// Codegen writes into MAP_JIT memory; drop write protection first
			pthread_jit_write_protect_np(false);
#endif
			// Arm the fatal handler for this thread only (see the
			// install_fatal_error_handler lambda in jit_compiler's constructor)
			g_llvm_fatal_message = &error;
			std::forward<F>(func)();
			// Not reached if the fatal handler fired; the thread-local dies
			// with the thread in that case, so no cleanup is missed
			g_llvm_fatal_message = nullptr;
#if defined(__APPLE__)
			pthread_jit_write_protect_np(true);
#endif
		});

		// NOTE(review): presumably blocks until the worker terminates — confirm
		// against named_thread::operator() semantics
		worker();

		// thread_state::finished means func() ran to completion; any other
		// state (errored via silent_exit) indicates fatal recovery
		const bool result = static_cast<thread_state>(worker) == thread_state::finished;

		if (!result && error.empty())
		{
			error = "LLVM crash recovery invoked";
		}

		return result;
	}
}
const bool jit_initialize = []() -> bool
{
llvm::InitializeNativeTarget();
@ -649,6 +691,13 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
llvm::install_fatal_error_handler([](void*, const char* msg, bool)
{
const std::string_view out = msg ? msg : "";
if (g_llvm_fatal_message)
{
*g_llvm_fatal_message = out;
thread_ctrl::silent_exit();
}
fmt::throw_exception("LLVM Emergency Exit Invoked: '%s'", out);
}, nullptr);
@ -788,6 +837,33 @@ void jit_compiler::add(std::unique_ptr<llvm::Module> _module, const std::string&
}
}
// Recoverable variant of add(module, path): compiles the module with an object
// cache writing to 'path'. Returns false after LLVM fatal recovery, in which
// case 'error' holds the fatal message and this compiler must be discarded.
bool jit_compiler::try_add(std::unique_ptr<llvm::Module> _module, const std::string& path, std::string& error)
{
	ObjectCache cache{path, this};
	m_engine->setObjectCache(&cache);

	const auto ptr = _module.get();
	m_engine->addModule(std::move(_module));

	const bool compiled = run_recoverable_llvm([&]()
	{
		m_engine->generateCodeForModule(ptr);
	}, error);

	// Always detach the cache before returning: it is a stack local, and
	// leaving it installed on the failure path would leave the engine with a
	// dangling ObjectCache pointer.
	m_engine->setObjectCache(nullptr);

	if (!compiled)
	{
		return false;
	}

	for (auto& func : ptr->functions())
	{
		// Delete IR to lower memory consumption
		func.deleteBody();
	}

	return true;
}
void jit_compiler::add(std::unique_ptr<llvm::Module> _module)
{
const auto ptr = _module.get();
@ -801,6 +877,28 @@ void jit_compiler::add(std::unique_ptr<llvm::Module> _module)
}
}
// Recoverable variant of add(module): compiles the module without caching.
// Returns false after LLVM fatal recovery; the compiler must then be discarded.
bool jit_compiler::try_add(std::unique_ptr<llvm::Module> _module, std::string& error)
{
	llvm::Module* const raw_module = _module.get();
	m_engine->addModule(std::move(_module));

	const bool compiled = run_recoverable_llvm([&]()
	{
		m_engine->generateCodeForModule(raw_module);
	}, error);

	if (compiled)
	{
		// Native code exists now; drop the IR bodies to lower memory consumption
		for (llvm::Function& fn : raw_module->functions())
		{
			fn.deleteBody();
		}
	}

	return compiled;
}
bool jit_compiler::add(const std::string& path)
{
auto cache = ObjectCache::load(path);
@ -852,6 +950,14 @@ void jit_compiler::fin()
m_engine->finalizeObject();
}
// Recoverable variant of fin(): finalizes the generated objects. Returns false
// after LLVM fatal recovery; the compiler must then be discarded.
bool jit_compiler::try_fin(std::string& error)
{
	const auto finalize_task = [&]()
	{
		m_engine->finalizeObject();
	};

	return run_recoverable_llvm(finalize_task, error);
}
u64 jit_compiler::get(const std::string& name)
{
return m_engine->getGlobalValueAddress(name);

View File

@ -2988,6 +2988,32 @@ void thread_ctrl::set_name(std::string name)
report_fatal_error(reason);
}
// Terminates the calling named thread with thread_state::errored WITHOUT
// reporting a fatal error (used by LLVM crash recovery, where the fatal
// handler must kill only the helper thread).
void thread_ctrl::silent_exit() noexcept
{
	if (const auto _this = g_tls_this_thread)
	{
		// NOTE(review): presumably mirrors emergency_exit's cleanup sequence
		// minus the error report — confirm against that function
		g_tls_error_callback();

		u64 _self = _this->finalize(thread_state::errored);

		if (_self == umax)
		{
			// Unused, detached thread support remnant
			delete _this;
		}

		thread_base::finalize(umax);
	}

	// Exit the OS thread directly; skipping C++ unwinding is intentional here
#ifdef _WIN32
	_endthreadex(0);
#else
	pthread_exit(nullptr);
#endif

	// Unreachable: the thread-exit calls above do not return
	std::abort();
}
void thread_ctrl::detect_cpu_layout()
{
if (!g_native_core_layout.compare_and_swap_test(native_core_arrangement::undefined, native_core_arrangement::generic))

View File

@ -315,6 +315,9 @@ public:
// Exit.
[[noreturn]] static void emergency_exit(std::string_view reason);
// Exit the current named thread as errored without reporting a fatal error.
[[noreturn]] static void silent_exit() noexcept;
// Get current thread (may be nullptr)
static thread_base* get_current()
{

View File

@ -3120,6 +3120,9 @@ protected:
// ARMv8 SDOT/UDOT
bool m_use_dotprod = false;
// Allow direct TBL2/TBX2 emission.
bool m_use_tbl2 = true;
#else
// Allow FMA
bool m_use_fma = false;
@ -4074,39 +4077,49 @@ public:
const auto data1 = b.eval(m_ir);
const auto index = indices.eval(m_ir);
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
if (m_use_tbl2)
{
v128 mask{};
v128 bitmask{};
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
if (cv)
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
{
for (u32 i = 0; i < 16; i++)
v128 mask{};
v128 bitmask{};
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
if (cv)
{
const u64 b_val = cv->getElementAsInteger(i);
mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
for (u32 i = 0; i < 16; i++)
{
const u64 b_val = cv->getElementAsInteger(i);
mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
}
}
else if (llvm::isa<llvm::ConstantAggregateZero>(c))
{
bitmask = v128::from8p(0xFF);
}
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
{
auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
result.value = m_ir->CreateAnd(lookup, z_mask);
return result;
}
}
else if (llvm::isa<llvm::ConstantAggregateZero>(c))
{
bitmask = v128::from8p(0xFF);
}
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
{
auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
result.value = m_ir->CreateAnd(lookup, z_mask);
return result;
}
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
return result;
}
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
const auto data0_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type<u8[16]>(), 16));
const auto data1_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data1, data1_index });
result.value = m_ir->CreateOr(data0_lookup, data1_lookup);
return result;
}
@ -4131,7 +4144,16 @@ public:
const auto data1 = b.eval(m_ir);
const auto index = indices.eval(m_ir);
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
if (m_use_tbl2)
{
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
return result;
}
const auto first_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type<u8[16]>(), 16));
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { first_lookup, data1, data1_index });
return result;
}
#endif

View File

@ -82,6 +82,94 @@ void fmt_class_string<spu_recompiler_base::compare_direction>::format(std::strin
});
}
#ifdef ARCH_ARM64
// Exact error text emitted by LLVM's AArch64 register scavenger when it
// cannot find an emergency spill slot (the TBL2/TBX2 failure mode).
constexpr const char s_spu_llvm_reg_scavenge_error[] = "Cannot scavenge register without an emergency spill slot";

// RAII guard: installs 'context' as the calling thread's SPU LLVM compile
// context for one compilation attempt and uninstalls it on scope exit.
class spu_llvm_compile_scope
{
public:
	spu_llvm_compile_scope(spu_llvm_compile_context& context, bool use_tbl2) noexcept
	{
		// Reset any stale state left over from a previous attempt
		context = {};
		context.use_tbl2 = use_tbl2;
		spu_llvm_set_compile_context(&context);
	}

	// Non-copyable/non-movable: a copy's destructor would clear the
	// thread-local context while the original scope is still active
	spu_llvm_compile_scope(const spu_llvm_compile_scope&) = delete;
	spu_llvm_compile_scope& operator=(const spu_llvm_compile_scope&) = delete;

	~spu_llvm_compile_scope() noexcept
	{
		spu_llvm_set_compile_context(nullptr);
	}
};
// Re-runs the SPU analyser on 'program' by reconstructing a fake local
// storage image containing only that program's instructions.
static spu_program analyse_spu_llvm_program(spu_recompiler_base& compiler, const spu_program& program)
{
	std::vector<be_t<u32>> fake_ls(SPU_LS_SIZE / sizeof(be_t<u32>));

	u32 addr = program.lower_bound;

	for (const auto inst : program.data)
	{
		fake_ls[addr / 4] = std::bit_cast<be_t<u32>>(inst);
		addr += 4;
	}

	return compiler.analyse(fake_ls.data(), program.entry_point);
}
// Compiles 'program', first with TBL2/TBX2 enabled; if that attempt dies on
// the AArch64 register scavenger fatal error, rebuilds the compiler and
// retries with TBL2/TBX2 disabled. Returns nullptr on unrecoverable failure.
// On the retry path, 'compiler' is replaced with a fresh LLVM recompiler.
static spu_function_t compile_spu_llvm_with_retry(std::unique_ptr<spu_recompiler_base>& compiler, const spu_program& program)
{
	spu_llvm_compile_context context;

	{
		// First attempt: TBL2/TBX2 allowed. Compile a copy so 'program' stays
		// intact for the potential retry below.
		spu_llvm_compile_scope scope(context, true);

		if (const auto result = compiler->compile(spu_program{program}))
		{
			return result;
		}
	}

	// Only the specific register scavenger failure is retried; anything else
	// (e.g. out of JIT memory) is reported and propagated as failure.
	if (context.llvm_error.find(s_spu_llvm_reg_scavenge_error) == std::string::npos)
	{
		if (!context.llvm_error.empty())
		{
			spu_log.error("LLVM failed to compile SPU block 0x%x: %s", program.entry_point, context.llvm_error);
		}

		return nullptr;
	}

	spu_log.warning("LLVM failed to compile SPU block 0x%x with TBL2/TBX2: %s. Retrying without TBL2/TBX2.", program.entry_point, context.llvm_error);

	// LLVM fatal recovery does not unwind MCJIT state. Abandon the failed
	// compiler and retry from a fresh analysis/JIT instance.
	// (Deliberate leak: destroying the half-finalized compiler is unsafe.)
	static_cast<void>(compiler.release());
	compiler = spu_recompiler_base::make_llvm_recompiler();
	compiler->init();

	// Re-analyse from scratch; the result must match the original program or
	// the retry would compile different code than was requested
	const auto retry_program = analyse_spu_llvm_program(*compiler, program);

	if (retry_program != program)
	{
		spu_log.error("[0x%05x] SPU analyser failed during TBL2/TBX2 retry, %u vs %u", retry_program.entry_point, retry_program.data.size(), program.data.size());
		return nullptr;
	}

	// Second attempt: TBL2/TBX2 disabled (lowered to TBL1/TBX1 sequences)
	spu_llvm_compile_context retry_context;
	spu_llvm_compile_scope scope(retry_context, false);

	const auto result = compiler->compile(spu_program{retry_program});

	if (result)
	{
		spu_log.notice("SPU LLVM block 0x%x compiled successfully without TBL2/TBX2.", program.entry_point);
	}
	else if (!retry_context.llvm_error.empty())
	{
		spu_log.error("LLVM failed to compile SPU block 0x%x without TBL2/TBX2: %s", program.entry_point, retry_context.llvm_error);
	}

	return result;
}
#endif
// Move 4 args for calling native function from a GHC calling convention function
#if defined(ARCH_X64)
static u8* move_args_ghc_to_native(u8* raw)
@ -906,6 +994,15 @@ void spu_cache::initialize(bool build_existing_cache)
compiler->init();
auto compile_program = [&](spu_program&& program) -> spu_function_t
{
#ifdef ARCH_ARM64
return compile_spu_llvm_with_retry(compiler, program);
#else
return compiler->compile(std::move(program));
#endif
};
// Counter for error reporting
u32 logged_error = 0;
@ -977,7 +1074,7 @@ void spu_cache::initialize(bool build_existing_cache)
logged_error++;
}
}
else if (!compiler->compile(std::move(func2)))
else if (!compile_program(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
@ -1075,7 +1172,7 @@ void spu_cache::initialize(bool build_existing_cache)
const u32 last_inst = std::bit_cast<be_t<u32>>(func2.data.back());
const u32 prog_size = ::size32(func2.data);
if (!compiler->compile(std::move(func2)))
if (!compile_program(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
@ -2096,7 +2193,12 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
return;
}
const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr<u32>(0), spu.pc));
auto program = spu.jit->analyse(spu._ptr<u32>(0), spu.pc);
#ifdef ARCH_ARM64
const auto func = compile_spu_llvm_with_retry(spu.jit, program);
#else
const auto func = spu.jit->compile(std::move(program));
#endif
if (!func)
{
@ -8903,8 +9005,20 @@ struct spu_llvm_worker
{
spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
}
else if (const auto target = compiler->compile(std::move(func2)))
else
{
#ifdef ARCH_ARM64
const auto target = compile_spu_llvm_with_retry(compiler, func2);
#else
const auto target = compiler->compile(std::move(func2));
#endif
if (!target)
{
spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
break;
}
// Redirect old function (TODO: patch in multiple places)
const s64 rel = reinterpret_cast<u64>(target) - prog->first - 5;
@ -8922,11 +9036,6 @@ struct spu_llvm_worker
atomic_storage<u64>::release(*reinterpret_cast<u64*>(prog->first), result);
}
else
{
spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
break;
}
// Clear fake LS
std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));

View File

@ -62,6 +62,16 @@ const extern spu_decoder<spu_iflag> g_spu_iflag;
#ifdef ARCH_ARM64
#include "Emu/CPU/Backends/AArch64/AArch64JIT.h"
namespace
{
	// Per-thread pointer to the active SPU LLVM compile context; null when no
	// recoverable compilation attempt is in progress on this thread
	thread_local spu_llvm_compile_context* g_spu_llvm_compile_context = nullptr;
}

// Installs 'context' as the calling thread's compile context (nullptr clears
// it). Managed by spu_llvm_compile_scope; read by the SPU LLVM recompiler.
void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept
{
	g_spu_llvm_compile_context = context;
}
#endif
class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
@ -1669,6 +1679,15 @@ public:
m_hash_start = hash_start;
}
#ifdef ARCH_ARM64
m_use_tbl2 = !g_spu_llvm_compile_context || g_spu_llvm_compile_context->use_tbl2;
if (g_spu_llvm_compile_context)
{
g_spu_llvm_compile_context->llvm_error.clear();
}
#endif
spu_log.notice("Building function 0x%x... (size %u, %s)", func.entry_point, func.data.size(), m_hash);
m_pos = func.lower_bound;
@ -3478,17 +3497,63 @@ public:
} _jit_guard;
#endif
if (g_cfg.core.spu_debug)
{
// Testing only
m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
}
else
{
m_jit.add(std::move(_module));
}
#ifdef ARCH_ARM64
const bool recoverable = !!g_spu_llvm_compile_context;
m_jit.fin();
if (recoverable)
{
bool added = false;
std::string& llvm_error = g_spu_llvm_compile_context->llvm_error;
if (g_cfg.core.spu_debug)
{
// Testing only
added = m_jit.try_add(std::move(_module), m_spurt->get_cache_path() + "llvm/", llvm_error);
}
else
{
added = m_jit.try_add(std::move(_module), llvm_error);
}
if (!added || !m_jit.try_fin(llvm_error))
{
if (add_to_file)
{
add_loc->cached = 0;
}
return nullptr;
}
}
else
{
if (g_cfg.core.spu_debug)
{
// Testing only
m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
}
else
{
m_jit.add(std::move(_module));
}
m_jit.fin();
}
#else
if (g_cfg.core.spu_debug)
{
// Testing only
m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
}
else
{
m_jit.add(std::move(_module));
}
m_jit.fin();
#endif
}
// Register function pointer
const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));

View File

@ -77,6 +77,16 @@ struct spu_program
bool operator<(const spu_program& rhs) const noexcept;
};
#ifdef ARCH_ARM64
// Per-attempt state shared between the SPU cache/dispatcher and the LLVM
// recompiler through a thread-local pointer (see spu_llvm_set_compile_context).
struct spu_llvm_compile_context
{
	// Whether the recompiler may emit ARM64 TBL2/TBX2 instructions directly
	bool use_tbl2 = true;
	// Receives the LLVM fatal error message when a compile attempt fails
	std::string llvm_error;
};

// Installs 'context' as the calling thread's compile context (nullptr clears it)
void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept;
#endif
class spu_item
{
public: