From 44d4d6b05644bf47a096553801ba4cec4f4942c7 Mon Sep 17 00:00:00 2001 From: Malcolm Date: Sat, 2 May 2026 16:09:26 -0400 Subject: [PATCH 1/2] PPU/SPU LLVM: Use arm shuffles in recompilers instead of emulating x86 pshufb > - SHUFB from 9 instructions down to 5 > - Though it should be 4 if LLVM would just emit BCAX... --- rpcs3/Emu/CPU/CPUTranslator.h | 106 +++++++++++++++++++++++++++ rpcs3/Emu/Cell/PPUTranslator.cpp | 11 +++ rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 68 +++++++++++++++++ 3 files changed, 185 insertions(+) diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index ab2aed8156..d047be63bb 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -4030,6 +4030,112 @@ public: }); } +#ifdef ARCH_ARM64 + template + value_t tbl(T1 a, T2 b) + { + value_t result; + const auto data0 = a.eval(m_ir); + const auto index = b.eval(m_ir); + const auto zeros = llvm::ConstantAggregateZero::get(get_type()); + + if (auto c = llvm::dyn_cast(index)) + { + v128 mask{}; + const auto cv = llvm::dyn_cast(c); + + if (cv) + { + for (u32 i = 0; i < 16; i++) + { + const u64 b_val = cv->getElementAsInteger(i); + mask._u8[i] = (b_val < 16) ? 
static_cast(b_val) : static_cast(16); + } + } + + if (cv || llvm::isa(c)) + { + result.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast(&mask), 16)); + result.value = m_ir->CreateZExt(result.value, get_type()); + result.value = m_ir->CreateShuffleVector(data0, zeros, result.value); + return result; + } + } + + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index }); + return result; + } + + template + value_t tbl2(T1 a, T2 b, T3 indices) + { + value_t result; + const auto data0 = a.eval(m_ir); + const auto data1 = b.eval(m_ir); + const auto index = indices.eval(m_ir); + + if (auto c = llvm::dyn_cast(index)) + { + v128 mask{}; + v128 bitmask{}; + const auto cv = llvm::dyn_cast(c); + + if (cv) + { + for (u32 i = 0; i < 16; i++) + { + const u64 b_val = cv->getElementAsInteger(i); + mask._u8[i] = (b_val < 32) ? static_cast(b_val) : static_cast(0); + bitmask._u8[i] = (b_val < 32) ? static_cast(0xFF) : static_cast(0x00); + } + } + else if (llvm::isa(c)) + { + bitmask = v128::from8p(0xFF); + } + + if (cv || llvm::isa(c)) + { + auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast(&mask), 16)); + auto m_ext = m_ir->CreateZExt(m_val, get_type()); + auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext); + + auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast(&bitmask), 16)); + result.value = m_ir->CreateAnd(lookup, z_mask); + return result; + } + } + + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index }); + return result; + } + + template + value_t tbx(T1 fallback, T2 a, T3 indices) + { + value_t result; + const auto v_fallback = fallback.eval(m_ir); + const auto data0 = a.eval(m_ir); + const auto index = indices.eval(m_ir); + + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index }); + return result; + } + + 
template + value_t tbx2(T1 fallback, T2 a, T3 b, T4 indices) + { + value_t result; + const auto v_fallback = fallback.eval(m_ir); + const auto data0 = a.eval(m_ir); + const auto data1 = b.eval(m_ir); + const auto index = indices.eval(m_ir); + + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index }); + return result; + } +#endif + // (m << 3) >= 0 ? a : b template static auto select_by_bit4(T&& m, U&& a, V&& b) diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index 0205715328..9125aa5eeb 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -1642,6 +1642,16 @@ void PPUTranslator::VPERM(ppu_opcode_t op) { const auto [a, b, c] = get_vrs(op.va, op.vb, op.vc); +#ifdef ARCH_ARM64 + if (op.ra == op.rb) + { + set_vr(op.vd, tbl(a, (~c & 0xf))); + return; + } + + set_vr(op.vd, tbl2(b, a, (~c & 0x1f))); + return; +#else if (op.ra == op.rb) { set_vr(op.vd, pshufb(a, ~c & 0xf)); @@ -1657,6 +1667,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op) const auto i = eval(~c & 0x1f); set_vr(op.vd, select(noncast(c << 3) >= 0, pshufb(a, i), pshufb(b, i))); +#endif } void PPUTranslator::VPKPX(ppu_opcode_t op) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 927d7ac187..a99762593e 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -6757,6 +6757,73 @@ public: const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); +#ifdef ARCH_ARM64 + if (auto [ok, as] = match_expr(a, byteswap(match())); ok) + { + if (auto [ok, bs] = match_expr(b, byteswap(match())); ok) + { + if (op.ra == op.rb) + { + if (perm_only) + { + const auto cm = eval(c & 0x0f); + set_vr(op.rt4, tbl(as, cm)); + return; + } + + const auto x = tbl(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); + const auto cm = eval(c & 0x8f); + set_vr(op.rt4, 
tbx(x, as, cm)); + return; + } + + if (perm_only) + { + const auto cm = eval(c & 0x1f); + set_vr(op.rt4, tbl2(as, bs, cm)); + return; + } + + const auto x = tbl(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); + const auto cm = eval(c & 0x9f); + set_vr(op.rt4, tbx2(x, as, bs, cm)); + return; + } + } + + if (op.ra == op.rb && !m_interp_magn) + { + if (perm_only) + { + const auto cm = eval(c & 0x0f); + const auto cr = eval(cm ^ 0x0f); + set_vr(op.rt4, tbl(a, cr)); + return; + } + + const auto x = tbl(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); + const auto cm = eval(c & 0x8f); + const auto cr = eval(cm ^ 0x0f); + set_vr(op.rt4, tbx(x, a, cr)); + return; + } + + if (perm_only) + { + const auto cm = eval(c & 0x9f); + const auto cr = eval(cm ^ 0x0f); + set_vr(op.rt4, tbl2(a, b, cr)); + return; + } + + const auto x = tbl(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); + // AND should be before XOR so that llvm can combine them into BCAX + // Though for some reason it doesn't seem to be doing that. + const auto cm = eval(c & ~0x60); + const auto cr = eval(cm ^ 0x0f); + set_vr(op.rt4, tbx2(x, a, b, cr)); + return; +#else // Data with swapped endian from a load instruction if (auto [ok, as] = match_expr(a, byteswap(match())); ok) { @@ -6900,6 +6967,7 @@ public: set_vr(op.rt4, select_by_bit4(cr, ax, bx)); else set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x); +#endif } void MPYA(spu_opcode_t op) From 9bd65faa85e7d11706f4c4091709785427d3c53b Mon Sep 17 00:00:00 2001 From: Malcolm Date: Sat, 2 May 2026 16:10:24 -0400 Subject: [PATCH 2/2] SPU LLVM: Retry ARM64 TBL2 register scavenger failures - Some SPU programs inexplicably fail to compile when TBL2/TBX2 are used. 
- As an insane workaround, first try to compile with TBL2/TBX2; if LLVM aborts with the register-scavenger fatal error while compiling, recompile the same program without TBL2/TBX2. --- Utilities/JIT.h | 9 ++ Utilities/JITLLVM.cpp | 106 +++++++++++++++++++++ Utilities/Thread.cpp | 26 +++++ Utilities/Thread.h | 3 + rpcs3/Emu/CPU/CPUTranslator.h | 74 +++++++++----- rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 127 +++++++++++++++++++++++-- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 83 ++++++++++++++-- rpcs3/Emu/Cell/SPURecompiler.h | 10 ++ 8 files changed, 394 insertions(+), 44 deletions(-) diff --git a/Utilities/JIT.h b/Utilities/JIT.h index 86fc72ed55..d18f795563 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -543,9 +543,15 @@ public: // Add module (path to obj cache dir) void add(std::unique_ptr _module, const std::string& path); + // Returns false after LLVM fatal recovery. The compiler must be discarded. + bool try_add(std::unique_ptr _module, const std::string& path, std::string& error); + // Add module (not cached) void add(std::unique_ptr _module); + // Returns false after LLVM fatal recovery. The compiler must be discarded. + bool try_add(std::unique_ptr _module, std::string& error); + // Add object (path to obj file) bool add(const std::string& path); @@ -558,6 +564,9 @@ public: // Finalize void fin(); + // Returns false after LLVM fatal recovery. The compiler must be discarded. 
+ bool try_fin(std::string& error); + // Get compiled function address u64 get(const std::string& name); diff --git a/Utilities/JITLLVM.cpp b/Utilities/JITLLVM.cpp index 34e1572185..31fb769c5a 100644 --- a/Utilities/JITLLVM.cpp +++ b/Utilities/JITLLVM.cpp @@ -12,6 +12,10 @@ #include +#if defined(__APPLE__) +#include +#endif + LOG_CHANNEL(jit_log, "JIT"); #ifdef LLVM_AVAILABLE @@ -50,6 +54,44 @@ LOG_CHANNEL(jit_log, "JIT"); #include "Emu/CPU/Backends/AArch64/AArch64Common.h" #endif +namespace +{ + thread_local std::string* g_llvm_fatal_message = nullptr; + + template + bool run_recoverable_llvm(F&& func, std::string& error) + { + error.clear(); + + // Run LLVM codegen in a disposable thread. If LLVM invokes the fatal + // handler, only this helper thread exits. + named_thread worker("LLVM JIT", [&]() + { +#if defined(__APPLE__) + pthread_jit_write_protect_np(false); +#endif + g_llvm_fatal_message = &error; + + std::forward(func)(); + + g_llvm_fatal_message = nullptr; +#if defined(__APPLE__) + pthread_jit_write_protect_np(true); +#endif + }); + + worker(); + const bool result = static_cast(worker) == thread_state::finished; + + if (!result && error.empty()) + { + error = "LLVM crash recovery invoked"; + } + + return result; + } +} + const bool jit_initialize = []() -> bool { llvm::InitializeNativeTarget(); @@ -649,6 +691,13 @@ jit_compiler::jit_compiler(const std::unordered_map& _link, co llvm::install_fatal_error_handler([](void*, const char* msg, bool) { const std::string_view out = msg ? 
msg : ""; + + if (g_llvm_fatal_message) + { + *g_llvm_fatal_message = out; + thread_ctrl::silent_exit(); + } + fmt::throw_exception("LLVM Emergency Exit Invoked: '%s'", out); }, nullptr); @@ -788,6 +837,33 @@ void jit_compiler::add(std::unique_ptr _module, const std::string& } } +bool jit_compiler::try_add(std::unique_ptr _module, const std::string& path, std::string& error) +{ + ObjectCache cache{path, this}; + m_engine->setObjectCache(&cache); + + const auto ptr = _module.get(); + m_engine->addModule(std::move(_module)); + + if (!run_recoverable_llvm([&]() + { + m_engine->generateCodeForModule(ptr); + }, error)) + { + return false; + } + + m_engine->setObjectCache(nullptr); + + for (auto& func : ptr->functions()) + { + // Delete IR to lower memory consumption + func.deleteBody(); + } + + return true; +} + void jit_compiler::add(std::unique_ptr _module) { const auto ptr = _module.get(); @@ -801,6 +877,28 @@ void jit_compiler::add(std::unique_ptr _module) } } +bool jit_compiler::try_add(std::unique_ptr _module, std::string& error) +{ + const auto ptr = _module.get(); + m_engine->addModule(std::move(_module)); + + if (!run_recoverable_llvm([&]() + { + m_engine->generateCodeForModule(ptr); + }, error)) + { + return false; + } + + for (auto& func : ptr->functions()) + { + // Delete IR to lower memory consumption + func.deleteBody(); + } + + return true; +} + bool jit_compiler::add(const std::string& path) { auto cache = ObjectCache::load(path); @@ -852,6 +950,14 @@ void jit_compiler::fin() m_engine->finalizeObject(); } +bool jit_compiler::try_fin(std::string& error) +{ + return run_recoverable_llvm([&]() + { + m_engine->finalizeObject(); + }, error); +} + u64 jit_compiler::get(const std::string& name) { return m_engine->getGlobalValueAddress(name); diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index 57d7446daf..fee498cc0e 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -2988,6 +2988,32 @@ void thread_ctrl::set_name(std::string name) 
report_fatal_error(reason); } +void thread_ctrl::silent_exit() +{ + if (const auto _this = g_tls_this_thread) + { + g_tls_error_callback(); + + u64 _self = _this->finalize(thread_state::errored); + + if (_self == umax) + { + // Unused, detached thread support remnant + delete _this; + } + + thread_base::finalize(umax); + } + +#ifdef _WIN32 + _endthreadex(0); +#else + pthread_exit(nullptr); +#endif + // Not expected to be reached (the thread-exit calls above do not return); presumably kept to satisfy [[noreturn]] + std::abort(); +} + void thread_ctrl::detect_cpu_layout() { if (!g_native_core_layout.compare_and_swap_test(native_core_arrangement::undefined, native_core_arrangement::generic)) diff --git a/Utilities/Thread.h b/Utilities/Thread.h index bafcea0b9f..b08f05fe82 100644 --- a/Utilities/Thread.h +++ b/Utilities/Thread.h @@ -315,6 +315,9 @@ public: // Exit. [[noreturn]] static void emergency_exit(std::string_view reason); + // Exit the current named thread as errored without reporting a fatal error. Deliberately not noexcept (same as emergency_exit): pthread_exit() may unwind the stack, and unwinding out of a noexcept function calls std::terminate. + [[noreturn]] static void silent_exit(); + // Get current thread (may be nullptr) static thread_base* get_current() { diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index d047be63bb..7a037b2a00 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -3120,6 +3120,9 @@ protected: // ARMv8 SDOT/UDOT bool m_use_dotprod = false; + + // Allow direct TBL2/TBX2 emission. + bool m_use_tbl2 = true; #else // Allow FMA bool m_use_fma = false; @@ -4074,39 +4077,49 @@ public: const auto data1 = b.eval(m_ir); const auto index = indices.eval(m_ir); - if (auto c = llvm::dyn_cast(index)) + if (m_use_tbl2) { - v128 mask{}; - v128 bitmask{}; - const auto cv = llvm::dyn_cast(c); - - if (cv) + if (auto c = llvm::dyn_cast(index)) { - for (u32 i = 0; i < 16; i++) + v128 mask{}; + v128 bitmask{}; + const auto cv = llvm::dyn_cast(c); + + if (cv) { - const u64 b_val = cv->getElementAsInteger(i); - mask._u8[i] = (b_val < 32) ? static_cast(b_val) : static_cast(0); - bitmask._u8[i] = (b_val < 32) ? 
static_cast(0xFF) : static_cast(0x00); + for (u32 i = 0; i < 16; i++) + { + const u64 b_val = cv->getElementAsInteger(i); + mask._u8[i] = (b_val < 32) ? static_cast(b_val) : static_cast(0); + bitmask._u8[i] = (b_val < 32) ? static_cast(0xFF) : static_cast(0x00); + } + } + else if (llvm::isa(c)) + { + bitmask = v128::from8p(0xFF); + } + + if (cv || llvm::isa(c)) + { + auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast(&mask), 16)); + auto m_ext = m_ir->CreateZExt(m_val, get_type()); + auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext); + + auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast(&bitmask), 16)); + result.value = m_ir->CreateAnd(lookup, z_mask); + return result; } } - else if (llvm::isa(c)) - { - bitmask = v128::from8p(0xFF); - } - if (cv || llvm::isa(c)) - { - auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast(&mask), 16)); - auto m_ext = m_ir->CreateZExt(m_val, get_type()); - auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext); - - auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast(&bitmask), 16)); - result.value = m_ir->CreateAnd(lookup, z_mask); - return result; - } + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index }); + return result; } - result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index }); + const auto data0_lookup = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index }); + const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type(), 16)); + const auto data1_lookup = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbl1), { data1, data1_index }); + + result.value = m_ir->CreateOr(data0_lookup, data1_lookup); return result; } @@ -4131,7 +4144,16 @@ public: const auto data1 = b.eval(m_ir); const auto index = 
indices.eval(m_ir); - result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index }); + if (m_use_tbl2) + { + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index }); + return result; + } + + const auto first_lookup = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index }); + const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type(), 16)); + + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_tbx1), { first_lookup, data1, data1_index }); return result; } #endif diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index dffca21cae..1f02d6291b 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -82,6 +82,94 @@ void fmt_class_string::format(std::strin }); } +#ifdef ARCH_ARM64 +constexpr const char s_spu_llvm_reg_scavenge_error[] = "Cannot scavenge register without an emergency spill slot"; + +class spu_llvm_compile_scope +{ +public: + spu_llvm_compile_scope(spu_llvm_compile_context& context, bool use_tbl2) noexcept + { + context = {}; + context.use_tbl2 = use_tbl2; + spu_llvm_set_compile_context(&context); + } + + ~spu_llvm_compile_scope() noexcept + { + spu_llvm_set_compile_context(nullptr); + } +}; + +static spu_program analyse_spu_llvm_program(spu_recompiler_base& compiler, const spu_program& program) +{ + std::vector> ls(SPU_LS_SIZE / sizeof(be_t)); + + for (u32 i = 0, pos = program.lower_bound; i < program.data.size(); i++, pos += 4) + { + ls[pos / 4] = std::bit_cast>(program.data[i]); + } + + return compiler.analyse(ls.data(), program.entry_point); +} + +static spu_function_t compile_spu_llvm_with_retry(std::unique_ptr& compiler, const spu_program& program) +{ + spu_llvm_compile_context context; + + { + spu_llvm_compile_scope scope(context, true); + + if (const auto 
result = compiler->compile(spu_program{program})) + { + return result; + } + } + + if (context.llvm_error.find(s_spu_llvm_reg_scavenge_error) == std::string::npos) + { + if (!context.llvm_error.empty()) + { + spu_log.error("LLVM failed to compile SPU block 0x%x: %s", program.entry_point, context.llvm_error); + } + + return nullptr; + } + + spu_log.warning("LLVM failed to compile SPU block 0x%x with TBL2/TBX2: %s. Retrying without TBL2/TBX2.", program.entry_point, context.llvm_error); + + // LLVM fatal recovery does not unwind MCJIT state. Abandon the failed + // compiler and retry from a fresh analysis/JIT instance. + static_cast(compiler.release()); + compiler = spu_recompiler_base::make_llvm_recompiler(); + compiler->init(); + + const auto retry_program = analyse_spu_llvm_program(*compiler, program); + + if (retry_program != program) + { + spu_log.error("[0x%05x] SPU analyser failed during TBL2/TBX2 retry, %u vs %u", retry_program.entry_point, retry_program.data.size(), program.data.size()); + return nullptr; + } + + spu_llvm_compile_context retry_context; + spu_llvm_compile_scope scope(retry_context, false); + + const auto result = compiler->compile(spu_program{retry_program}); + + if (result) + { + spu_log.notice("SPU LLVM block 0x%x compiled successfully without TBL2/TBX2.", program.entry_point); + } + else if (!retry_context.llvm_error.empty()) + { + spu_log.error("LLVM failed to compile SPU block 0x%x without TBL2/TBX2: %s", program.entry_point, retry_context.llvm_error); + } + + return result; +} +#endif + // Move 4 args for calling native function from a GHC calling convention function #if defined(ARCH_X64) static u8* move_args_ghc_to_native(u8* raw) @@ -906,6 +994,15 @@ void spu_cache::initialize(bool build_existing_cache) compiler->init(); + auto compile_program = [&](spu_program&& program) -> spu_function_t + { +#ifdef ARCH_ARM64 + return compile_spu_llvm_with_retry(compiler, program); +#else + return compiler->compile(std::move(program)); +#endif + }; 
+ // Counter for error reporting u32 logged_error = 0; @@ -977,7 +1074,7 @@ void spu_cache::initialize(bool build_existing_cache) logged_error++; } } - else if (!compiler->compile(std::move(func2))) + else if (!compile_program(std::move(func2))) { // Likely, out of JIT memory. Signal to prevent further building. fail_flag |= 1; @@ -1075,7 +1172,7 @@ void spu_cache::initialize(bool build_existing_cache) const u32 last_inst = std::bit_cast>(func2.data.back()); const u32 prog_size = ::size32(func2.data); - if (!compiler->compile(std::move(func2))) + if (!compile_program(std::move(func2))) { // Likely, out of JIT memory. Signal to prevent further building. fail_flag |= 1; @@ -2096,7 +2193,12 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip) return; } - const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr(0), spu.pc)); + auto program = spu.jit->analyse(spu._ptr(0), spu.pc); +#ifdef ARCH_ARM64 + const auto func = compile_spu_llvm_with_retry(spu.jit, program); +#else + const auto func = spu.jit->compile(std::move(program)); +#endif if (!func) { @@ -8903,8 +9005,20 @@ struct spu_llvm_worker { spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0); } - else if (const auto target = compiler->compile(std::move(func2))) + else { +#ifdef ARCH_ARM64 + const auto target = compile_spu_llvm_with_retry(compiler, func2); +#else + const auto target = compiler->compile(std::move(func2)); +#endif + + if (!target) + { + spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point); + break; + } + // Redirect old function (TODO: patch in multiple places) const s64 rel = reinterpret_cast(target) - prog->first - 5; @@ -8922,11 +9036,6 @@ struct spu_llvm_worker atomic_storage::release(*reinterpret_cast(prog->first), result); } - else - { - spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point); - break; - } // Clear fake LS std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1)); diff --git 
a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index a99762593e..806c53e6d0 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -62,6 +62,16 @@ const extern spu_decoder g_spu_iflag; #ifdef ARCH_ARM64 #include "Emu/CPU/Backends/AArch64/AArch64JIT.h" + +namespace +{ + thread_local spu_llvm_compile_context* g_spu_llvm_compile_context = nullptr; +} + +void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept +{ + g_spu_llvm_compile_context = context; +} #endif class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator @@ -1669,6 +1679,15 @@ public: m_hash_start = hash_start; } +#ifdef ARCH_ARM64 + m_use_tbl2 = !g_spu_llvm_compile_context || g_spu_llvm_compile_context->use_tbl2; + + if (g_spu_llvm_compile_context) + { + g_spu_llvm_compile_context->llvm_error.clear(); + } +#endif + spu_log.notice("Building function 0x%x... (size %u, %s)", func.entry_point, func.data.size(), m_hash); m_pos = func.lower_bound; @@ -3478,17 +3497,63 @@ public: } _jit_guard; #endif - if (g_cfg.core.spu_debug) { - // Testing only - m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/"); - } - else - { - m_jit.add(std::move(_module)); - } +#ifdef ARCH_ARM64 + const bool recoverable = !!g_spu_llvm_compile_context; - m_jit.fin(); + if (recoverable) + { + bool added = false; + std::string& llvm_error = g_spu_llvm_compile_context->llvm_error; + + if (g_cfg.core.spu_debug) + { + // Testing only + added = m_jit.try_add(std::move(_module), m_spurt->get_cache_path() + "llvm/", llvm_error); + } + else + { + added = m_jit.try_add(std::move(_module), llvm_error); + } + + if (!added || !m_jit.try_fin(llvm_error)) + { + if (add_to_file) + { + add_loc->cached = 0; + } + + return nullptr; + } + } + else + { + if (g_cfg.core.spu_debug) + { + // Testing only + m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/"); + } + else + { + m_jit.add(std::move(_module)); + } + + 
m_jit.fin(); + } +#else + if (g_cfg.core.spu_debug) + { + // Testing only + m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/"); + } + else + { + m_jit.add(std::move(_module)); + } + + m_jit.fin(); +#endif + } // Register function pointer const spu_function_t fn = reinterpret_cast(m_jit.get_engine().getPointerToFunction(main_func)); diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index fc74bcec90..9432ccf1fc 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -77,6 +77,16 @@ struct spu_program bool operator<(const spu_program& rhs) const noexcept; }; +#ifdef ARCH_ARM64 +struct spu_llvm_compile_context +{ + bool use_tbl2 = true; + std::string llvm_error; +}; + +void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept; +#endif + class spu_item { public: