This commit is contained in:
Whatcookie 2026-05-12 19:24:24 +02:00 committed by GitHub
commit 80970d092a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 553 additions and 18 deletions

View File

@ -543,9 +543,15 @@ public:
// Add module (path to obj cache dir)
void add(std::unique_ptr<llvm::Module> _module, const std::string& path);
// Returns false after LLVM fatal recovery. The compiler must be discarded.
bool try_add(std::unique_ptr<llvm::Module> _module, const std::string& path, std::string& error);
// Add module (not cached)
void add(std::unique_ptr<llvm::Module> _module);
// Returns false after LLVM fatal recovery. The compiler must be discarded.
bool try_add(std::unique_ptr<llvm::Module> _module, std::string& error);
// Add object (path to obj file)
bool add(const std::string& path);
@ -558,6 +564,9 @@ public:
// Finalize
void fin();
// Returns false after LLVM fatal recovery. The compiler must be discarded.
bool try_fin(std::string& error);
// Get compiled function address
u64 get(const std::string& name);

View File

@ -12,6 +12,10 @@
#include <charconv>
#if defined(__APPLE__)
#include <pthread.h>
#endif
LOG_CHANNEL(jit_log, "JIT");
#ifdef LLVM_AVAILABLE
@ -50,6 +54,44 @@ LOG_CHANNEL(jit_log, "JIT");
#include "Emu/CPU/Backends/AArch64/AArch64Common.h"
#endif
namespace
{
// Destination for a message captured by the LLVM fatal error handler.
// Non-null only while run_recoverable_llvm() executes on its helper thread.
thread_local std::string* g_llvm_fatal_message = nullptr;

// Runs 'func' on a disposable helper thread so that an LLVM fatal error
// (which terminates only the invoking thread via thread_ctrl::silent_exit)
// cannot take down the caller. Returns true if the helper thread finished
// normally; on failure 'error' holds the captured fatal message, or a
// generic note if the handler did not provide one.
template <typename F>
bool run_recoverable_llvm(F&& func, std::string& error)
{
error.clear();

// Run LLVM codegen in a disposable thread. If LLVM invokes the fatal
// handler, only this helper thread exits.
named_thread worker("LLVM JIT", [&]()
{
#if defined(__APPLE__)
// JIT write protection is per-thread state; allow writes on this new thread.
pthread_jit_write_protect_np(false);
#endif
g_llvm_fatal_message = &error;
std::forward<F>(func)();
g_llvm_fatal_message = nullptr;
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
});

// NOTE(review): presumably waits for the worker to complete — confirm
// named_thread::operator() semantics.
worker();

// Any state other than 'finished' means the fatal handler fired and the
// worker was terminated early (thread_state::errored via silent_exit).
const bool result = static_cast<thread_state>(worker) == thread_state::finished;

if (!result && error.empty())
{
error = "LLVM crash recovery invoked";
}

return result;
}
}
const bool jit_initialize = []() -> bool
{
llvm::InitializeNativeTarget();
@ -649,6 +691,13 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
llvm::install_fatal_error_handler([](void*, const char* msg, bool)
{
const std::string_view out = msg ? msg : "";
if (g_llvm_fatal_message)
{
*g_llvm_fatal_message = out;
thread_ctrl::silent_exit();
}
fmt::throw_exception("LLVM Emergency Exit Invoked: '%s'", out);
}, nullptr);
@ -788,6 +837,33 @@ void jit_compiler::add(std::unique_ptr<llvm::Module> _module, const std::string&
}
}
// Add module with object caching (path = obj cache dir), running codegen in a
// recoverable helper thread. Returns false after LLVM fatal recovery; 'error'
// then holds the fatal message and the compiler must be discarded.
bool jit_compiler::try_add(std::unique_ptr<llvm::Module> _module, const std::string& path, std::string& error)
{
	ObjectCache cache{path, this};
	m_engine->setObjectCache(&cache);

	// Keep a raw handle: ownership moves to the engine below.
	const auto ptr = _module.get();
	m_engine->addModule(std::move(_module));

	const bool compiled = run_recoverable_llvm([&]()
	{
		m_engine->generateCodeForModule(ptr);
	}, error);

	// Always detach the stack-allocated cache, including on the failure path:
	// leaving the engine with a dangling ObjectCache pointer would be unsafe
	// even though a failed compiler is expected to be discarded.
	m_engine->setObjectCache(nullptr);

	if (!compiled)
	{
		return false;
	}

	for (auto& func : ptr->functions())
	{
		// Delete IR to lower memory consumption
		func.deleteBody();
	}

	return true;
}
void jit_compiler::add(std::unique_ptr<llvm::Module> _module)
{
const auto ptr = _module.get();
@ -801,6 +877,28 @@ void jit_compiler::add(std::unique_ptr<llvm::Module> _module)
}
}
// Add module (not cached), running codegen in a recoverable helper thread.
// Returns false after LLVM fatal recovery; the compiler must then be discarded.
bool jit_compiler::try_add(std::unique_ptr<llvm::Module> _module, std::string& error)
{
	// Keep a raw handle before ownership is transferred to the engine.
	llvm::Module* const mod = _module.get();
	m_engine->addModule(std::move(_module));

	const bool compiled = run_recoverable_llvm([&]()
	{
		m_engine->generateCodeForModule(mod);
	}, error);

	if (compiled)
	{
		// Delete IR to lower memory consumption
		for (auto& func : mod->functions())
		{
			func.deleteBody();
		}
	}

	return compiled;
}
bool jit_compiler::add(const std::string& path)
{
auto cache = ObjectCache::load(path);
@ -852,6 +950,14 @@ void jit_compiler::fin()
m_engine->finalizeObject();
}
// Finalize the engine in a recoverable helper thread. Returns false after
// LLVM fatal recovery; 'error' then holds the fatal message and the compiler
// must be discarded.
bool jit_compiler::try_fin(std::string& error)
{
	const auto finalize = [&]()
	{
		m_engine->finalizeObject();
	};

	return run_recoverable_llvm(finalize, error);
}
u64 jit_compiler::get(const std::string& name)
{
return m_engine->getGlobalValueAddress(name);

View File

@ -2988,6 +2988,32 @@ void thread_ctrl::set_name(std::string name)
report_fatal_error(reason);
}
// Exit the current named thread as 'errored' without reporting a fatal error.
// Used by the LLVM fatal handler to abandon a disposable codegen thread.
void thread_ctrl::silent_exit() noexcept
{
if (const auto _this = g_tls_this_thread)
{
// Run the thread-local error callback before tearing the thread down.
g_tls_error_callback();

// Mark the thread as errored (not 'finished'), so callers waiting on the
// thread state can distinguish this from a normal exit.
u64 _self = _this->finalize(thread_state::errored);

if (_self == umax)
{
// Unused, detached thread support remnant
delete _this;
}

thread_base::finalize(umax);
}

// Terminate only this OS thread; never return to the (now unwound) caller.
#ifdef _WIN32
_endthreadex(0);
#else
pthread_exit(nullptr);
#endif

// Unreachable: both exits above do not return.
std::abort();
}
void thread_ctrl::detect_cpu_layout()
{
if (!g_native_core_layout.compare_and_swap_test(native_core_arrangement::undefined, native_core_arrangement::generic))

View File

@ -315,6 +315,9 @@ public:
// Exit.
[[noreturn]] static void emergency_exit(std::string_view reason);
// Exit the current named thread as errored without reporting a fatal error.
[[noreturn]] static void silent_exit() noexcept;
// Get current thread (may be nullptr)
static thread_base* get_current()
{

View File

@ -3120,6 +3120,9 @@ protected:
// ARMv8 SDOT/UDOT
bool m_use_dotprod = false;
// Allow direct TBL2/TBX2 emission.
bool m_use_tbl2 = true;
#else
// Allow FMA
bool m_use_fma = false;
@ -4030,6 +4033,131 @@ public:
});
}
#ifdef ARCH_ARM64
// Single-register AArch64 TBL: result[i] = a[b[i]] for b[i] < 16, else 0.
// When the index vector is a compile-time constant, emits a shufflevector
// instead of the intrinsic so LLVM can fold/combine it further.
template <typename T1, typename T2>
value_t<u8[16]> tbl(T1 a, T2 b)
{
value_t<u8[16]> result;
const auto data0 = a.eval(m_ir);
const auto index = b.eval(m_ir);
const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());

if (auto c = llvm::dyn_cast<llvm::Constant>(index))
{
v128 mask{};
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);

if (cv)
{
for (u32 i = 0; i < 16; i++)
{
const u64 b_val = cv->getElementAsInteger(i);
// Out-of-range lanes select index 16, i.e. the first lane of the
// all-zero second shuffle operand — matching TBL's zero result.
mask._u8[i] = (b_val < 16) ? static_cast<u8>(b_val) : static_cast<u8>(16);
}
}

// An all-zero constant index (ConstantAggregateZero) uses the zeroed mask
// as-is: every lane selects element 0 of data0, which matches TBL.
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
{
result.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
// Shuffle masks are i32 vectors; widen the byte mask before use.
result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
return result;
}
}

// Non-constant index: fall back to the NEON tbl1 intrinsic.
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
return result;
}
// Two-register AArch64 TBL: result[i] = {a,b}[idx[i]] for idx[i] < 32, else 0.
// Constant indices fold to a shufflevector + AND; otherwise uses the tbl2
// intrinsic when allowed (m_use_tbl2), or two chained tbl1 lookups.
template <typename T1, typename T2, typename T3>
value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
{
value_t<u8[16]> result;
const auto data0 = a.eval(m_ir);
const auto data1 = b.eval(m_ir);
const auto index = indices.eval(m_ir);

if (m_use_tbl2)
{
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
{
v128 mask{};
v128 bitmask{};
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);

if (cv)
{
for (u32 i = 0; i < 16; i++)
{
const u64 b_val = cv->getElementAsInteger(i);
// In-range lanes keep their index; out-of-range lanes read lane 0
// and are zeroed afterwards via the bitmask AND.
mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
}
}
else if (llvm::isa<llvm::ConstantAggregateZero>(c))
{
// All-zero index: every lane is in range, nothing gets masked out.
bitmask = v128::from8p(0xFF);
}

if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
{
auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
// Shuffle masks are i32 vectors; widen the byte mask before use.
auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
// Zero the out-of-range lanes to match TBL semantics.
auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
result.value = m_ir->CreateAnd(lookup, z_mask);
return result;
}
}

result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
return result;
}

// TBL2 disallowed: emulate with two tbl1 lookups ORed together. For a lane,
// at most one of the lookups is non-zero: tbl1(data0) zeroes indices >= 16,
// and (index - 16) wraps below 0 / stays >= 16 for lanes that should not hit
// data1, so tbl1(data1) zeroes those.
const auto data0_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type<u8[16]>(), 16));
const auto data1_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data1, data1_index });
result.value = m_ir->CreateOr(data0_lookup, data1_lookup);
return result;
}
// Single-register AArch64 TBX: in-range indices read from 'a', out-of-range
// lanes keep the corresponding byte of 'fallback'.
template <typename T1, typename T2, typename T3>
value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
{
	const auto fb = fallback.eval(m_ir);
	const auto data = a.eval(m_ir);
	const auto idx = indices.eval(m_ir);

	value_t<u8[16]> result;
	result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { fb, data, idx });
	return result;
}
// Two-register AArch64 TBX: in-range indices (< 32) read from {a,b}, others
// keep the fallback bytes. Uses the tbx2 intrinsic when allowed, otherwise
// chains two tbx1 lookups.
template <typename T1, typename T2, typename T3, typename T4>
value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
{
value_t<u8[16]> result;
const auto v_fallback = fallback.eval(m_ir);
const auto data0 = a.eval(m_ir);
const auto data1 = b.eval(m_ir);
const auto index = indices.eval(m_ir);

if (m_use_tbl2)
{
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
return result;
}

// TBX2 disallowed: chain two tbx1 ops. The first resolves indices [0,16)
// against data0 (other lanes keep the fallback). The second uses its result
// as the new fallback and resolves [16,32) against data1 via index - 16;
// all remaining lanes pass through unchanged.
const auto first_lookup = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
const auto data1_index = m_ir->CreateSub(index, llvm::ConstantInt::get(get_type<u8[16]>(), 16));
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { first_lookup, data1, data1_index });
return result;
}
#endif
// (m << 3) >= 0 ? a : b
template <typename T, typename U, typename V>
static auto select_by_bit4(T&& m, U&& a, V&& b)

View File

@ -1642,6 +1642,16 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
{
const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
#ifdef ARCH_ARM64
if (op.ra == op.rb)
{
set_vr(op.vd, tbl(a, (~c & 0xf)));
return;
}
set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
return;
#else
if (op.ra == op.rb)
{
set_vr(op.vd, pshufb(a, ~c & 0xf));
@ -1657,6 +1667,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
const auto i = eval(~c & 0x1f);
set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
#endif
}
void PPUTranslator::VPKPX(ppu_opcode_t op)

View File

@ -82,6 +82,94 @@ void fmt_class_string<spu_recompiler_base::compare_direction>::format(std::strin
});
}
#ifdef ARCH_ARM64
constexpr const char s_spu_llvm_reg_scavenge_error[] = "Cannot scavenge register without an emergency spill slot";
// RAII guard: resets and publishes a compile context for the current thread,
// and clears the registration on scope exit. Non-copyable/non-movable so the
// registered pointer can never outlive (or be double-cleared by) a copy.
class spu_llvm_compile_scope
{
public:
	spu_llvm_compile_scope(spu_llvm_compile_context& context, bool use_tbl2) noexcept
	{
		// Wipe any stale error/state from a previous attempt before publishing.
		context = {};
		context.use_tbl2 = use_tbl2;
		spu_llvm_set_compile_context(&context);
	}

	spu_llvm_compile_scope(const spu_llvm_compile_scope&) = delete;
	spu_llvm_compile_scope& operator=(const spu_llvm_compile_scope&) = delete;

	~spu_llvm_compile_scope() noexcept
	{
		spu_llvm_set_compile_context(nullptr);
	}
};
// Rebuild a big-endian LS image from the program's instruction words and
// re-run the analyser from the original entry point.
static spu_program analyse_spu_llvm_program(spu_recompiler_base& compiler, const spu_program& program)
{
	std::vector<be_t<u32>> ls(SPU_LS_SIZE / sizeof(be_t<u32>));

	u32 pos = program.lower_bound;

	for (const u32 inst : program.data)
	{
		ls[pos / 4] = std::bit_cast<be_t<u32>>(inst);
		pos += 4;
	}

	return compiler.analyse(ls.data(), program.entry_point);
}
// Compile an SPU program with TBL2/TBX2 enabled; if LLVM fatally fails with
// the register-scavenging error, rebuild the JIT from scratch and retry once
// with TBL2/TBX2 disabled. Returns nullptr on any unrecoverable failure.
// Note: 'compiler' may be replaced with a fresh instance as a side effect.
static spu_function_t compile_spu_llvm_with_retry(std::unique_ptr<spu_recompiler_base>& compiler, const spu_program& program)
{
spu_llvm_compile_context context;

{
// First attempt: TBL2/TBX2 allowed.
spu_llvm_compile_scope scope(context, true);

if (const auto result = compiler->compile(spu_program{program}))
{
return result;
}
}

// Only the specific scavenging failure is retried without TBL2/TBX2.
if (context.llvm_error.find(s_spu_llvm_reg_scavenge_error) == std::string::npos)
{
if (!context.llvm_error.empty())
{
spu_log.error("LLVM failed to compile SPU block 0x%x: %s", program.entry_point, context.llvm_error);
}

return nullptr;
}

spu_log.warning("LLVM failed to compile SPU block 0x%x with TBL2/TBX2: %s. Retrying without TBL2/TBX2.", program.entry_point, context.llvm_error);

// LLVM fatal recovery does not unwind MCJIT state. Abandon the failed
// compiler and retry from a fresh analysis/JIT instance.
// (Deliberate leak via release(): destroying the old compiler is unsafe.)
static_cast<void>(compiler.release());
compiler = spu_recompiler_base::make_llvm_recompiler();
compiler->init();

// Re-analyse from scratch; the result must match the original program or
// the retry would compile something different from what was requested.
const auto retry_program = analyse_spu_llvm_program(*compiler, program);

if (retry_program != program)
{
spu_log.error("[0x%05x] SPU analyser failed during TBL2/TBX2 retry, %u vs %u", retry_program.entry_point, retry_program.data.size(), program.data.size());
return nullptr;
}

// Second attempt: TBL2/TBX2 disabled.
spu_llvm_compile_context retry_context;
spu_llvm_compile_scope scope(retry_context, false);
const auto result = compiler->compile(spu_program{retry_program});

if (result)
{
spu_log.notice("SPU LLVM block 0x%x compiled successfully without TBL2/TBX2.", program.entry_point);
}
else if (!retry_context.llvm_error.empty())
{
spu_log.error("LLVM failed to compile SPU block 0x%x without TBL2/TBX2: %s", program.entry_point, retry_context.llvm_error);
}

return result;
}
#endif
// Move 4 args for calling native function from a GHC calling convention function
#if defined(ARCH_X64)
static u8* move_args_ghc_to_native(u8* raw)
@ -906,6 +994,15 @@ void spu_cache::initialize(bool build_existing_cache)
compiler->init();
auto compile_program = [&](spu_program&& program) -> spu_function_t
{
#ifdef ARCH_ARM64
return compile_spu_llvm_with_retry(compiler, program);
#else
return compiler->compile(std::move(program));
#endif
};
// Counter for error reporting
u32 logged_error = 0;
@ -977,7 +1074,7 @@ void spu_cache::initialize(bool build_existing_cache)
logged_error++;
}
}
else if (!compiler->compile(std::move(func2)))
else if (!compile_program(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
@ -1075,7 +1172,7 @@ void spu_cache::initialize(bool build_existing_cache)
const u32 last_inst = std::bit_cast<be_t<u32>>(func2.data.back());
const u32 prog_size = ::size32(func2.data);
if (!compiler->compile(std::move(func2)))
if (!compile_program(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
@ -2096,7 +2193,12 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
return;
}
const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr<u32>(0), spu.pc));
auto program = spu.jit->analyse(spu._ptr<u32>(0), spu.pc);
#ifdef ARCH_ARM64
const auto func = compile_spu_llvm_with_retry(spu.jit, program);
#else
const auto func = spu.jit->compile(std::move(program));
#endif
if (!func)
{
@ -8903,8 +9005,20 @@ struct spu_llvm_worker
{
spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
}
else if (const auto target = compiler->compile(std::move(func2)))
else
{
#ifdef ARCH_ARM64
const auto target = compile_spu_llvm_with_retry(compiler, func2);
#else
const auto target = compiler->compile(std::move(func2));
#endif
if (!target)
{
spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
break;
}
// Redirect old function (TODO: patch in multiple places)
const s64 rel = reinterpret_cast<u64>(target) - prog->first - 5;
@ -8922,11 +9036,6 @@ struct spu_llvm_worker
atomic_storage<u64>::release(*reinterpret_cast<u64*>(prog->first), result);
}
else
{
spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
break;
}
// Clear fake LS
std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));

View File

@ -62,6 +62,16 @@ const extern spu_decoder<spu_iflag> g_spu_iflag;
#ifdef ARCH_ARM64
#include "Emu/CPU/Backends/AArch64/AArch64JIT.h"
namespace
{
// Per-thread compile context registered for the duration of one SPU LLVM
// compilation (see spu_llvm_compile_scope). Null when no compile is active.
thread_local spu_llvm_compile_context* g_spu_llvm_compile_context = nullptr;
}

// Register (or clear, with nullptr) the calling thread's compile context.
void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept
{
g_spu_llvm_compile_context = context;
}
#endif
class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
@ -1669,6 +1679,15 @@ public:
m_hash_start = hash_start;
}
#ifdef ARCH_ARM64
m_use_tbl2 = !g_spu_llvm_compile_context || g_spu_llvm_compile_context->use_tbl2;
if (g_spu_llvm_compile_context)
{
g_spu_llvm_compile_context->llvm_error.clear();
}
#endif
spu_log.notice("Building function 0x%x... (size %u, %s)", func.entry_point, func.data.size(), m_hash);
m_pos = func.lower_bound;
@ -3478,17 +3497,63 @@ public:
} _jit_guard;
#endif
if (g_cfg.core.spu_debug)
{
// Testing only
m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
}
else
{
m_jit.add(std::move(_module));
}
#ifdef ARCH_ARM64
const bool recoverable = !!g_spu_llvm_compile_context;
m_jit.fin();
if (recoverable)
{
bool added = false;
std::string& llvm_error = g_spu_llvm_compile_context->llvm_error;
if (g_cfg.core.spu_debug)
{
// Testing only
added = m_jit.try_add(std::move(_module), m_spurt->get_cache_path() + "llvm/", llvm_error);
}
else
{
added = m_jit.try_add(std::move(_module), llvm_error);
}
if (!added || !m_jit.try_fin(llvm_error))
{
if (add_to_file)
{
add_loc->cached = 0;
}
return nullptr;
}
}
else
{
if (g_cfg.core.spu_debug)
{
// Testing only
m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
}
else
{
m_jit.add(std::move(_module));
}
m_jit.fin();
}
#else
if (g_cfg.core.spu_debug)
{
// Testing only
m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/");
}
else
{
m_jit.add(std::move(_module));
}
m_jit.fin();
#endif
}
// Register function pointer
const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
@ -6757,6 +6822,73 @@ public:
const auto a = get_vr<u8[16]>(op.ra);
const auto b = get_vr<u8[16]>(op.rb);
#ifdef ARCH_ARM64
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
{
if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
{
if (op.ra == op.rb)
{
if (perm_only)
{
const auto cm = eval(c & 0x0f);
set_vr(op.rt4, tbl(as, cm));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto cm = eval(c & 0x8f);
set_vr(op.rt4, tbx(x, as, cm));
return;
}
if (perm_only)
{
const auto cm = eval(c & 0x1f);
set_vr(op.rt4, tbl2(as, bs, cm));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto cm = eval(c & 0x9f);
set_vr(op.rt4, tbx2(x, as, bs, cm));
return;
}
}
if (op.ra == op.rb && !m_interp_magn)
{
if (perm_only)
{
const auto cm = eval(c & 0x0f);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbl(a, cr));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto cm = eval(c & 0x8f);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbx(x, a, cr));
return;
}
if (perm_only)
{
const auto cm = eval(c & 0x9f);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbl2(a, b, cr));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
// AND should be before XOR so that llvm can combine them into BCAX
// Though for some reason it doesn't seem to be doing that.
const auto cm = eval(c & ~0x60);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbx2(x, a, b, cr));
return;
#else
// Data with swapped endian from a load instruction
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
{
@ -6900,6 +7032,7 @@ public:
set_vr(op.rt4, select_by_bit4(cr, ax, bx));
else
set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
#endif
}
void MPYA(spu_opcode_t op)

View File

@ -77,6 +77,16 @@ struct spu_program
bool operator<(const spu_program& rhs) const noexcept;
};
#ifdef ARCH_ARM64
// Per-compilation state shared between the retry driver and the recompiler.
struct spu_llvm_compile_context
{
	// Whether TBL2/TBX2 emission is allowed for this compilation attempt.
	bool use_tbl2 = true;
	// Fatal error message captured from LLVM recovery; empty on success.
	std::string llvm_error;
};

// Register (or clear, with nullptr) the calling thread's compile context.
void spu_llvm_set_compile_context(spu_llvm_compile_context* context) noexcept;
#endif
class spu_item
{
public: