SPU LLVM: Implement reduced loop

This commit is contained in:
Elad 2026-02-10 18:45:17 +02:00
parent 452a9c42c9
commit 0a12e482f4
5 changed files with 639 additions and 10 deletions

View File

@ -3951,6 +3951,15 @@ public:
erase_stores({args.value...});
}
// Emit a debugger breakpoint at the current insertion point
void debugtrap()
{
	// llvm.debugtrap is a void() intrinsic; declare (or fetch) it on the current module
	const auto void_ty = llvm::Type::getVoidTy(m_context);
	const auto trap_ty = llvm::FunctionType::get(void_ty, {}, false);
	const auto module = m_ir->GetInsertBlock()->getParent()->getParent();
	const auto trap_fn = llvm::cast<llvm::Function>(module->getOrInsertFunction("llvm.debugtrap", trap_ty).getCallee());
	m_ir->CreateCall(trap_fn);
}
template <typename T, typename U>
static auto pshufb(T&& a, U&& b)
{

View File

@ -8544,6 +8544,8 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
if (inst_attr attr = m_inst_attrs[(loop_pc - entry_point) / 4]; attr == inst_attr::none)
{
const u64 hash = loop_pc / 4 + read_from_ptr<be_t<u64>>(func_hash.data());
add_pattern(inst_attr::reduced_loop, loop_pc - result.entry_point, 0, std::make_shared<reduced_loop_t>(pattern));
std::string regs = "{";

View File

@ -139,7 +139,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
spu_recompiler_base::block_info* bb{};
// Current block's entry block
llvm::BasicBlock* block;
llvm::BasicBlock* block{};
// Final block (for PHI nodes, set after completion)
llvm::BasicBlock* block_end{};
@ -155,6 +155,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
// Store instructions
std::array<llvm::StoreInst*, s_reg_max> store{};
bool block_wide_reg_store_elimination = false;
// Store reordering/elimination protection
std::array<usz, s_reg_max> store_context_last_id = fill_array<usz>(0); // Protects against illegal forward ordering
@ -364,7 +365,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
{
if (i != s_reg_lr && i != s_reg_sp && (i < s_reg_80 || i > s_reg_127))
{
m_block->reg[i] = m_ir->CreateLoad(get_reg_type(i), init_reg_fixed(i));
m_block->reg[i] = get_reg_fixed(i, get_reg_type(i));
}
}
@ -709,6 +710,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
if (!reg)
{
if (m_block && m_block->block_wide_reg_store_elimination)
{
fmt::throw_exception("Unexpected load: [%s] at 0x%x (gpr=r%d)", m_hash, m_pos, index);
}
// Load register value if necessary
reg = m_finfo && m_finfo->load[index] ? m_finfo->load[index] : m_ir->CreateLoad(get_reg_type(index), init_reg_fixed(index));
}
@ -920,6 +926,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
if (m_block)
{
if (m_block->block_wide_reg_store_elimination)
{
// Don't save registers for the current block iteration
// Affected optimizations:
// 1. Single-block reduced loop
return;
}
// Keep the store's location in history of gpr preservations
m_block->store_context_last_id[index] = m_block->store_context_ctr[index];
m_block->store_context_first_id[index] = std::min<usz>(m_block->store_context_first_id[index], m_block->store_context_ctr[index]);
@ -2059,6 +2073,43 @@ public:
bool need_check = false;
m_block->bb = &bb;
// [1gJ45f2-0x00a40]: 16.4982% (113258)
// [ZsQTud1-0x0924c]: 6.1202% (42014)
// [ZsQTud1-0x08e54]: 5.6610% (38862)
// [0000000-0x3fffc]: 4.3764% (30043)
// [Zh4tpJM-0x00bcc]: 3.7908% (26023)
// [CFt8hXu-0x063b8]: 3.6177% (24835)
// [8YJCUjv-0x0ad18]: 3.2417% (22254)
// [Try3XHn-0x0f018]: 2.3721% (16284)
// [s6ti9iu-0x07678]: 1.8464% (12675)
// [oyxkAPv-0x0c22c]: 1.7776% (12203)
// [Q0jLqH4-0x00324]: 1.6015% (10994)
static const std::array<std::pair<std::string, u32>, 4> to_nop
{
{ }
};
bool found_block = false;
for (auto& [hash, pos] : to_nop)
{
if (m_hash.find(hash) <= 2 && baddr == pos)
{
found_block = true;
break;
}
}
if (found_block)
{
for (u32 i = 0; i < 100; i++)
{
auto value = m_ir->CreateLoad(get_type<f32>(), spu_ptr(&spu_thread::last_getllar_lsa));
auto mod_val = m_ir->CreateFDiv(value, llvm::ConstantFP::get(value->getType(), 1.1 + i));
m_ir->CreateStore(value, spu_ptr(&spu_thread::last_getllar_lsa));
}
}
if (!bb.preds.empty())
{
// Initialize registers and build PHI nodes if necessary
@ -2174,6 +2225,485 @@ public:
check_state(baddr);
}
const bool is_reduced_loop = m_inst_attrs[(baddr - start) / 4] == inst_attr::reduced_loop;
const auto reduced_loop_info = is_reduced_loop ? std::static_pointer_cast<reduced_loop_t>(ensure(m_patterns.at(baddr - start).info_ptr)) : nullptr;
BasicBlock* block_optimization_phi_parent = nullptr;
const auto block_optimization_inner = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b-loop-it-0x%x", m_pos), m_function) : nullptr;
const auto block_optimization_exit_early = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b-loop-exit-0x%x", m_pos), m_function) : nullptr;
const auto block_optimization_next = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b2-0x%x", m_pos), m_function) : nullptr;
std::array<llvm::PHINode*, s_reg_max> reduced_loop_phi_nodes{};
std::array<llvm::Value*, s_reg_max> reduced_loop_init_regs{};
auto make_reduced_loop_condition = [&](llvm::BasicBlock* optimization_block, bool is_second_time, u32 reserve_iterations)
{
llvm::ICmpInst::Predicate compare{};
switch (reduced_loop_info->cond_val_compare)
{
case CMP_SLESS: compare = ICmpInst::ICMP_SLT; break;
case CMP_SGREATER: compare = ICmpInst::ICMP_SGT; break;
case CMP_EQUAL: compare = ICmpInst::ICMP_EQ; break;
case CMP_LLESS: compare = ICmpInst::ICMP_ULT; break;
case CMP_LGREATER: compare = ICmpInst::ICMP_UGT; break;
case CMP_SGREATER_EQUAL: compare = ICmpInst::ICMP_SGE; break;
case CMP_SLOWER_EQUAL: compare = ICmpInst::ICMP_SLE; break;
case CMP_NOT_EQUAL: compare = ICmpInst::ICMP_NE; break;
case CMP_LGREATER_EQUAL: compare = ICmpInst::ICMP_UGE; break;
case CMP_LLOWER_EQUAL: compare = ICmpInst::ICMP_ULE; break;
{
break;
}
case CMP_UNKNOWN:
case CMP_NOT_EQUAL2:
case CMP_EQUAL2:
default:
{
ensure(false);
break;
}
}
llvm::Value* loop_dictator_before_adjustment{};
llvm::Value* loop_dictator_after_adjustment{};
spu_opcode_t reg_target{};
reg_target.rt = reduced_loop_info->cond_val_register_idx;
if (reg_target.rt != reduced_loop_info->cond_val_register_idx)
{
fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition register index: 0x%llx", reduced_loop_info->cond_val_register_idx);
}
if (!m_block->reg[reg_target.rt])
{
m_block->reg[reg_target.rt] = reduced_loop_init_regs[reg_target.rt];
}
switch (reduced_loop_info->cond_val_mask)
{
case u8{umax}:
{
loop_dictator_before_adjustment = get_scalar(get_vr<u8[16]>(reg_target.rt)).eval(m_ir);
break;
}
case u16{umax}:
{
loop_dictator_before_adjustment = get_scalar(get_vr<u16[8]>(reg_target.rt)).eval(m_ir);
break;
}
case u32{umax}:
{
loop_dictator_before_adjustment = get_scalar(get_vr<u32[4]>(reg_target.rt)).eval(m_ir);
break;
}
case u64{umax}:
{
ensure(false); // TODO
loop_dictator_before_adjustment = get_scalar(get_vr<u64[2]>(reg_target.rt)).eval(m_ir);
break;
}
default:
{
fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition bit mask: 0x%llx", reduced_loop_info->cond_val_mask);
}
}
const u32 type_bits = std::popcount(reduced_loop_info->cond_val_mask);
llvm::Value* cond_val_incr = nullptr;
if (reduced_loop_info->cond_val_incr_is_immediate)
{
cond_val_incr = m_ir->getIntN(type_bits, reduced_loop_info->cond_val_incr & reduced_loop_info->cond_val_mask);
}
else
{
spu_opcode_t reg_incr{};
reg_incr.rt = reduced_loop_info->cond_val_incr;
if (reg_incr.rt != reduced_loop_info->cond_val_incr)
{
fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal increment argument register index: 0x%llx", reduced_loop_info->cond_val_incr);
}
switch (reduced_loop_info->cond_val_mask)
{
case u8{umax}:
{
cond_val_incr = get_scalar(get_vr<u8[16]>(reg_incr.rt)).eval(m_ir);
break;
}
case u16{umax}:
{
cond_val_incr = get_scalar(get_vr<u16[8]>(reg_incr.rt)).eval(m_ir);
break;
}
case u32{umax}:
{
cond_val_incr = get_scalar(get_vr<u32[4]>(reg_incr.rt)).eval(m_ir);
break;
}
case u64{umax}:
{
ensure(false); // TODO
cond_val_incr = get_scalar(get_vr<u64[2]>(reg_incr.rt)).eval(m_ir);
break;
}
}
}
if (reduced_loop_info->cond_val_incr_before_cond && !reduced_loop_info->cond_val_incr_before_cond_taken_in_account)
{
loop_dictator_after_adjustment = m_ir->CreateAdd(loop_dictator_before_adjustment, cond_val_incr);
}
else
{
loop_dictator_after_adjustment = loop_dictator_before_adjustment;
}
llvm::Value* loop_argument = nullptr;
if (reduced_loop_info->cond_val_is_immediate)
{
loop_argument = m_ir->CreateTrunc(m_ir->getInt64(reduced_loop_info->cond_val_min & reduced_loop_info->cond_val_mask), loop_dictator_before_adjustment->getType());
}
else
{
spu_opcode_t reg_target2{};
reg_target2.rt = reduced_loop_info->cond_val_register_argument_idx;
if (reg_target2.rt != reduced_loop_info->cond_val_register_argument_idx)
{
fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition argument register index: 0x%llx", reduced_loop_info->cond_val_register_argument_idx);
}
switch (reduced_loop_info->cond_val_mask)
{
case u8{umax}:
{
loop_argument = get_scalar(get_vr<u8[16]>(reg_target2.rt)).eval(m_ir);
break;
}
case u16{umax}:
{
loop_argument = get_scalar(get_vr<u16[8]>(reg_target2.rt)).eval(m_ir);
break;
}
case u32{umax}:
{
loop_argument = get_scalar(get_vr<u32[4]>(reg_target2.rt)).eval(m_ir);
break;
}
case u64{umax}:
{
ensure(false); // TODO
loop_argument = get_scalar(get_vr<u64[2]>(reg_target2.rt)).eval(m_ir);
break;
}
}
}
llvm::Value* condition = nullptr;
if (reserve_iterations == 1)
{
condition = m_ir->CreateICmp(compare, loop_dictator_after_adjustment, loop_argument);
}
// else if ((reduced_loop_info->cond_val_compare == CMP_LGREATER || (reduced_loop_info->cond_val_compare == CMP_LGREATER_EQUAL && reduced_loop_info->cond_val_is_immediate && reduced_loop_info->cond_val_incr)) && cond_val_incr->getSExtValue() < 0)
// {
// const auto cond_val_incr_multiplied = m_ir->CreateMul(cond_val_incr, reserve_iterations - 1);
// condition = m_ir->CreateICmp(compare, select(m_ir->CreateICmpUGE(cond_val_incr_multiplied, loop_dictator_after_adjustment), m_ir->CreateAdd(loop_dictator_after_adjustment, cond_val_incr_multiplied), m_ir->getIntN(type_bits, 0)), loop_argument);
// }
else
{
//debugtrap();
llvm::Value* prev_it = loop_dictator_after_adjustment;
for (u32 i = 0; i < reserve_iterations; i++)
{
if (i)
{
prev_it = m_ir->CreateAdd(prev_it, cond_val_incr);
}
const auto also_cond = m_ir->CreateICmp(compare, prev_it, loop_argument);
condition = condition ? m_ir->CreateAnd(condition, also_cond) : also_cond;
}
}
if (!is_second_time)
{
for (u32 i = 0, count = 0, prev_i = umax;; i++)
{
const bool is_last = !(count <= 20 && i < s_reg_max);
if (is_last || m_block->is_gpr_not_NaN_hint(i))
{
count++;
if (prev_i == umax)
{
if (!is_last)
{
prev_i = i;
continue;
}
break;
}
auto access_gpr = [&](u32 index)
{
spu_opcode_t op_arg{};
op_arg.ra = index;
return get_vr<u32[4]>(op_arg.ra);
};
// OR LSB to convert infinity to NaN
llvm::Value* arg1 = bitcast<f32[4]>(access_gpr(prev_i) | splat<u32[4]>(1)).eval(m_ir);
llvm::Value* arg2 = is_last ? arg1 : bitcast<f32[4]>(access_gpr(i) | splat<u32[4]>(1)).eval(m_ir);
llvm::Value* acc = m_ir->CreateSExt(m_ir->CreateFCmpUNO(arg1, arg2), get_type<s32[4]>());
// Pattern for PTEST
acc = m_ir->CreateBitCast(acc, get_type<u64[2]>());
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
for (u64 i = 1; i < 2; i++)
{
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
}
// Compare result with zero
const auto cond_nans = m_ir->CreateICmpEQ(elem, m_ir->getInt64(0));
condition = m_ir->CreateAnd(cond_nans, condition);
prev_i = umax;
}
}
}
//condition = m_ir->getInt1(0);
m_ir->CreateCondBr(condition, optimization_block, block_optimization_next);
};
if (is_reduced_loop)
{
for (u32 i = 0; i < s_reg_max; i++)
{
llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type<f64[4]>() : get_reg_type(i);
if (i < reduced_loop_info->loop_dicts.size() && (reduced_loop_info->loop_dicts.test(i) || reduced_loop_info->loop_writes.test(i)))
{
// Connect registers which are used and then modified by the block
auto value = m_block->reg[i];
if (!value || value->getType() != type)
{
value = get_reg_fixed(i, type);
}
reduced_loop_init_regs[i] = value;
}
else if (i < reduced_loop_info->loop_dicts.size() && reduced_loop_info->loop_args.test(i))
{
// Load registers used as arguments of the loop
if (!m_block->reg[i])
{
m_block->reg[i] = get_reg_fixed(i, type);
}
}
}
const auto prev_insert_block = m_ir->GetInsertBlock();
block_optimization_phi_parent = prev_insert_block;
make_reduced_loop_condition(block_optimization_inner, false, 2);
m_ir->SetInsertPoint(block_optimization_inner);
for (u32 i = 0; i < s_reg_max; i++)
{
if (auto init_val = reduced_loop_init_regs[i])
{
llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type<f64[4]>() : get_reg_type(i);
const auto _phi = m_ir->CreatePHI(init_val->getType(), 2, fmt::format("reduced_0x%05x_r%u", baddr, i));
_phi->addIncoming(init_val, prev_insert_block);
reduced_loop_phi_nodes[i] = _phi;
m_block->reg[i] = _phi;
}
}
m_block->block_wide_reg_store_elimination = true;
}
// Instructions emitting optimizations: Loop iteration is not the last
m_pos = baddr;
// Masked opcode -> register modification times
std::map<u32, std::pair<llvm::Value*, std::array<u32, 3>>> masked_times;
std::array<u32, s_reg_max + 1> reg_states{};
u32 s_reg_state{1};
for (u32 iteration_emit = 0; is_reduced_loop; m_pos += 4)
{
if (m_pos != baddr && m_block_info[m_pos / 4] && reduced_loop_info->loop_end < m_pos)
{
fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(1) too early at 0x%x", m_pos);
}
if (!(m_pos >= start && m_pos < end))
{
fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(2) too early at 0x%x", m_pos);
}
if (m_ir->GetInsertBlock()->getTerminator())
{
fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(3) too early at 0x%x", m_pos);
}
const u32 op = std::bit_cast<be_t<u32>>(func.data[(m_pos - start) / 4]);
const auto itype = g_spu_itype.decode(op);
if (itype & spu_itype::branch)
{
bool branches_back = false;
for (u32 dest : op_branch_targets(m_pos, spu_opcode_t{op}))
{
branches_back = branches_back || dest == baddr;
}
if (!branches_back)
{
continue;
}
iteration_emit++;
if (iteration_emit < 2)
{
// Reset mpos (with fixup)
m_pos = baddr - 4;
continue;
}
// Optimization block body
const auto block_inner = m_ir->GetInsertBlock();
std::array<llvm::Value*, s_reg_max> block_reg_results{};
for (u32 i = 0; i < s_reg_max; i++)
{
if (auto phi = reduced_loop_phi_nodes[i])
{
const auto type = phi->getType() == get_type<f64[4]>() ? get_type<f64[4]>() : get_reg_type(i);
block_reg_results[i] = ensure(get_reg_fixed(i, type));
phi->addIncoming(block_reg_results[i], block_inner);
}
}
ensure(!!m_block->reg[reduced_loop_info->cond_val_register_idx]);
make_reduced_loop_condition(block_optimization_inner, true, 2);
m_ir->SetInsertPoint(block_optimization_next);
m_block->block_wide_reg_store_elimination = false;
for (u32 i = 0; i < s_reg_max; i++)
{
if (const auto loop_value = block_reg_results[i])
{
const auto phi = m_ir->CreatePHI(loop_value->getType(), 2, fmt::format("redres_0x%05x_r%u", baddr, i));
phi->addIncoming(loop_value, block_inner);
phi->addIncoming(reduced_loop_init_regs[i], block_optimization_phi_parent);
m_block->reg[i] = phi;
}
}
break;
}
if (!op)
{
fmt::throw_exception("LLVM: Reduced Loop Pattern: [%s] Unexpected fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_hash, m_pos, m_entry, m_function_queue[0]);
}
const auto [reg_rt, reg_access, masked_op] = op_register_targets(m_pos, spu_opcode_t{op});
bool erased = false;
const auto inst_times = std::array<u32, 3>{reg_states[reg_access[0]], reg_states[reg_access[1]], reg_states[reg_access[2]]};
// Try to reuse the result of the previous iteration (if argument registers have not been modified)
if (reg_rt < 128 && masked_times.count(masked_op) && masked_times[masked_op].first && m_inst_attrs[(m_pos - start) / 4] == inst_attr::none)
{
auto times = masked_times[masked_op].second;
bool is_ok = true;
for (u32 regi = 0; regi < 3; regi++)
{
if (reg_access[regi] < 128 && times[regi] != inst_times[regi])
{
is_ok = false;
}
}
if (is_ok)
{
m_block->reg[reg_rt] = masked_times[masked_op].first;
erased = true;
}
}
if (reg_rt < 128)
{
reg_states[reg_rt] = s_reg_state++;
}
if (erased)
{
continue;
}
m_next_op = 0;
masked_times[masked_op] = {};
switch (m_inst_attrs[(m_pos - start) / 4])
{
case inst_attr::putllc0:
{
putllc0_pattern(func, m_patterns.at(m_pos - start).info);
continue;
}
case inst_attr::putllc16:
{
putllc16_pattern(func, m_patterns.at(m_pos - start).info);
continue;
}
case inst_attr::omit:
{
// TODO
continue;
}
default: break;
}
// Execute recompiler function (TODO)
(this->*decode(op))({op});
if (reg_rt < 128 && itype & spu_itype::pure && reg_rt != reg_access[0] && reg_rt != reg_access[1] && reg_rt != reg_access[2])
{
masked_times[masked_op] = {ensure(m_block->reg[reg_rt]), inst_times};
}
}
// Emit instructions
for (m_pos = baddr; m_pos >= start && m_pos < end && !m_ir->GetInsertBlock()->getTerminator(); m_pos += 4)
{
@ -7824,13 +8354,51 @@ public:
}
}
value_t<u64> addr = eval(zext<u64>(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
const auto a = get_vr(op.ra);
if (auto [ok, x, y] = match_expr(a, match<u32[4]>() + match<u32[4]>()); ok)
{
if (auto [ok1, data] = get_const_vector(x.value, m_pos + 1); ok1 && data._u32[3] % 16 == 0)
{
value_t<u64> addr = eval(zext<u64>(extract(y, 3) & 0x3fff0) + ((get_imm<u64>(op.si10) << 4) + splat<u64>(data._u32[3] & 0x3fff0)));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
return;
}
if (auto [ok2, data] = get_const_vector(y.value, m_pos + 2); ok2 && data._u32[3] % 16 == 0)
{
value_t<u64> addr = eval(zext<u64>(extract(x, 3) & 0x3fff0) + ((get_imm<u64>(op.si10) << 4) + splat<u64>(data._u32[3] & 0x3fff0)));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
return;
}
}
value_t<u64> addr = eval(zext<u64>(extract(a, 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
}
// Load Quadword (d-form): loads 16 bytes from LS at ((ra preferred slot) + si10*16) & 0x3fff0
void LQD(spu_opcode_t op)
{
	const auto a = get_vr(op.ra);

	// If the address register is a sum with a constant, 16-byte-aligned term,
	// fold that constant into the displacement so the variable part stays simple.
	// NOTE: the stale pre-optimization `addr` computation was removed here — it
	// duplicated the declaration at the end of the function (redefinition).
	if (auto [ok, x1, y1] = match_expr(a, match<u32[4]>() + match<u32[4]>()); ok)
	{
		if (auto [ok1, data] = get_const_vector(x1.value, m_pos + 1); ok1 && data._u32[3] % 16 == 0)
		{
			value_t<u64> addr = eval(zext<u64>(extract(y1, 3) & 0x3fff0) + ((get_imm<u64>(op.si10) << 4) + splat<u64>(data._u32[3] & 0x3fff0)));
			set_vr(op.rt, make_load_ls(addr));
			return;
		}

		if (auto [ok2, data] = get_const_vector(y1.value, m_pos + 2); ok2 && data._u32[3] % 16 == 0)
		{
			value_t<u64> addr = eval(zext<u64>(extract(x1, 3) & 0x3fff0) + ((get_imm<u64>(op.si10) << 4) + splat<u64>(data._u32[3] & 0x3fff0)));
			set_vr(op.rt, make_load_ls(addr));
			return;
		}
	}

	// Generic path: mask the preferred slot to LS quadword bounds and add the scaled displacement
	value_t<u64> addr = eval(zext<u64>(extract(a, 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
	set_vr(op.rt, make_load_ls(addr));
}

View File

@ -56,6 +56,7 @@ constexpr u32 spu_decode(u32 inst)
}
std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op);
std::tuple<u32, std::array<u32, 3>, u32> op_register_targets(u32 /*pc*/, spu_opcode_t op);
// SPU decoder object. D provides functions. T is function pointer type returned.
template <typename D, typename T = decltype(&D::UNK)>

View File

@ -495,7 +495,8 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write);
extern thread_local u64 g_tls_fault_spu;
const spu_decoder<spu_itype> s_spu_itype;
const extern spu_decoder<spu_itype> g_spu_itype;
const extern spu_decoder<spu_iflag> g_spu_iflag;
namespace vm
{
@ -598,7 +599,7 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
{
std::array<u32, 2> res{spu_branch_target(pc + 4), umax};
switch (const auto type = s_spu_itype.decode(op.opcode))
switch (const auto type = g_spu_itype.decode(op.opcode))
{
case spu_itype::BR:
case spu_itype::BRA:
@ -639,6 +640,54 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
return res;
}
// Returns {destination register (umax if none), source registers (128 = unused slot),
// opcode with the destination field cleared (0 for zero-register-modifying instructions)}
std::tuple<u32, std::array<u32, 3>, u32> op_register_targets(u32 /*pc*/, spu_opcode_t op)
{
	u32 dest = umax;
	std::array<u32, 3> srcs{128, 128, 128};

	const auto itype = g_spu_itype.decode(op.opcode);

	if (itype & spu_itype::zregmod)
	{
		// No GPR is written: report no destination and a zeroed opcode key
		return {dest, srcs, 0};
	}

	// Quad-operand instructions encode their destination in rt4, everything else in rt
	const bool is_quad = itype & spu_itype::_quadrop ? true : false;
	dest = is_quad ? +op.rt4 : +op.rt;

	// Build the opcode key with the destination field masked out
	spu_opcode_t op_cleared = op;

	if (is_quad)
	{
		op_cleared.rt4 = 0;
	}
	else
	{
		op_cleared.rt = 0;
	}

	if (auto iflags = g_spu_iflag.decode(op.opcode))
	{
		// Fill in each source slot the instruction actually reads
		if (+iflags & +spu_iflag::use_ra)
		{
			srcs[0] = op.ra;
		}

		if (+iflags & +spu_iflag::use_rb)
		{
			srcs[1] = op.rb;
		}

		if (+iflags & +spu_iflag::use_rc)
		{
			srcs[2] = op.rc;
		}
	}

	return {dest, srcs, op_cleared.opcode};
}
void spu_int_ctrl_t::set(u64 ints)
{
// leave only enabled interrupts
@ -988,7 +1037,7 @@ std::vector<std::pair<u32, u32>> spu_thread::dump_callstack_list() const
passed[i / 4] = true;
const spu_opcode_t op{_ref<u32>(i)};
const auto type = s_spu_itype.decode(op.opcode);
const auto type = g_spu_itype.decode(op.opcode);
if (start == 0 && type == spu_itype::STQD && op.ra == 1u && op.rt == 0u)
{
@ -3761,7 +3810,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span<const u8> ls_ptr, u32 base_add
const u32 addr0 = spu_branch_target(addr);
const spu_opcode_t op{read_from_ptr<be_t<u32>>(ls_ptr, addr0 - base_addr)};
const auto type = s_spu_itype.decode(op.opcode);
const auto type = g_spu_itype.decode(op.opcode);
if (type == spu_itype::UNK || !op.opcode)
{
@ -3907,7 +3956,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span<const u8> ls_ptr, u32 base_add
// Test the validity of a single instruction of the optional target
// This function can't be too slow and is unlikely to improve results by a great deal
const u32 op0 = read_from_ptr<be_t<u32>>(ls_ptr, route_pc - base_addr);
const spu_itype::type type0 = s_spu_itype.decode(op0);
const spu_itype::type type0 = g_spu_itype.decode(op0);
if (type0 == spu_itype::UNK || !op0)
{
@ -6878,7 +6927,7 @@ spu_exec_object spu_thread::capture_memory_as_elf(std::span<spu_memory_segment_d
const u32 op = read_from_ptr<be_t<u32>>(all_data, pc0 - 4);
// Try to find function entry (if they are placed sequentially search for BI $LR of previous function)
if (!op || op == 0x35000000u || s_spu_itype.decode(op) == spu_itype::UNK)
if (!op || op == 0x35000000u || g_spu_itype.decode(op) == spu_itype::UNK)
{
if (is_exec_code(pc0, { all_data.data(), SPU_LS_SIZE }))
break;