mirror of
https://github.com/RPCS3/rpcs3.git
synced 2026-03-26 12:28:34 -06:00
SPU LLVM: Implement reduced loop
This commit is contained in:
parent
452a9c42c9
commit
0a12e482f4
@ -3951,6 +3951,15 @@ public:
|
||||
erase_stores({args.value...});
|
||||
}
|
||||
|
||||
// Debug breakpoint
|
||||
void debugtrap()
|
||||
{
|
||||
const auto _rty = llvm::Type::getVoidTy(m_context);
|
||||
const auto type = llvm::FunctionType::get(_rty, {}, false);
|
||||
const auto func = llvm::cast<llvm::Function>(m_ir->GetInsertBlock()->getParent()->getParent()->getOrInsertFunction("llvm.debugtrap", type).getCallee());
|
||||
m_ir->CreateCall(func);
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
static auto pshufb(T&& a, U&& b)
|
||||
{
|
||||
|
||||
@ -8544,6 +8544,8 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
|
||||
|
||||
if (inst_attr attr = m_inst_attrs[(loop_pc - entry_point) / 4]; attr == inst_attr::none)
|
||||
{
|
||||
const u64 hash = loop_pc / 4 + read_from_ptr<be_t<u64>>(func_hash.data());
|
||||
|
||||
add_pattern(inst_attr::reduced_loop, loop_pc - result.entry_point, 0, std::make_shared<reduced_loop_t>(pattern));
|
||||
|
||||
std::string regs = "{";
|
||||
|
||||
@ -139,7 +139,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
||||
spu_recompiler_base::block_info* bb{};
|
||||
|
||||
// Current block's entry block
|
||||
llvm::BasicBlock* block;
|
||||
llvm::BasicBlock* block{};
|
||||
|
||||
// Final block (for PHI nodes, set after completion)
|
||||
llvm::BasicBlock* block_end{};
|
||||
@ -155,6 +155,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
||||
|
||||
// Store instructions
|
||||
std::array<llvm::StoreInst*, s_reg_max> store{};
|
||||
bool block_wide_reg_store_elimination = false;
|
||||
|
||||
// Store reordering/elimination protection
|
||||
std::array<usz, s_reg_max> store_context_last_id = fill_array<usz>(0); // Protects against illegal forward ordering
|
||||
@ -364,7 +365,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
||||
{
|
||||
if (i != s_reg_lr && i != s_reg_sp && (i < s_reg_80 || i > s_reg_127))
|
||||
{
|
||||
m_block->reg[i] = m_ir->CreateLoad(get_reg_type(i), init_reg_fixed(i));
|
||||
m_block->reg[i] = get_reg_fixed(i, get_reg_type(i));
|
||||
}
|
||||
}
|
||||
|
||||
@ -709,6 +710,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
||||
|
||||
if (!reg)
|
||||
{
|
||||
if (m_block && m_block->block_wide_reg_store_elimination)
|
||||
{
|
||||
fmt::throw_exception("Unexpected load: [%s] at 0x%x (gpr=r%d)", m_hash, m_pos, index);
|
||||
}
|
||||
|
||||
// Load register value if necessary
|
||||
reg = m_finfo && m_finfo->load[index] ? m_finfo->load[index] : m_ir->CreateLoad(get_reg_type(index), init_reg_fixed(index));
|
||||
}
|
||||
@ -920,6 +926,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
||||
|
||||
if (m_block)
|
||||
{
|
||||
if (m_block->block_wide_reg_store_elimination)
|
||||
{
|
||||
// Don't save registers for the current block iteration
|
||||
// Affected optimizations:
|
||||
// 1. Single-block reduced loop
|
||||
return;
|
||||
}
|
||||
|
||||
// Keep the store's location in history of gpr preservaions
|
||||
m_block->store_context_last_id[index] = m_block->store_context_ctr[index];
|
||||
m_block->store_context_first_id[index] = std::min<usz>(m_block->store_context_first_id[index], m_block->store_context_ctr[index]);
|
||||
@ -2059,6 +2073,43 @@ public:
|
||||
bool need_check = false;
|
||||
m_block->bb = &bb;
|
||||
|
||||
// [1gJ45f2-0x00a40]: 16.4982% (113258)
|
||||
// [ZsQTud1-0x0924c]: 6.1202% (42014)
|
||||
// [ZsQTud1-0x08e54]: 5.6610% (38862)
|
||||
// [0000000-0x3fffc]: 4.3764% (30043)
|
||||
// [Zh4tpJM-0x00bcc]: 3.7908% (26023)
|
||||
// [CFt8hXu-0x063b8]: 3.6177% (24835)
|
||||
// [8YJCUjv-0x0ad18]: 3.2417% (22254)
|
||||
// [Try3XHn-0x0f018]: 2.3721% (16284)
|
||||
// [s6ti9iu-0x07678]: 1.8464% (12675)
|
||||
// [oyxkAPv-0x0c22c]: 1.7776% (12203)
|
||||
// [Q0jLqH4-0x00324]: 1.6015% (10994)
|
||||
static const std::array<std::pair<std::string, u32>, 4> to_nop
|
||||
{
|
||||
{ }
|
||||
};
|
||||
|
||||
bool found_block = false;
|
||||
|
||||
for (auto& [hash, pos] : to_nop)
|
||||
{
|
||||
if (m_hash.find(hash) <= 2 && baddr == pos)
|
||||
{
|
||||
found_block = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found_block)
|
||||
{
|
||||
for (u32 i = 0; i < 100; i++)
|
||||
{
|
||||
auto value = m_ir->CreateLoad(get_type<f32>(), spu_ptr(&spu_thread::last_getllar_lsa));
|
||||
auto mod_val = m_ir->CreateFDiv(value, llvm::ConstantFP::get(value->getType(), 1.1 + i));
|
||||
m_ir->CreateStore(value, spu_ptr(&spu_thread::last_getllar_lsa));
|
||||
}
|
||||
}
|
||||
|
||||
if (!bb.preds.empty())
|
||||
{
|
||||
// Initialize registers and build PHI nodes if necessary
|
||||
@ -2174,6 +2225,485 @@ public:
|
||||
check_state(baddr);
|
||||
}
|
||||
|
||||
const bool is_reduced_loop = m_inst_attrs[(baddr - start) / 4] == inst_attr::reduced_loop;
|
||||
const auto reduced_loop_info = is_reduced_loop ? std::static_pointer_cast<reduced_loop_t>(ensure(m_patterns.at(baddr - start).info_ptr)) : nullptr;
|
||||
|
||||
BasicBlock* block_optimization_phi_parent = nullptr;
|
||||
const auto block_optimization_inner = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b-loop-it-0x%x", m_pos), m_function) : nullptr;
|
||||
const auto block_optimization_exit_early = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b-loop-exit-0x%x", m_pos), m_function) : nullptr;
|
||||
const auto block_optimization_next = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b2-0x%x", m_pos), m_function) : nullptr;
|
||||
|
||||
std::array<llvm::PHINode*, s_reg_max> reduced_loop_phi_nodes{};
|
||||
std::array<llvm::Value*, s_reg_max> reduced_loop_init_regs{};
|
||||
|
||||
auto make_reduced_loop_condition = [&](llvm::BasicBlock* optimization_block, bool is_second_time, u32 reserve_iterations)
|
||||
{
|
||||
llvm::ICmpInst::Predicate compare{};
|
||||
|
||||
switch (reduced_loop_info->cond_val_compare)
|
||||
{
|
||||
case CMP_SLESS: compare = ICmpInst::ICMP_SLT; break;
|
||||
case CMP_SGREATER: compare = ICmpInst::ICMP_SGT; break;
|
||||
case CMP_EQUAL: compare = ICmpInst::ICMP_EQ; break;
|
||||
case CMP_LLESS: compare = ICmpInst::ICMP_ULT; break;
|
||||
case CMP_LGREATER: compare = ICmpInst::ICMP_UGT; break;
|
||||
case CMP_SGREATER_EQUAL: compare = ICmpInst::ICMP_SGE; break;
|
||||
case CMP_SLOWER_EQUAL: compare = ICmpInst::ICMP_SLE; break;
|
||||
case CMP_NOT_EQUAL: compare = ICmpInst::ICMP_NE; break;
|
||||
case CMP_LGREATER_EQUAL: compare = ICmpInst::ICMP_UGE; break;
|
||||
case CMP_LLOWER_EQUAL: compare = ICmpInst::ICMP_ULE; break;
|
||||
{
|
||||
break;
|
||||
}
|
||||
case CMP_UNKNOWN:
|
||||
case CMP_NOT_EQUAL2:
|
||||
case CMP_EQUAL2:
|
||||
default:
|
||||
{
|
||||
ensure(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
llvm::Value* loop_dictator_before_adjustment{};
|
||||
llvm::Value* loop_dictator_after_adjustment{};
|
||||
|
||||
spu_opcode_t reg_target{};
|
||||
reg_target.rt = reduced_loop_info->cond_val_register_idx;
|
||||
|
||||
if (reg_target.rt != reduced_loop_info->cond_val_register_idx)
|
||||
{
|
||||
fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition register index: 0x%llx", reduced_loop_info->cond_val_register_idx);
|
||||
}
|
||||
|
||||
if (!m_block->reg[reg_target.rt])
|
||||
{
|
||||
m_block->reg[reg_target.rt] = reduced_loop_init_regs[reg_target.rt];
|
||||
}
|
||||
|
||||
switch (reduced_loop_info->cond_val_mask)
|
||||
{
|
||||
case u8{umax}:
|
||||
{
|
||||
loop_dictator_before_adjustment = get_scalar(get_vr<u8[16]>(reg_target.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
case u16{umax}:
|
||||
{
|
||||
loop_dictator_before_adjustment = get_scalar(get_vr<u16[8]>(reg_target.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
case u32{umax}:
|
||||
{
|
||||
loop_dictator_before_adjustment = get_scalar(get_vr<u32[4]>(reg_target.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
case u64{umax}:
|
||||
{
|
||||
ensure(false); // TODO
|
||||
loop_dictator_before_adjustment = get_scalar(get_vr<u64[2]>(reg_target.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition bit mask: 0x%llx", reduced_loop_info->cond_val_mask);
|
||||
}
|
||||
}
|
||||
|
||||
const u32 type_bits = std::popcount(reduced_loop_info->cond_val_mask);
|
||||
|
||||
llvm::Value* cond_val_incr = nullptr;
|
||||
|
||||
if (reduced_loop_info->cond_val_incr_is_immediate)
|
||||
{
|
||||
cond_val_incr = m_ir->getIntN(type_bits, reduced_loop_info->cond_val_incr & reduced_loop_info->cond_val_mask);
|
||||
}
|
||||
else
|
||||
{
|
||||
spu_opcode_t reg_incr{};
|
||||
reg_incr.rt = reduced_loop_info->cond_val_incr;
|
||||
|
||||
if (reg_incr.rt != reduced_loop_info->cond_val_incr)
|
||||
{
|
||||
fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal increment arguemnt register index: 0x%llx", reduced_loop_info->cond_val_incr);
|
||||
}
|
||||
switch (reduced_loop_info->cond_val_mask)
|
||||
{
|
||||
case u8{umax}:
|
||||
{
|
||||
cond_val_incr = get_scalar(get_vr<u8[16]>(reg_incr.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
case u16{umax}:
|
||||
{
|
||||
cond_val_incr = get_scalar(get_vr<u16[8]>(reg_incr.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
case u32{umax}:
|
||||
{
|
||||
cond_val_incr = get_scalar(get_vr<u32[4]>(reg_incr.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
case u64{umax}:
|
||||
{
|
||||
ensure(false); // TODO
|
||||
cond_val_incr = get_scalar(get_vr<u64[2]>(reg_incr.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (reduced_loop_info->cond_val_incr_before_cond && !reduced_loop_info->cond_val_incr_before_cond_taken_in_account)
|
||||
{
|
||||
loop_dictator_after_adjustment = m_ir->CreateAdd(loop_dictator_before_adjustment, cond_val_incr);
|
||||
}
|
||||
else
|
||||
{
|
||||
loop_dictator_after_adjustment = loop_dictator_before_adjustment;
|
||||
}
|
||||
|
||||
llvm::Value* loop_argument = nullptr;
|
||||
|
||||
if (reduced_loop_info->cond_val_is_immediate)
|
||||
{
|
||||
loop_argument = m_ir->CreateTrunc(m_ir->getInt64(reduced_loop_info->cond_val_min & reduced_loop_info->cond_val_mask), loop_dictator_before_adjustment->getType());
|
||||
}
|
||||
else
|
||||
{
|
||||
spu_opcode_t reg_target2{};
|
||||
reg_target2.rt = reduced_loop_info->cond_val_register_argument_idx;
|
||||
|
||||
if (reg_target2.rt != reduced_loop_info->cond_val_register_argument_idx)
|
||||
{
|
||||
fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition arguemnt register index: 0x%llx", reduced_loop_info->cond_val_register_argument_idx);
|
||||
}
|
||||
|
||||
switch (reduced_loop_info->cond_val_mask)
|
||||
{
|
||||
case u8{umax}:
|
||||
{
|
||||
loop_argument = get_scalar(get_vr<u8[16]>(reg_target2.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
case u16{umax}:
|
||||
{
|
||||
loop_argument = get_scalar(get_vr<u16[8]>(reg_target2.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
case u32{umax}:
|
||||
{
|
||||
loop_argument = get_scalar(get_vr<u32[4]>(reg_target2.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
case u64{umax}:
|
||||
{
|
||||
ensure(false); // TODO
|
||||
loop_argument = get_scalar(get_vr<u64[2]>(reg_target2.rt)).eval(m_ir);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llvm::Value* condition = nullptr;
|
||||
|
||||
if (reserve_iterations == 1)
|
||||
{
|
||||
condition = m_ir->CreateICmp(compare, loop_dictator_after_adjustment, loop_argument);
|
||||
}
|
||||
// else if ((reduced_loop_info->cond_val_compare == CMP_LGREATER || (reduced_loop_info->cond_val_compare == CMP_LGREATER_EQUAL && reduced_loop_info->cond_val_is_immediate && reduced_loop_info->cond_val_incr)) && cond_val_incr->getSExtValue() < 0)
|
||||
// {
|
||||
// const auto cond_val_incr_multiplied = m_ir->CreateMul(cond_val_incr, reserve_iterations - 1);
|
||||
// condition = m_ir->CreateICmp(compare, select(m_ir->CreateICmpUGE(cond_val_incr_multiplied, loop_dictator_after_adjustment), m_ir->CreateAdd(loop_dictator_after_adjustment, cond_val_incr_multiplied), m_ir->getIntN(type_bits, 0)), loop_argument);
|
||||
// }
|
||||
else
|
||||
{
|
||||
//debugtrap();
|
||||
|
||||
llvm::Value* prev_it = loop_dictator_after_adjustment;
|
||||
|
||||
for (u32 i = 0; i < reserve_iterations; i++)
|
||||
{
|
||||
if (i)
|
||||
{
|
||||
prev_it = m_ir->CreateAdd(prev_it, cond_val_incr);
|
||||
}
|
||||
|
||||
const auto also_cond = m_ir->CreateICmp(compare, prev_it, loop_argument);
|
||||
condition = condition ? m_ir->CreateAnd(condition, also_cond) : also_cond;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_second_time)
|
||||
{
|
||||
for (u32 i = 0, count = 0, prev_i = umax;; i++)
|
||||
{
|
||||
const bool is_last = !(count <= 20 && i < s_reg_max);
|
||||
|
||||
if (is_last || m_block->is_gpr_not_NaN_hint(i))
|
||||
{
|
||||
count++;
|
||||
|
||||
if (prev_i == umax)
|
||||
{
|
||||
if (!is_last)
|
||||
{
|
||||
prev_i = i;
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
auto access_gpr = [&](u32 index)
|
||||
{
|
||||
spu_opcode_t op_arg{};
|
||||
op_arg.ra = index;
|
||||
return get_vr<u32[4]>(op_arg.ra);
|
||||
};
|
||||
|
||||
// OR LSB to convert infinity to NaN
|
||||
llvm::Value* arg1 = bitcast<f32[4]>(access_gpr(prev_i) | splat<u32[4]>(1)).eval(m_ir);
|
||||
llvm::Value* arg2 = is_last ? arg1 : bitcast<f32[4]>(access_gpr(i) | splat<u32[4]>(1)).eval(m_ir);
|
||||
|
||||
llvm::Value* acc = m_ir->CreateSExt(m_ir->CreateFCmpUNO(arg1, arg2), get_type<s32[4]>());
|
||||
|
||||
// Pattern for PTEST
|
||||
acc = m_ir->CreateBitCast(acc, get_type<u64[2]>());
|
||||
|
||||
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
|
||||
|
||||
for (u64 i = 1; i < 2; i++)
|
||||
{
|
||||
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
|
||||
}
|
||||
|
||||
// Compare result with zero
|
||||
const auto cond_nans = m_ir->CreateICmpEQ(elem, m_ir->getInt64(0));
|
||||
condition = m_ir->CreateAnd(cond_nans, condition);
|
||||
prev_i = umax;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//condition = m_ir->getInt1(0);
|
||||
|
||||
m_ir->CreateCondBr(condition, optimization_block, block_optimization_next);
|
||||
};
|
||||
|
||||
if (is_reduced_loop)
|
||||
{
|
||||
for (u32 i = 0; i < s_reg_max; i++)
|
||||
{
|
||||
llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type<f64[4]>() : get_reg_type(i);
|
||||
|
||||
if (i < reduced_loop_info->loop_dicts.size() && (reduced_loop_info->loop_dicts.test(i) || reduced_loop_info->loop_writes.test(i)))
|
||||
{
|
||||
// Connect registers which are used and then modified by the block
|
||||
auto value = m_block->reg[i];
|
||||
|
||||
if (!value || value->getType() != type)
|
||||
{
|
||||
value = get_reg_fixed(i, type);
|
||||
}
|
||||
|
||||
reduced_loop_init_regs[i] = value;
|
||||
}
|
||||
else if (i < reduced_loop_info->loop_dicts.size() && reduced_loop_info->loop_args.test(i))
|
||||
{
|
||||
// Load registers used as arguments of the loop
|
||||
if (!m_block->reg[i])
|
||||
{
|
||||
m_block->reg[i] = get_reg_fixed(i, type);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const auto prev_insert_block = m_ir->GetInsertBlock();
|
||||
|
||||
block_optimization_phi_parent = prev_insert_block;
|
||||
|
||||
make_reduced_loop_condition(block_optimization_inner, false, 2);
|
||||
m_ir->SetInsertPoint(block_optimization_inner);
|
||||
|
||||
for (u32 i = 0; i < s_reg_max; i++)
|
||||
{
|
||||
if (auto init_val = reduced_loop_init_regs[i])
|
||||
{
|
||||
llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type<f64[4]>() : get_reg_type(i);
|
||||
|
||||
const auto _phi = m_ir->CreatePHI(init_val->getType(), 2, fmt::format("reduced_0x%05x_r%u", baddr, i));
|
||||
_phi->addIncoming(init_val, prev_insert_block);
|
||||
|
||||
reduced_loop_phi_nodes[i] = _phi;
|
||||
m_block->reg[i] = _phi;
|
||||
}
|
||||
}
|
||||
|
||||
m_block->block_wide_reg_store_elimination = true;
|
||||
}
|
||||
|
||||
// Instructions emitting optimizations: Loop iteration is not the last
|
||||
m_pos = baddr;
|
||||
|
||||
// Masked opcodde -> register modification times
|
||||
std::map<u32, std::pair<llvm::Value*, std::array<u32, 3>>> masked_times;
|
||||
std::array<u32, s_reg_max + 1> reg_states{};
|
||||
u32 s_reg_state{1};
|
||||
|
||||
for (u32 iteration_emit = 0; is_reduced_loop; m_pos += 4)
|
||||
{
|
||||
if (m_pos != baddr && m_block_info[m_pos / 4] && reduced_loop_info->loop_end < m_pos)
|
||||
{
|
||||
fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(1) too early at 0x%x", m_pos);
|
||||
}
|
||||
|
||||
if (!(m_pos >= start && m_pos < end))
|
||||
{
|
||||
fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(2) too early at 0x%x", m_pos);
|
||||
}
|
||||
|
||||
if (m_ir->GetInsertBlock()->getTerminator())
|
||||
{
|
||||
fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(3) too early at 0x%x", m_pos);
|
||||
}
|
||||
|
||||
const u32 op = std::bit_cast<be_t<u32>>(func.data[(m_pos - start) / 4]);
|
||||
const auto itype = g_spu_itype.decode(op);
|
||||
|
||||
if (itype & spu_itype::branch)
|
||||
{
|
||||
bool branches_back = false;
|
||||
|
||||
for (u32 dest : op_branch_targets(m_pos, spu_opcode_t{op}))
|
||||
{
|
||||
branches_back = branches_back || dest == baddr;
|
||||
}
|
||||
|
||||
if (!branches_back)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
iteration_emit++;
|
||||
|
||||
if (iteration_emit < 2)
|
||||
{
|
||||
// Reset mpos (with fixup)
|
||||
m_pos = baddr - 4;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Optimization block body
|
||||
const auto block_inner = m_ir->GetInsertBlock();
|
||||
|
||||
std::array<llvm::Value*, s_reg_max> block_reg_results{};
|
||||
|
||||
for (u32 i = 0; i < s_reg_max; i++)
|
||||
{
|
||||
if (auto phi = reduced_loop_phi_nodes[i])
|
||||
{
|
||||
const auto type = phi->getType() == get_type<f64[4]>() ? get_type<f64[4]>() : get_reg_type(i);
|
||||
block_reg_results[i] = ensure(get_reg_fixed(i, type));
|
||||
phi->addIncoming(block_reg_results[i], block_inner);
|
||||
}
|
||||
}
|
||||
|
||||
ensure(!!m_block->reg[reduced_loop_info->cond_val_register_idx]);
|
||||
make_reduced_loop_condition(block_optimization_inner, true, 2);
|
||||
m_ir->SetInsertPoint(block_optimization_next);
|
||||
m_block->block_wide_reg_store_elimination = false;
|
||||
|
||||
for (u32 i = 0; i < s_reg_max; i++)
|
||||
{
|
||||
if (const auto loop_value = block_reg_results[i])
|
||||
{
|
||||
const auto phi = m_ir->CreatePHI(loop_value->getType(), 2, fmt::format("redres_0x%05x_r%u", baddr, i));
|
||||
|
||||
phi->addIncoming(loop_value, block_inner);
|
||||
phi->addIncoming(reduced_loop_init_regs[i], block_optimization_phi_parent);
|
||||
m_block->reg[i] = phi;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (!op)
|
||||
{
|
||||
fmt::throw_exception("LLVM: Reduced Loop Pattern: [%s] Unexpected fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_hash, m_pos, m_entry, m_function_queue[0]);
|
||||
}
|
||||
|
||||
const auto [reg_rt, reg_access, masked_op] = op_register_targets(m_pos, spu_opcode_t{op});
|
||||
|
||||
bool erased = false;
|
||||
|
||||
const auto inst_times = std::array<u32, 3>{reg_states[reg_access[0]], reg_states[reg_access[1]], reg_states[reg_access[2]]};
|
||||
|
||||
// Try to reuse the reult of the previous iteration (if argumnent registers have not been modified)
|
||||
if (reg_rt < 128 && masked_times.count(masked_op) && masked_times[masked_op].first && m_inst_attrs[(m_pos - start) / 4] == inst_attr::none)
|
||||
{
|
||||
auto times = masked_times[masked_op].second;
|
||||
|
||||
bool is_ok = true;
|
||||
for (u32 regi = 0; regi < 3; regi++)
|
||||
{
|
||||
if (reg_access[regi] < 128 && times[regi] != inst_times[regi])
|
||||
{
|
||||
is_ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_ok)
|
||||
{
|
||||
m_block->reg[reg_rt] = masked_times[masked_op].first;
|
||||
erased = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (reg_rt < 128)
|
||||
{
|
||||
reg_states[reg_rt] = s_reg_state++;
|
||||
}
|
||||
|
||||
if (erased)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
m_next_op = 0;
|
||||
|
||||
masked_times[masked_op] = {};
|
||||
|
||||
switch (m_inst_attrs[(m_pos - start) / 4])
|
||||
{
|
||||
case inst_attr::putllc0:
|
||||
{
|
||||
putllc0_pattern(func, m_patterns.at(m_pos - start).info);
|
||||
continue;
|
||||
}
|
||||
case inst_attr::putllc16:
|
||||
{
|
||||
putllc16_pattern(func, m_patterns.at(m_pos - start).info);
|
||||
continue;
|
||||
}
|
||||
case inst_attr::omit:
|
||||
{
|
||||
// TODO
|
||||
continue;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
|
||||
// Execute recompiler function (TODO)
|
||||
(this->*decode(op))({op});
|
||||
|
||||
if (reg_rt < 128 && itype & spu_itype::pure && reg_rt != reg_access[0] && reg_rt != reg_access[1] && reg_rt != reg_access[2])
|
||||
{
|
||||
masked_times[masked_op] = {ensure(m_block->reg[reg_rt]), inst_times};
|
||||
}
|
||||
}
|
||||
|
||||
// Emit instructions
|
||||
for (m_pos = baddr; m_pos >= start && m_pos < end && !m_ir->GetInsertBlock()->getTerminator(); m_pos += 4)
|
||||
{
|
||||
@ -7824,13 +8354,51 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
value_t<u64> addr = eval(zext<u64>(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
|
||||
const auto a = get_vr(op.ra);
|
||||
|
||||
if (auto [ok, x, y] = match_expr(a, match<u32[4]>() + match<u32[4]>()); ok)
|
||||
{
|
||||
if (auto [ok1, data] = get_const_vector(x.value, m_pos + 1); ok1 && data._u32[3] % 16 == 0)
|
||||
{
|
||||
value_t<u64> addr = eval(zext<u64>(extract(y, 3) & 0x3fff0) + ((get_imm<u64>(op.si10) << 4) + splat<u64>(data._u32[3] & 0x3fff0)));
|
||||
make_store_ls(addr, get_vr<u8[16]>(op.rt));
|
||||
return;
|
||||
}
|
||||
|
||||
if (auto [ok2, data] = get_const_vector(y.value, m_pos + 2); ok2 && data._u32[3] % 16 == 0)
|
||||
{
|
||||
value_t<u64> addr = eval(zext<u64>(extract(x, 3) & 0x3fff0) + ((get_imm<u64>(op.si10) << 4) + splat<u64>(data._u32[3] & 0x3fff0)));
|
||||
make_store_ls(addr, get_vr<u8[16]>(op.rt));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
value_t<u64> addr = eval(zext<u64>(extract(a, 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
|
||||
make_store_ls(addr, get_vr<u8[16]>(op.rt));
|
||||
}
|
||||
|
||||
void LQD(spu_opcode_t op)
|
||||
{
|
||||
value_t<u64> addr = eval(zext<u64>(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
|
||||
const auto a = get_vr(op.ra);
|
||||
|
||||
if (auto [ok, x1, y1] = match_expr(a, match<u32[4]>() + match<u32[4]>()); ok)
|
||||
{
|
||||
if (auto [ok1, data] = get_const_vector(x1.value, m_pos + 1); ok1 && data._u32[3] % 16 == 0)
|
||||
{
|
||||
value_t<u64> addr = eval(zext<u64>(extract(y1, 3) & 0x3fff0) + ((get_imm<u64>(op.si10) << 4) + splat<u64>(data._u32[3] & 0x3fff0)));
|
||||
set_vr(op.rt, make_load_ls(addr));
|
||||
return;
|
||||
}
|
||||
|
||||
if (auto [ok2, data] = get_const_vector(y1.value, m_pos + 2); ok2 && data._u32[3] % 16 == 0)
|
||||
{
|
||||
value_t<u64> addr = eval(zext<u64>(extract(x1, 3) & 0x3fff0) + ((get_imm<u64>(op.si10) << 4) + splat<u64>(data._u32[3] & 0x3fff0)));
|
||||
set_vr(op.rt, make_load_ls(addr));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
value_t<u64> addr = eval(zext<u64>(extract(a, 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
|
||||
set_vr(op.rt, make_load_ls(addr));
|
||||
}
|
||||
|
||||
|
||||
@ -56,6 +56,7 @@ constexpr u32 spu_decode(u32 inst)
|
||||
}
|
||||
|
||||
std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op);
|
||||
std::tuple<u32, std::array<u32, 3>, u32> op_register_targets(u32 /*pc*/, spu_opcode_t op);
|
||||
|
||||
// SPU decoder object. D provides functions. T is function pointer type returned.
|
||||
template <typename D, typename T = decltype(&D::UNK)>
|
||||
|
||||
@ -495,7 +495,8 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write);
|
||||
|
||||
extern thread_local u64 g_tls_fault_spu;
|
||||
|
||||
const spu_decoder<spu_itype> s_spu_itype;
|
||||
const extern spu_decoder<spu_itype> g_spu_itype;
|
||||
const extern spu_decoder<spu_iflag> g_spu_iflag;
|
||||
|
||||
namespace vm
|
||||
{
|
||||
@ -598,7 +599,7 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
|
||||
{
|
||||
std::array<u32, 2> res{spu_branch_target(pc + 4), umax};
|
||||
|
||||
switch (const auto type = s_spu_itype.decode(op.opcode))
|
||||
switch (const auto type = g_spu_itype.decode(op.opcode))
|
||||
{
|
||||
case spu_itype::BR:
|
||||
case spu_itype::BRA:
|
||||
@ -639,6 +640,54 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
|
||||
return res;
|
||||
}
|
||||
|
||||
std::tuple<u32, std::array<u32, 3>, u32> op_register_targets(u32 /*pc*/, spu_opcode_t op)
|
||||
{
|
||||
std::tuple<u32, std::array<u32, 3>, u32> result{u32{umax}, std::array<u32, 3>{128, 128, 128}, op.opcode};
|
||||
|
||||
const auto type = g_spu_itype.decode(op.opcode);
|
||||
|
||||
if (type & spu_itype::zregmod)
|
||||
{
|
||||
std::get<2>(result) = 0;
|
||||
return result;
|
||||
}
|
||||
|
||||
std::get<0>(result) = type & spu_itype::_quadrop ? op.rt4 : op.rt;
|
||||
|
||||
spu_opcode_t op_masked = op;
|
||||
|
||||
if (type & spu_itype::_quadrop)
|
||||
{
|
||||
op_masked.rt4 = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
op_masked.rt = 0;
|
||||
}
|
||||
|
||||
std::get<2>(result) = op_masked.opcode;
|
||||
|
||||
if (auto iflags = g_spu_iflag.decode(op.opcode))
|
||||
{
|
||||
if (+iflags & +spu_iflag::use_ra)
|
||||
{
|
||||
std::get<1>(result)[0] = op.ra;
|
||||
}
|
||||
|
||||
if (+iflags & +spu_iflag::use_rb)
|
||||
{
|
||||
std::get<1>(result)[1] = op.rb;
|
||||
}
|
||||
|
||||
if (+iflags & +spu_iflag::use_rc)
|
||||
{
|
||||
std::get<1>(result)[2] = op.rc;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void spu_int_ctrl_t::set(u64 ints)
|
||||
{
|
||||
// leave only enabled interrupts
|
||||
@ -988,7 +1037,7 @@ std::vector<std::pair<u32, u32>> spu_thread::dump_callstack_list() const
|
||||
passed[i / 4] = true;
|
||||
|
||||
const spu_opcode_t op{_ref<u32>(i)};
|
||||
const auto type = s_spu_itype.decode(op.opcode);
|
||||
const auto type = g_spu_itype.decode(op.opcode);
|
||||
|
||||
if (start == 0 && type == spu_itype::STQD && op.ra == 1u && op.rt == 0u)
|
||||
{
|
||||
@ -3761,7 +3810,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span<const u8> ls_ptr, u32 base_add
|
||||
|
||||
const u32 addr0 = spu_branch_target(addr);
|
||||
const spu_opcode_t op{read_from_ptr<be_t<u32>>(ls_ptr, addr0 - base_addr)};
|
||||
const auto type = s_spu_itype.decode(op.opcode);
|
||||
const auto type = g_spu_itype.decode(op.opcode);
|
||||
|
||||
if (type == spu_itype::UNK || !op.opcode)
|
||||
{
|
||||
@ -3907,7 +3956,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span<const u8> ls_ptr, u32 base_add
|
||||
// Test the validity of a single instruction of the optional target
|
||||
// This function can't be too slow and is unlikely to improve results by a great deal
|
||||
const u32 op0 = read_from_ptr<be_t<u32>>(ls_ptr, route_pc - base_addr);
|
||||
const spu_itype::type type0 = s_spu_itype.decode(op0);
|
||||
const spu_itype::type type0 = g_spu_itype.decode(op0);
|
||||
|
||||
if (type0 == spu_itype::UNK || !op0)
|
||||
{
|
||||
@ -6878,7 +6927,7 @@ spu_exec_object spu_thread::capture_memory_as_elf(std::span<spu_memory_segment_d
|
||||
const u32 op = read_from_ptr<be_t<u32>>(all_data, pc0 - 4);
|
||||
|
||||
// Try to find function entry (if they are placed sequentially search for BI $LR of previous function)
|
||||
if (!op || op == 0x35000000u || s_spu_itype.decode(op) == spu_itype::UNK)
|
||||
if (!op || op == 0x35000000u || g_spu_itype.decode(op) == spu_itype::UNK)
|
||||
{
|
||||
if (is_exec_code(pc0, { all_data.data(), SPU_LS_SIZE }))
|
||||
break;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user