From 0a12e482f4e7c24313b4751da2eeabfbde96d641 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Tue, 10 Feb 2026 18:45:17 +0200 Subject: [PATCH] SPU LLVM: Implement reduced loop --- rpcs3/Emu/CPU/CPUTranslator.h | 9 + rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 2 + rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 576 ++++++++++++++++++++++++- rpcs3/Emu/Cell/SPUOpcodes.h | 1 + rpcs3/Emu/Cell/SPUThread.cpp | 61 ++- 5 files changed, 639 insertions(+), 10 deletions(-) diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 9b9804fd39..4a0028c571 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -3951,6 +3951,15 @@ public: erase_stores({args.value...}); } + // Debug breakpoint + void debugtrap() + { + const auto _rty = llvm::Type::getVoidTy(m_context); + const auto type = llvm::FunctionType::get(_rty, {}, false); + const auto func = llvm::cast(m_ir->GetInsertBlock()->getParent()->getParent()->getOrInsertFunction("llvm.debugtrap", type).getCallee()); + m_ir->CreateCall(func); + } + template static auto pshufb(T&& a, U&& b) { diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 4b52ce8c43..eff272753a 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -8544,6 +8544,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (inst_attr attr = m_inst_attrs[(loop_pc - entry_point) / 4]; attr == inst_attr::none) { + const u64 hash = loop_pc / 4 + read_from_ptr>(func_hash.data()); + add_pattern(inst_attr::reduced_loop, loop_pc - result.entry_point, 0, std::make_shared(pattern)); std::string regs = "{"; diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 856a039e5e..103f48085d 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -139,7 +139,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator spu_recompiler_base::block_info* bb{}; // Current block's entry block - llvm::BasicBlock* block; + llvm::BasicBlock* block{}; // Final block (for PHI nodes, set after completion) llvm::BasicBlock* block_end{}; @@ -155,6 +155,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Store instructions std::array store{}; + bool block_wide_reg_store_elimination = false; // Store reordering/elimination protection std::array store_context_last_id = fill_array(0); // Protects against illegal forward ordering @@ -364,7 +365,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { if (i != s_reg_lr && i != s_reg_sp && (i < s_reg_80 || i > s_reg_127)) { - m_block->reg[i] = m_ir->CreateLoad(get_reg_type(i), init_reg_fixed(i)); + m_block->reg[i] = get_reg_fixed(i, get_reg_type(i)); } } @@ -709,6 +710,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!reg) { + if (m_block && m_block->block_wide_reg_store_elimination) + { + fmt::throw_exception("Unexpected load: [%s] at 0x%x (gpr=r%d)", m_hash, m_pos, index); + } + // Load register value if necessary reg = m_finfo && m_finfo->load[index] ? m_finfo->load[index] : m_ir->CreateLoad(get_reg_type(index), init_reg_fixed(index)); } @@ -920,6 +926,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (m_block) { + if (m_block->block_wide_reg_store_elimination) + { + // Don't save registers for the current block iteration + // Affected optimizations: + // 1. Single-block reduced loop + return; + } + // Keep the store's location in history of gpr preservaions m_block->store_context_last_id[index] = m_block->store_context_ctr[index]; m_block->store_context_first_id[index] = std::min(m_block->store_context_first_id[index], m_block->store_context_ctr[index]); @@ -2059,6 +2073,43 @@ public: bool need_check = false; m_block->bb = &bb; + // [1gJ45f2-0x00a40]: 16.4982% (113258) + // [ZsQTud1-0x0924c]: 6.1202% (42014) + // [ZsQTud1-0x08e54]: 5.6610% (38862) + // [0000000-0x3fffc]: 4.3764% (30043) + // [Zh4tpJM-0x00bcc]: 3.7908% (26023) + // [CFt8hXu-0x063b8]: 3.6177% (24835) + // [8YJCUjv-0x0ad18]: 3.2417% (22254) + // [Try3XHn-0x0f018]: 2.3721% (16284) + // [s6ti9iu-0x07678]: 1.8464% (12675) + // [oyxkAPv-0x0c22c]: 1.7776% (12203) + // [Q0jLqH4-0x00324]: 1.6015% (10994) + static const std::array, 4> to_nop + { + { } + }; + + bool found_block = false; + + for (auto& [hash, pos] : to_nop) + { + if (m_hash.find(hash) <= 2 && baddr == pos) + { + found_block = true; + break; + } + } + + if (found_block) + { + for (u32 i = 0; i < 100; i++) + { + auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::last_getllar_lsa)); + auto mod_val = m_ir->CreateFDiv(value, llvm::ConstantFP::get(value->getType(), 1.1 + i)); + m_ir->CreateStore(value, spu_ptr(&spu_thread::last_getllar_lsa)); + } + } + if (!bb.preds.empty()) { // Initialize registers and build PHI nodes if necessary @@ -2174,6 +2225,485 @@ public: check_state(baddr); } + const bool is_reduced_loop = m_inst_attrs[(baddr - start) / 4] == inst_attr::reduced_loop; + const auto reduced_loop_info = is_reduced_loop ? std::static_pointer_cast(ensure(m_patterns.at(baddr - start).info_ptr)) : nullptr; + + BasicBlock* block_optimization_phi_parent = nullptr; + const auto block_optimization_inner = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b-loop-it-0x%x", m_pos), m_function) : nullptr; + const auto block_optimization_exit_early = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b-loop-exit-0x%x", m_pos), m_function) : nullptr; + const auto block_optimization_next = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b2-0x%x", m_pos), m_function) : nullptr; + + std::array reduced_loop_phi_nodes{}; + std::array reduced_loop_init_regs{}; + + auto make_reduced_loop_condition = [&](llvm::BasicBlock* optimization_block, bool is_second_time, u32 reserve_iterations) + { + llvm::ICmpInst::Predicate compare{}; + + switch (reduced_loop_info->cond_val_compare) + { + case CMP_SLESS: compare = ICmpInst::ICMP_SLT; break; + case CMP_SGREATER: compare = ICmpInst::ICMP_SGT; break; + case CMP_EQUAL: compare = ICmpInst::ICMP_EQ; break; + case CMP_LLESS: compare = ICmpInst::ICMP_ULT; break; + case CMP_LGREATER: compare = ICmpInst::ICMP_UGT; break; + case CMP_SGREATER_EQUAL: compare = ICmpInst::ICMP_SGE; break; + case CMP_SLOWER_EQUAL: compare = ICmpInst::ICMP_SLE; break; + case CMP_NOT_EQUAL: compare = ICmpInst::ICMP_NE; break; + case CMP_LGREATER_EQUAL: compare = ICmpInst::ICMP_UGE; break; + case CMP_LLOWER_EQUAL: compare = ICmpInst::ICMP_ULE; break; + { + break; + } + case CMP_UNKNOWN: + case CMP_NOT_EQUAL2: + case CMP_EQUAL2: + default: + { + ensure(false); + break; + } + } + + llvm::Value* loop_dictator_before_adjustment{}; + llvm::Value* loop_dictator_after_adjustment{}; + + spu_opcode_t reg_target{}; + reg_target.rt = reduced_loop_info->cond_val_register_idx; + + if (reg_target.rt != reduced_loop_info->cond_val_register_idx) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition register index: 0x%llx", reduced_loop_info->cond_val_register_idx); + } + + if (!m_block->reg[reg_target.rt]) + { + m_block->reg[reg_target.rt] = reduced_loop_init_regs[reg_target.rt]; + } + + switch (reduced_loop_info->cond_val_mask) + { + case u8{umax}: + { + loop_dictator_before_adjustment = get_scalar(get_vr(reg_target.rt)).eval(m_ir); + break; + } + case u16{umax}: + { + loop_dictator_before_adjustment = get_scalar(get_vr(reg_target.rt)).eval(m_ir); + break; + } + case u32{umax}: + { + loop_dictator_before_adjustment = get_scalar(get_vr(reg_target.rt)).eval(m_ir); + break; + } + case u64{umax}: + { + ensure(false); // TODO + loop_dictator_before_adjustment = get_scalar(get_vr(reg_target.rt)).eval(m_ir); + break; + } + default: + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition bit mask: 0x%llx", reduced_loop_info->cond_val_mask); + } + } + + const u32 type_bits = std::popcount(reduced_loop_info->cond_val_mask); + + llvm::Value* cond_val_incr = nullptr; + + if (reduced_loop_info->cond_val_incr_is_immediate) + { + cond_val_incr = m_ir->getIntN(type_bits, reduced_loop_info->cond_val_incr & reduced_loop_info->cond_val_mask); + } + else + { + spu_opcode_t reg_incr{}; + reg_incr.rt = reduced_loop_info->cond_val_incr; + + if (reg_incr.rt != reduced_loop_info->cond_val_incr) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal increment arguemnt register index: 0x%llx", reduced_loop_info->cond_val_incr); + } + switch (reduced_loop_info->cond_val_mask) + { + case u8{umax}: + { + cond_val_incr = get_scalar(get_vr(reg_incr.rt)).eval(m_ir); + break; + } + case u16{umax}: + { + cond_val_incr = get_scalar(get_vr(reg_incr.rt)).eval(m_ir); + break; + } + case u32{umax}: + { + cond_val_incr = get_scalar(get_vr(reg_incr.rt)).eval(m_ir); + break; + } + case u64{umax}: + { + ensure(false); // TODO + cond_val_incr = get_scalar(get_vr(reg_incr.rt)).eval(m_ir); + break; + } + } + } + + if (reduced_loop_info->cond_val_incr_before_cond && !reduced_loop_info->cond_val_incr_before_cond_taken_in_account) + { + loop_dictator_after_adjustment = m_ir->CreateAdd(loop_dictator_before_adjustment, cond_val_incr); + } + else + { + loop_dictator_after_adjustment = loop_dictator_before_adjustment; + } + + llvm::Value* loop_argument = nullptr; + + if (reduced_loop_info->cond_val_is_immediate) + { + loop_argument = m_ir->CreateTrunc(m_ir->getInt64(reduced_loop_info->cond_val_min & reduced_loop_info->cond_val_mask), loop_dictator_before_adjustment->getType()); + } + else + { + spu_opcode_t reg_target2{}; + reg_target2.rt = reduced_loop_info->cond_val_register_argument_idx; + + if (reg_target2.rt != reduced_loop_info->cond_val_register_argument_idx) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition arguemnt register index: 0x%llx", reduced_loop_info->cond_val_register_argument_idx); + } + + switch (reduced_loop_info->cond_val_mask) + { + case u8{umax}: + { + loop_argument = get_scalar(get_vr(reg_target2.rt)).eval(m_ir); + break; + } + case u16{umax}: + { + loop_argument = get_scalar(get_vr(reg_target2.rt)).eval(m_ir); + break; + } + case u32{umax}: + { + loop_argument = get_scalar(get_vr(reg_target2.rt)).eval(m_ir); + break; + } + case u64{umax}: + { + ensure(false); // TODO + loop_argument = get_scalar(get_vr(reg_target2.rt)).eval(m_ir); + break; + } + } + } + + llvm::Value* condition = nullptr; + + if (reserve_iterations == 1) + { + condition = m_ir->CreateICmp(compare, loop_dictator_after_adjustment, loop_argument); + } + // else if ((reduced_loop_info->cond_val_compare == CMP_LGREATER || (reduced_loop_info->cond_val_compare == CMP_LGREATER_EQUAL && reduced_loop_info->cond_val_is_immediate && reduced_loop_info->cond_val_incr)) && cond_val_incr->getSExtValue() < 0) + // { + // const auto cond_val_incr_multiplied = m_ir->CreateMul(cond_val_incr, reserve_iterations - 1); + // condition = m_ir->CreateICmp(compare, select(m_ir->CreateICmpUGE(cond_val_incr_multiplied, loop_dictator_after_adjustment), m_ir->CreateAdd(loop_dictator_after_adjustment, cond_val_incr_multiplied), m_ir->getIntN(type_bits, 0)), loop_argument); + // } + else + { + //debugtrap(); + + llvm::Value* prev_it = loop_dictator_after_adjustment; + + for (u32 i = 0; i < reserve_iterations; i++) + { + if (i) + { + prev_it = m_ir->CreateAdd(prev_it, cond_val_incr); + } + + const auto also_cond = m_ir->CreateICmp(compare, prev_it, loop_argument); + condition = condition ? m_ir->CreateAnd(condition, also_cond) : also_cond; + } + } + + if (!is_second_time) + { + for (u32 i = 0, count = 0, prev_i = umax;; i++) + { + const bool is_last = !(count <= 20 && i < s_reg_max); + + if (is_last || m_block->is_gpr_not_NaN_hint(i)) + { + count++; + + if (prev_i == umax) + { + if (!is_last) + { + prev_i = i; + continue; + } + + break; + } + + auto access_gpr = [&](u32 index) + { + spu_opcode_t op_arg{}; + op_arg.ra = index; + return get_vr(op_arg.ra); + }; + + // OR LSB to convert infinity to NaN + llvm::Value* arg1 = bitcast(access_gpr(prev_i) | splat(1)).eval(m_ir); + llvm::Value* arg2 = is_last ? arg1 : bitcast(access_gpr(i) | splat(1)).eval(m_ir); + + llvm::Value* acc = m_ir->CreateSExt(m_ir->CreateFCmpUNO(arg1, arg2), get_type()); + + // Pattern for PTEST + acc = m_ir->CreateBitCast(acc, get_type()); + + llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); + + for (u64 i = 1; i < 2; i++) + { + elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); + } + + // Compare result with zero + const auto cond_nans = m_ir->CreateICmpEQ(elem, m_ir->getInt64(0)); + condition = m_ir->CreateAnd(cond_nans, condition); + prev_i = umax; + } + } + } + + //condition = m_ir->getInt1(0); + + m_ir->CreateCondBr(condition, optimization_block, block_optimization_next); + }; + + if (is_reduced_loop) + { + for (u32 i = 0; i < s_reg_max; i++) + { + llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type() : get_reg_type(i); + + if (i < reduced_loop_info->loop_dicts.size() && (reduced_loop_info->loop_dicts.test(i) || reduced_loop_info->loop_writes.test(i))) + { + // Connect registers which are used and then modified by the block + auto value = m_block->reg[i]; + + if (!value || value->getType() != type) + { + value = get_reg_fixed(i, type); + } + + reduced_loop_init_regs[i] = value; + } + else if (i < reduced_loop_info->loop_dicts.size() && reduced_loop_info->loop_args.test(i)) + { + // Load registers used as arguments of the loop + if (!m_block->reg[i]) + { + m_block->reg[i] = get_reg_fixed(i, type); + } + } + } + + const auto prev_insert_block = m_ir->GetInsertBlock(); + + block_optimization_phi_parent = prev_insert_block; + + make_reduced_loop_condition(block_optimization_inner, false, 2); + m_ir->SetInsertPoint(block_optimization_inner); + + for (u32 i = 0; i < s_reg_max; i++) + { + if (auto init_val = reduced_loop_init_regs[i]) + { + llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type() : get_reg_type(i); + + const auto _phi = m_ir->CreatePHI(init_val->getType(), 2, fmt::format("reduced_0x%05x_r%u", baddr, i)); + _phi->addIncoming(init_val, prev_insert_block); + + reduced_loop_phi_nodes[i] = _phi; + m_block->reg[i] = _phi; + } + } + + m_block->block_wide_reg_store_elimination = true; + } + + // Instructions emitting optimizations: Loop iteration is not the last + m_pos = baddr; + + // Masked opcodde -> register modification times + std::map>> masked_times; + std::array reg_states{}; + u32 s_reg_state{1}; + + for (u32 iteration_emit = 0; is_reduced_loop; m_pos += 4) + { + if (m_pos != baddr && m_block_info[m_pos / 4] && reduced_loop_info->loop_end < m_pos) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(1) too early at 0x%x", m_pos); + } + + if (!(m_pos >= start && m_pos < end)) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(2) too early at 0x%x", m_pos); + } + + if (m_ir->GetInsertBlock()->getTerminator()) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(3) too early at 0x%x", m_pos); + } + + const u32 op = std::bit_cast>(func.data[(m_pos - start) / 4]); + const auto itype = g_spu_itype.decode(op); + + if (itype & spu_itype::branch) + { + bool branches_back = false; + + for (u32 dest : op_branch_targets(m_pos, spu_opcode_t{op})) + { + branches_back = branches_back || dest == baddr; + } + + if (!branches_back) + { + continue; + } + + iteration_emit++; + + if (iteration_emit < 2) + { + // Reset mpos (with fixup) + m_pos = baddr - 4; + continue; + } + + // Optimization block body + const auto block_inner = m_ir->GetInsertBlock(); + + std::array block_reg_results{}; + + for (u32 i = 0; i < s_reg_max; i++) + { + if (auto phi = reduced_loop_phi_nodes[i]) + { + const auto type = phi->getType() == get_type() ? get_type() : get_reg_type(i); + block_reg_results[i] = ensure(get_reg_fixed(i, type)); + phi->addIncoming(block_reg_results[i], block_inner); + } + } + + ensure(!!m_block->reg[reduced_loop_info->cond_val_register_idx]); + make_reduced_loop_condition(block_optimization_inner, true, 2); + m_ir->SetInsertPoint(block_optimization_next); + m_block->block_wide_reg_store_elimination = false; + + for (u32 i = 0; i < s_reg_max; i++) + { + if (const auto loop_value = block_reg_results[i]) + { + const auto phi = m_ir->CreatePHI(loop_value->getType(), 2, fmt::format("redres_0x%05x_r%u", baddr, i)); + + phi->addIncoming(loop_value, block_inner); + phi->addIncoming(reduced_loop_init_regs[i], block_optimization_phi_parent); + m_block->reg[i] = phi; + } + } + + + break; + } + + if (!op) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: [%s] Unexpected fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_hash, m_pos, m_entry, m_function_queue[0]); + } + + const auto [reg_rt, reg_access, masked_op] = op_register_targets(m_pos, spu_opcode_t{op}); + + bool erased = false; + + const auto inst_times = std::array{reg_states[reg_access[0]], reg_states[reg_access[1]], reg_states[reg_access[2]]}; + + // Try to reuse the reult of the previous iteration (if argumnent registers have not been modified) + if (reg_rt < 128 && masked_times.count(masked_op) && masked_times[masked_op].first && m_inst_attrs[(m_pos - start) / 4] == inst_attr::none) + { + auto times = masked_times[masked_op].second; + + bool is_ok = true; + for (u32 regi = 0; regi < 3; regi++) + { + if (reg_access[regi] < 128 && times[regi] != inst_times[regi]) + { + is_ok = false; + } + } + + if (is_ok) + { + m_block->reg[reg_rt] = masked_times[masked_op].first; + erased = true; + } + } + + if (reg_rt < 128) + { + reg_states[reg_rt] = s_reg_state++; + } + + if (erased) + { + continue; + } + + m_next_op = 0; + + masked_times[masked_op] = {}; + + switch (m_inst_attrs[(m_pos - start) / 4]) + { + case inst_attr::putllc0: + { + putllc0_pattern(func, m_patterns.at(m_pos - start).info); + continue; + } + case inst_attr::putllc16: + { + putllc16_pattern(func, m_patterns.at(m_pos - start).info); + continue; + } + case inst_attr::omit: + { + // TODO + continue; + } + default: break; + } + + // Execute recompiler function (TODO) + (this->*decode(op))({op}); + + if (reg_rt < 128 && itype & spu_itype::pure && reg_rt != reg_access[0] && reg_rt != reg_access[1] && reg_rt != reg_access[2]) + { + masked_times[masked_op] = {ensure(m_block->reg[reg_rt]), inst_times}; + } + } + // Emit instructions for (m_pos = baddr; m_pos >= start && m_pos < end && !m_ir->GetInsertBlock()->getTerminator(); m_pos += 4) { @@ -7824,13 +8354,51 @@ public: } } - value_t addr = eval(zext(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm(op.si10) << 4)); + const auto a = get_vr(op.ra); + + if (auto [ok, x, y] = match_expr(a, match() + match()); ok) + { + if (auto [ok1, data] = get_const_vector(x.value, m_pos + 1); ok1 && data._u32[3] % 16 == 0) + { + value_t addr = eval(zext(extract(y, 3) & 0x3fff0) + ((get_imm(op.si10) << 4) + splat(data._u32[3] & 0x3fff0))); + make_store_ls(addr, get_vr(op.rt)); + return; + } + + if (auto [ok2, data] = get_const_vector(y.value, m_pos + 2); ok2 && data._u32[3] % 16 == 0) + { + value_t addr = eval(zext(extract(x, 3) & 0x3fff0) + ((get_imm(op.si10) << 4) + splat(data._u32[3] & 0x3fff0))); + make_store_ls(addr, get_vr(op.rt)); + return; + } + } + + value_t addr = eval(zext(extract(a, 3) & 0x3fff0) + (get_imm(op.si10) << 4)); make_store_ls(addr, get_vr(op.rt)); } void LQD(spu_opcode_t op) { - value_t addr = eval(zext(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm(op.si10) << 4)); + const auto a = get_vr(op.ra); + + if (auto [ok, x1, y1] = match_expr(a, match() + match()); ok) + { + if (auto [ok1, data] = get_const_vector(x1.value, m_pos + 1); ok1 && data._u32[3] % 16 == 0) + { + value_t addr = eval(zext(extract(y1, 3) & 0x3fff0) + ((get_imm(op.si10) << 4) + splat(data._u32[3] & 0x3fff0))); + set_vr(op.rt, make_load_ls(addr)); + return; + } + + if (auto [ok2, data] = get_const_vector(y1.value, m_pos + 2); ok2 && data._u32[3] % 16 == 0) + { + value_t addr = eval(zext(extract(x1, 3) & 0x3fff0) + ((get_imm(op.si10) << 4) + splat(data._u32[3] & 0x3fff0))); + set_vr(op.rt, make_load_ls(addr)); + return; + } + } + + value_t addr = eval(zext(extract(a, 3) & 0x3fff0) + (get_imm(op.si10) << 4)); set_vr(op.rt, make_load_ls(addr)); } diff --git a/rpcs3/Emu/Cell/SPUOpcodes.h b/rpcs3/Emu/Cell/SPUOpcodes.h index 93e56e48da..42d76792a2 100644 --- a/rpcs3/Emu/Cell/SPUOpcodes.h +++ b/rpcs3/Emu/Cell/SPUOpcodes.h @@ -56,6 +56,7 @@ constexpr u32 spu_decode(u32 inst) } std::array op_branch_targets(u32 pc, spu_opcode_t op); +std::tuple, u32> op_register_targets(u32 /*pc*/, spu_opcode_t op); // SPU decoder object. D provides functions. T is function pointer type returned. template diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index b9a77d7696..e4346bc3df 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -495,7 +495,8 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write); extern thread_local u64 g_tls_fault_spu; -const spu_decoder s_spu_itype; +const extern spu_decoder g_spu_itype; +const extern spu_decoder g_spu_iflag; namespace vm { @@ -598,7 +599,7 @@ std::array op_branch_targets(u32 pc, spu_opcode_t op) { std::array res{spu_branch_target(pc + 4), umax}; - switch (const auto type = s_spu_itype.decode(op.opcode)) + switch (const auto type = g_spu_itype.decode(op.opcode)) { case spu_itype::BR: case spu_itype::BRA: @@ -639,6 +640,54 @@ std::array op_branch_targets(u32 pc, spu_opcode_t op) return res; } +std::tuple, u32> op_register_targets(u32 /*pc*/, spu_opcode_t op) +{ + std::tuple, u32> result{u32{umax}, std::array{128, 128, 128}, op.opcode}; + + const auto type = g_spu_itype.decode(op.opcode); + + if (type & spu_itype::zregmod) + { + std::get<2>(result) = 0; + return result; + } + + std::get<0>(result) = type & spu_itype::_quadrop ? op.rt4 : op.rt; + + spu_opcode_t op_masked = op; + + if (type & spu_itype::_quadrop) + { + op_masked.rt4 = 0; + } + else + { + op_masked.rt = 0; + } + + std::get<2>(result) = op_masked.opcode; + + if (auto iflags = g_spu_iflag.decode(op.opcode)) + { + if (+iflags & +spu_iflag::use_ra) + { + std::get<1>(result)[0] = op.ra; + } + + if (+iflags & +spu_iflag::use_rb) + { + std::get<1>(result)[1] = op.rb; + } + + if (+iflags & +spu_iflag::use_rc) + { + std::get<1>(result)[2] = op.rc; + } + } + + return result; +} + void spu_int_ctrl_t::set(u64 ints) { // leave only enabled interrupts @@ -988,7 +1037,7 @@ std::vector> spu_thread::dump_callstack_list() const passed[i / 4] = true; const spu_opcode_t op{_ref(i)}; - const auto type = s_spu_itype.decode(op.opcode); + const auto type = g_spu_itype.decode(op.opcode); if (start == 0 && type == spu_itype::STQD && op.ra == 1u && op.rt == 0u) { @@ -3761,7 +3810,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span ls_ptr, u32 base_add const u32 addr0 = spu_branch_target(addr); const spu_opcode_t op{read_from_ptr>(ls_ptr, addr0 - base_addr)}; - const auto type = s_spu_itype.decode(op.opcode); + const auto type = g_spu_itype.decode(op.opcode); if (type == spu_itype::UNK || !op.opcode) { @@ -3907,7 +3956,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span ls_ptr, u32 base_add // Test the validity of a single instruction of the optional target // This function can't be too slow and is unlikely to improve results by a great deal const u32 op0 = read_from_ptr>(ls_ptr, route_pc - base_addr); - const spu_itype::type type0 = s_spu_itype.decode(op0); + const spu_itype::type type0 = g_spu_itype.decode(op0); if (type0 == spu_itype::UNK || !op0) { @@ -6878,7 +6927,7 @@ spu_exec_object spu_thread::capture_memory_as_elf(std::span>(all_data, pc0 - 4); // Try to find function entry (if they are placed sequentially search for BI $LR of previous function) - if (!op || op == 0x35000000u || s_spu_itype.decode(op) == spu_itype::UNK) + if (!op || op == 0x35000000u || g_spu_itype.decode(op) == spu_itype::UNK) { if (is_exec_code(pc0, { all_data.data(), SPU_LS_SIZE })) break;