From 452a9c42c952c4f0bd192156772c89a393b9ada2 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Sat, 13 Dec 2025 17:20:07 +0200 Subject: [PATCH 01/11] SPU: Detect reduced loop --- rpcs3/Emu/Cell/SPUAnalyser.h | 4 + rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 1324 +++++++++++++++++++++++- rpcs3/Emu/Cell/SPUOpcodes.h | 14 + rpcs3/Emu/Cell/SPURecompiler.h | 426 +++++++- 4 files changed, 1726 insertions(+), 42 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUAnalyser.h b/rpcs3/Emu/Cell/SPUAnalyser.h index 123a629bed..59560cd182 100644 --- a/rpcs3/Emu/Cell/SPUAnalyser.h +++ b/rpcs3/Emu/Cell/SPUAnalyser.h @@ -303,6 +303,8 @@ struct spu_itype } }; +using spu_itype_t = spu_itype::type; + struct spu_iflag { enum @@ -528,6 +530,8 @@ struct spu_iflag } }; +using spu_iflag_t = spu_iflag::flag; + #define NAME(x) static constexpr const char& x = *#x struct spu_iname diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 1b6003036b..4b52ce8c43 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -52,6 +52,36 @@ struct span_less template inline constexpr span_less s_span_less{}; +template <> +void fmt_class_string::format(std::string& out, u64 arg) +{ + format_enum(out, arg, [](spu_recompiler_base::compare_direction arg) + { + switch (arg) + { + case spu_recompiler_base::CMP_SLESS: return "SLT"; + case spu_recompiler_base::CMP_SGREATER: return "SGT"; + case spu_recompiler_base::CMP_EQUAL: return "IEQ"; + case spu_recompiler_base::CMP_LLESS: return "ULT"; + case spu_recompiler_base::CMP_LGREATER: return "UGT"; + case spu_recompiler_base::CMP_SGREATER_EQUAL: return "SGE"; + case spu_recompiler_base::CMP_SLOWER_EQUAL: return "SLE"; + case spu_recompiler_base::CMP_NOT_EQUAL: return "INE"; + case spu_recompiler_base::CMP_LGREATER_EQUAL: return "UGE"; + case spu_recompiler_base::CMP_LLOWER_EQUAL: return "ULE"; + case spu_recompiler_base::CMP_UNKNOWN: + case spu_recompiler_base::CMP_NOT_EQUAL2: + case spu_recompiler_base::CMP_EQUAL2: + default: + { + break; + } + } + + return unknown; + }); +} + // Move 4 args for calling native function from a GHC calling convention function #if defined(ARCH_X64) static u8* move_args_ghc_to_native(u8* raw) @@ -3002,7 +3032,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s is_no_return = is_no_return || (op_next.rb >= 4 && op_next.rb < 10); } - if (type_next & spu_itype::_quadrop && +iflags & +spu_iflag::use_rc) + if (+iflags & +spu_iflag::use_rc) { is_no_return = is_no_return || (op_next.ra >= 4 && op_next.rb < 10); } @@ -3308,7 +3338,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s is_no_return = is_no_return || (op_next.rb >= 4 && op_next.rb < 10); } - if (type_next & spu_itype::_quadrop && +iflags & +spu_iflag::use_rc) + if (+iflags & +spu_iflag::use_rc) { is_no_return = is_no_return || (op_next.rc >= 4 && op_next.rc < 10); } @@ -3834,17 +3864,25 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s continue; } + bool removed = false; + for (auto it2 = it->second.begin(); it2 != it->second.end();) { if (*it2 < lsa || *it2 >= limit) { it2 = it->second.erase(it2); + removed = true; continue; } it2++; } + if (removed) + { + it->second.emplace_back(SPU_LS_SIZE); + } + it++; } @@ -4863,19 +4901,24 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s return map; }; - struct putllc16_statistics_t + struct stats_t { atomic_t all = 0; atomic_t single = 0; - atomic_t nowrite = 0; std::array, 128> breaking_reason{}; }; - struct rchcnt_statistics_t + struct putllc16_statistics_t : stats_t + { + atomic_t nowrite = 0; + }; + + struct rchcnt_statistics_t : stats_t + { + }; + + struct reduced_statistics_t : stats_t { - atomic_t all = 0; - atomic_t single = 0; - std::array, 128> breaking_reason{}; }; // Pattern structures @@ -4987,6 +5030,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s // RDCH/RCHCNT Loop analysis tracker rchcnt_loop_t rchcnt_loop{}; + reduced_loop_t reduced_loop{}; + block_reg_state_iterator(u32 _pc, usz _parent_iterator_index = umax, usz _parent_target_index = 0) noexcept : pc(_pc) , parent_iterator_index(_parent_iterator_index) @@ -4999,6 +5044,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s std::map atomic16_all; // RdAtomicStat location -> atomic loop optimization state std::map rchcnt_loop_all; // RDCH/RCHCNT location -> channel read loop optimization state + std::map reduced_loop_all; std::map getllar_starts; // True for failed loops std::map run_on_block; std::map logged_block; @@ -5007,6 +5053,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s atomic16_t dummy16{}; rchcnt_loop_t dummy_loop{}; + reduced_loop_t dummy_rloop{}; bool likely_putllc_loop = false; bool had_putllc_evaluation = false; @@ -5053,6 +5100,194 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s u32 iterator_id_alloc = 0; + auto get_block_targets = [&](u32 pc) -> std::span + { + if (m_block_info[pc / 4] && m_bbs.count(pc)) + { + return ::at32(m_bbs, pc).targets; + } + + return {}; + }; + + auto get_block_preds = [&](u32 pc) -> std::span + { + if (m_block_info[pc / 4] && m_bbs.count(pc)) + { + return ::at32(m_bbs, pc).preds; + } + + return {}; + }; + + const auto initiate_patterns = [&](block_reg_state_iterator& block_state_it, u32 bpc, bool is_multi_block) + { + // Initiate patterns (that are initiated on block start) + const auto& bb_body = ::at32(m_bbs, bpc); + + bool invalid = bb_body.size <= 2; + bool valid = true; + + u32 expected_sup_conds = 0; + u32 first_pred_of_loop = SPU_LS_SIZE; + + for (u32 pred : get_block_preds(bpc)) + { + if (is_multi_block ? pred >= bpc : pred == bpc) + { + first_pred_of_loop = std::min(pred, first_pred_of_loop); + } + } + + valid = first_pred_of_loop != SPU_LS_SIZE; + + const auto& bb_connect = ::at32(m_bbs, valid ? first_pred_of_loop : bpc); + + invalid = invalid || !valid; + valid = false; + + // Check loop connector block (must jump to block-next or to loop-start) + u32 targets_count = 0; + + for (u32 target : get_block_targets(first_pred_of_loop)) + { + valid = true; + targets_count++; + + if (first_pred_of_loop == bpc) + { + continue; + } + + if (target != bpc) + { + if (target != first_pred_of_loop + bb_connect.size * 4) + { + invalid = true; + } + } + } + + if (targets_count > 2) + { + invalid = true; + } + + const bool is_two_block_loop = targets_count == 1; + + invalid = invalid || !valid; + valid = false; + + // Check loop body block (must jump to last-block or another location) + + for (u32 block_pc = bpc; !invalid;) + { + targets_count = 0; + + const u32 cond_next = block_pc + ::at32(m_bbs, block_pc).size * 4; + valid = false; + + bool is_end = false; + + for (u32 target : get_block_targets(block_pc)) + { + targets_count++; + + if (target == cond_next) + { + // Conditional branch + valid = true; + } + + if (target <= block_pc && target > bpc) + { + // Branch backwards + invalid = true; + } + + if (target == bpc) + { + is_end = true; + } + } + + // if (bpc != block_pc) + // { + // for (u32 pred : get_block_preds(block_pc)) + // { + // if (pred < bpc || pred > first_pred_of_loop + ::at32(m_bbs, first_pred_of_loop).size * 4) + // { + // invalid = true; + // break; + // } + // } + // } + + if (targets_count > 2) + { + invalid = true; + break; + } + + if (cond_next == first_pred_of_loop && is_two_block_loop) + { + valid = true; + break; + } + + if (!valid) + { + break; + } + + if (bpc == first_pred_of_loop || is_end) + { + break; + } + + if (targets_count == 2) + { + expected_sup_conds++; + } + + block_pc = cond_next; + } + + invalid = invalid || !valid; + + if (bb_body.size > 2 && !invalid) + { + // Early filtering of false positives + const spu_opcode_t op{std::bit_cast>(::at32(result.data, (bpc - entry_point) / 4 + bb_body.size - 2))}; + const spu_opcode_t op2{std::bit_cast>(::at32(result.data, (bpc - entry_point) / 4))}; + + switch (g_spu_itype.decode(op.opcode)) + { + case spu_itype::RDCH: invalid = op.ra != SPU_RdDec; break; + case spu_itype::RCHCNT: invalid = true; break; + default: break; + } + + switch (g_spu_itype.decode(op2.opcode)) + { + case spu_itype::RDCH: invalid = invalid || op2.ra != SPU_RdDec; break; + case spu_itype::RCHCNT: invalid = true; break; + default: break; + } + } + + if (valid && !invalid && !reduced_loop_all.count(bpc) && expected_sup_conds == 0) + { + const auto reduced_loop = &block_state_it.reduced_loop; + reduced_loop->discard(); + reduced_loop->active = true; + reduced_loop->loop_pc = bpc; + reduced_loop->loop_end = first_pred_of_loop; + reduced_loop->expected_sup_conds = expected_sup_conds; + reduced_loop->is_two_block_loop = is_two_block_loop; + } + }; + for (u32 wf = 0, wi = 0, wa = entry_point, bpc = wa; wf <= 1;) { const bool is_form_block = wf == 0; @@ -5121,6 +5356,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s auto& vregs = is_form_block ? infos[bpc]->local_state : *true_state_walkby; const auto atomic16 = is_pattern_match ? &::at32(reg_state_it, wi).atomic16 : &dummy16; const auto rchcnt_loop = is_pattern_match ? &::at32(reg_state_it, wi).rchcnt_loop : &dummy_loop; + const auto reduced_loop = &::at32(reg_state_it, wi).reduced_loop; const u32 pos = wa; @@ -5244,10 +5480,71 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } }; + const auto break_reduced_loop_pattern = [&](u32 cause, reduced_loop_t previous) + { + if (previous.active && previous.loop_pc != SPU_LS_SIZE && reduced_loop_all.count(previous.loop_pc) == 0) + { + g_fxo->get().breaking_reason[cause]++; + + if (!spu_log.notice) + { + return; + } + + previous.active = false; + previous.failed = true; + + reduced_loop_all[previous.loop_pc] = previous; + + std::string break_error = fmt::format("Reduced loop pattern breakage [%x cause=%u] (read_pc=0x%x)", pos, cause, previous.loop_pc); + + const auto values = sort_breakig_reasons(g_fxo->get().breaking_reason); + + std::string tracing = "Top Breaking Reasons:"; + + usz i = 0; + usz fail_count = 0; + bool switched_to_minimal = false; + + for (auto it = values.begin(); it != values.end(); i++, it++) + { + fail_count += it->second; + + if (i >= 12) + { + continue; + } + + if (i < 8 && it->second > 1) + { + fmt::append(tracing, " [cause=%u, n=%d]", it->first, it->second); + } + else + { + if (!std::exchange(switched_to_minimal, true)) + { + fmt::append(tracing, "; More:"); + } + + fmt::append(tracing, " %u", it->first); + } + } + + fmt::append(tracing, " of %d failures", fail_count); + spu_log.notice("%s\n%s", break_error, tracing); + + std::string block_dump; + this->dump(result, block_dump, previous.loop_pc, previous.loop_end + 1); + + spu_log.notice("SPU Block Dump:\n%s", block_dump); + } + }; + const auto break_all_patterns = [&](u32 cause) { break_putllc16(cause, atomic16->discard()); break_channel_pattern(cause, rchcnt_loop->discard()); + break_reduced_loop_pattern(cause, reduced_loop->discard()); }; const auto calculate_absolute_ls_difference = [](u32 addr1, u32 addr2) @@ -5309,16 +5606,6 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s u32 stackframe_pc = SPU_LS_SIZE; usz entry_index = umax; - auto get_block_targets = [&](u32 pc) -> std::span - { - if (m_block_info[pc / 4] && m_bbs.count(pc)) - { - return m_bbs.at(pc).targets; - } - - return {}; - }; - u32 target_pc = SPU_LS_SIZE; bool insert_entry = false; bool is_code_backdoor = false; @@ -5508,7 +5795,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } } - const u32 previous_pc = m_bbs.at(reg_state_it[stackframe_it].pc).size * 4 + reg_state_it[stackframe_it].pc - 4; + const u32 previous_pc = ::at32(m_bbs, reg_state_it[stackframe_it].pc).size * 4 + reg_state_it[stackframe_it].pc - 4; bool may_return = previous_pc + 4 != entry_point + result.data.size() * 4 && (m_ret_info[(previous_pc / 4) + 1] || m_entry_info[previous_pc / 4]); @@ -5537,6 +5824,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s // Backup analyser information const auto atomic16_info = reg_state_it[stackframe_it].atomic16; const auto rchcnt_loop_info = reg_state_it[stackframe_it].rchcnt_loop; + const auto reduced_loop_info = reg_state_it[stackframe_it].reduced_loop; // Clean from the back possible because it does not affect old indices // Technically should always do a full cleanup at the moment @@ -5562,6 +5850,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s spu_log.trace("Emplacing: block_id=%d, pc=0x%x, target_it=%d/%d, new_pc=0x%x (has_it=%d)", reg_state_it[stackframe_it].iterator_id, stackframe_pc, entry_index + 1, target_size, target_pc, atomic16_info.active); auto& next = reg_state_it.emplace_back(target_pc, stackframe_it, 0); + initiate_patterns(next, target_pc, true); + if (!is_code_backdoor) { // Restore analyser information (if not an entry) @@ -5569,6 +5859,9 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (previous_pc != rchcnt_loop_info.branch_pc || target_pc == rchcnt_loop_info.branch_target) next.rchcnt_loop = rchcnt_loop_info; + + if (previous_pc + 4 == target_pc && reduced_loop_info.loop_pc != reduced_loop_info.loop_end && reduced_loop_info.active && target_pc <= reduced_loop_info.loop_end) + next.reduced_loop = reduced_loop_info; } else { @@ -5604,15 +5897,30 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (!infos.empty()) { - reg_state_it.emplace_back(::at32(infos, entry_point)->pc).iterator_id = iterator_id_alloc++;; + reg_state_it.emplace_back(::at32(infos, entry_point)->pc).iterator_id = iterator_id_alloc++; + + initiate_patterns(reg_state_it.back(), ::at32(infos, entry_point)->pc, true); } } } + const auto prev_wi = wi - 1; + if (prev_wi != umax && ::at32(reg_state_it, prev_wi).reduced_loop.active) + { + const auto reduced_loop = &::at32(reg_state_it, prev_wi).reduced_loop; + + for (const auto& [reg_num, reg] : reduced_loop->regs) + { + + } + } + if (wi < reg_state_it.size()) { wa = ::at32(reg_state_it, wi).pc; bpc = wa; + + initiate_patterns(::at32(reg_state_it, wi), bpc, false); } }; @@ -5737,7 +6045,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (!is_form_block) { // Call for external code - break_all_patterns(25); + break_putllc16(25, atomic16->discard()); + break_channel_pattern(25, rchcnt_loop->discard()); } } @@ -5762,6 +6071,143 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s const auto op = spu_opcode_t{data}; const auto type = g_spu_itype.decode(data); + if (reduced_loop->active && !(type & spu_itype::zregmod)) + { + const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt; + + u32 ra = s_reg_max, rb = s_reg_max, rc = s_reg_max; + + if (m_use_ra.test(pos / 4)) + { + ra = op.ra; + } + + if (m_use_rb.test(pos / 4)) + { + rb = op.rb; + } + + if (m_use_rc.test(pos / 4)) + { + rc = op.rc; + } + + bool is_move_register_op = false; + + switch (type) + { + case spu_itype::SHLQBYI: + { + is_move_register_op = op.i7 == 0; + break; + } + // Technically only ORI is needed but I am taking into account possible third-party SPU compilers or hand-written assembly + case spu_itype::ORI: + case spu_itype::ORHI: + case spu_itype::ORBI: + case spu_itype::AI: + case spu_itype::AHI: + case spu_itype::XORI: + case spu_itype::XORHI: + case spu_itype::XORBI: + { + is_move_register_op = op.si10 == 0; + break; + } + case spu_itype::ANDI: + case spu_itype::ANDHI: + case spu_itype::ANDBI: + { + is_move_register_op = op.si10 == -1; + break; + } + default: + { + break; + } + } + + u32 reg_pos = SPU_LS_SIZE; + + auto org = reduced_loop->get_reg(op_rt); + + u32 reg_first = s_reg_max; + + for (u32 reg : {ra, rb, rc}) + { + if (reg != s_reg_max && reg != reg_first) + { + const auto arg = reduced_loop->find_reg(reg); + + if (arg && arg->modified >= 1) + { + reg_first = reg; + + if (reg_first != s_reg_max && !is_move_register_op) + { + // Multiple origins + org.add_instruction_modifier(spu_itype::UNK, op.opcode); + break; + } + } + } + } + + if (reg_first == s_reg_max) + { + org = {}; + + if (!is_move_register_op) + { + org.add_instruction_modifier(type, op.opcode); + } + } + else if (reg_first == rb) + { + std::swap(ra, rb); + } + else if (reg_first == rc) + { + std::swap(ra, rc); + } + + for (u32 reg : {ra, rb, rc}) + { + if (reg != s_reg_max) + { + const auto arg = reduced_loop->find_reg(reg); + + if (arg && reg != op_rt) + { + if (reg_first == reg) + { + org = *arg; + + if (!is_move_register_op) + { + org.add_instruction_modifier(type, op.opcode); + } + + continue; + } + + org.join_with_this(*arg); + } + else + { + org.add_register_origin(reg); + } + } + } + + *ensure(reduced_loop->find_reg(op_rt)) = org; + } + + if (reduced_loop->active && ((type & spu_itype::memory) || type == spu_itype::STOP || type == spu_itype::STOPD)) + { + reduced_loop->is_constant_expression = false; + } + // For debugging if (false && likely_putllc_loop && is_pattern_match) { @@ -5848,12 +6294,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s break; } - if (type == spu_itype::SYNC) - { - // Remember - sync = true; - } - + break_reduced_loop_pattern(19, reduced_loop->discard()); break; } @@ -5880,8 +6321,57 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s break; } + case spu_itype::BR: case spu_itype::BRA: { + if (reduced_loop->active) + { + if (!reduced_loop->is_two_block_loop || !reduced_loop->has_cond_state) + { + break_reduced_loop_pattern(20, reduced_loop->discard()); + break; + } + + for (const auto& [reg_num, reg] : reduced_loop->regs) + { + if (reg.is_loop_dictator(reg_num)) + { + if (reg.is_non_predictable_loop_dictator(reg_num)) + { + //break_reduced_loop_pattern(13, reduced_loop->discard()); + reduced_loop->is_constant_expression = false; + } + + reduced_loop->loop_dicts.set(reg_num); + } + } + + for (u32 i = 0; i < s_reg_max; i++) + { + const auto& b = ::at32(m_bbs, reduced_loop->loop_pc); + const auto& b2 = ::at32(m_bbs, bpc); + + if (!reduced_loop->loop_dicts.test(i)) + { + if (b.reg_use.test(i) || (!b.reg_mod.test(i) && b2.reg_use.test(i))) + { + if ((b.reg_use.test(i) && b.reg_mod.test(i)) || b2.reg_mod.test(i)) + { + reduced_loop->is_constant_expression = false; + reduced_loop->loop_writes.set(i); + } + else + { + reduced_loop->loop_args.set(i); + } + } + } + } + + reduced_loop_all.emplace(reduced_loop->loop_pc, *reduced_loop); + reduced_loop->discard(); + } + break; } @@ -5891,7 +6381,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s const u32 next_pc = spu_branch_target(pos, 1); const u32 target = spu_branch_target(pos, op.i16); - if (rchcnt_loop->active) + while (rchcnt_loop->active) { const reg_state_t& rt = vregs[op.rt]; @@ -5907,16 +6397,663 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s rchcnt_loop->conditioned = true; rchcnt_loop->branch_pc = pos; rchcnt_loop->branch_target = rchcnt_loop->product_test_negate != (type == spu_itype::BRZ) ? target : next_pc; - break; } + + break; } - break; + [[fallthrough]]; } - case spu_itype::BR: case spu_itype::BRHZ: case spu_itype::BRHNZ: { + const u32 next_pc = spu_branch_target(pos, 1); + const u32 target = spu_branch_target(pos, op.i16); + + const bool is_u16_jump = type == spu_itype::BRHZ || type == spu_itype::BRHNZ; + const bool is_jump_zero = (type == spu_itype::BRZ || type == spu_itype::BRHZ) ^ reduced_loop->is_two_block_loop; + + while (reduced_loop->active) + { + if (reduced_loop->expected_sup_conds) + { + break_reduced_loop_pattern(50, reduced_loop->discard()); + break; + } + + const u32 op_rt = op.rt; + + const auto reg = reduced_loop->find_reg(op_rt); + + if (!reg/* || reg->modified == 0*/) // See special case regarding branch with direct comparison with 0 + { + break_reduced_loop_pattern(1, reduced_loop->discard()); + break; + } + + bool should_have_argument_dictator = false; + bool should_have_argument_increment = false; + bool cond_val_incr_before_cond = false; + bool ends_with_comparison = false; + + bool pattern_ok1 = true; + + switch (reg->mod1_type) + { + case spu_itype::A: + { + should_have_argument_increment = true; + [[fallthrough]]; + } + case spu_itype::AI: + case spu_itype::AHI: + { + cond_val_incr_before_cond = true; + pattern_ok1 = true; + break; + } + case spu_itype::CEQ: + case spu_itype::CEQH: + case spu_itype::CEQB: + case spu_itype::CGT: + case spu_itype::CGTH: + case spu_itype::CGTB: + case spu_itype::CLGT: + case spu_itype::CLGTH: + case spu_itype::CLGTB: + { + ends_with_comparison = true; + should_have_argument_dictator = true; + break; + } + case spu_itype::CEQI: + case spu_itype::CEQHI: + case spu_itype::CEQBI: + case spu_itype::CGTI: + case spu_itype::CGTHI: + case spu_itype::CGTBI: + case spu_itype::CLGTI: + case spu_itype::CLGTHI: + case spu_itype::CLGTBI: + { + ends_with_comparison = true; + pattern_ok1 = true; + break; + } + default: + { + if (reg->modified == 0) + { + // Special case: target may be sourced from another register which would be the loop dictator + break; + } + + pattern_ok1 = false; + break; + } + } + + if (!pattern_ok1) + { + break_reduced_loop_pattern(9, reduced_loop->discard()); + break; + } + + if (reg->modified >= 2) + { + switch (reg->mod2_type) + { + case spu_itype::A: + { + should_have_argument_increment = true; + [[fallthrough]]; + } + case spu_itype::AI: + case spu_itype::AHI: + { + if (cond_val_incr_before_cond) + { + // AI twice + break_reduced_loop_pattern(8, reduced_loop->discard()); + pattern_ok1 = false; + break; + } + + cond_val_incr_before_cond = false; + pattern_ok1 = true; + break; + } + case spu_itype::CEQ: + case spu_itype::CEQH: + case spu_itype::CEQB: + case spu_itype::CGT: + case spu_itype::CGTH: + case spu_itype::CGTB: + case spu_itype::CLGT: + case spu_itype::CLGTH: + case spu_itype::CLGTB: + { + if (!cond_val_incr_before_cond) + { + // Double comparison + break_reduced_loop_pattern(19, reduced_loop->discard()); + pattern_ok1 = false; + break; + } + + pattern_ok1 = true; + ends_with_comparison = true; + should_have_argument_dictator = true; + break; + } + case spu_itype::CEQI: + case spu_itype::CEQHI: + case spu_itype::CEQBI: + case spu_itype::CGTI: + case spu_itype::CGTHI: + case spu_itype::CGTBI: + case spu_itype::CLGTI: + case spu_itype::CLGTHI: + case spu_itype::CLGTBI: + { + if (!cond_val_incr_before_cond) + { + // Double comparison + break_reduced_loop_pattern(19, reduced_loop->discard()); + pattern_ok1 = false; + break; + } + + ends_with_comparison = true; + pattern_ok1 = true; + break; + } + default: + { + pattern_ok1 = false; + break; + } + } + } + + if (!pattern_ok1) + { + break_reduced_loop_pattern(10, reduced_loop->discard()); + break; + } + + bool found_loop_dictator = false; + bool found_loop_argument_for_dictator = false; + u32 null_regs_found = 0; + + for (u32 i = 0; i < reg->regs.size() && reduced_loop->active; i++) + { + if (reg->regs.test(i)) + { + if (0) if (i == op_rt || reg->modified == 0) + { + // Special case: direct comparison with zero for 32-bits (the only supported form by SPU) + + if (is_jump_zero) + { + // Infinite or single-time "loop" + break_reduced_loop_pattern(3, reduced_loop->discard()); + break; + } + + if (reg->modified >= 2) + { + break_reduced_loop_pattern(22, reduced_loop->discard()); + break; + } + + reduced_loop->cond_val_mask = u32{umax}; + reduced_loop->cond_val_min = 0; + reduced_loop->cond_val_size = u32{umax}; + + auto comp_reg = i == op_rt ? reg : reduced_loop->find_reg(i); + + if (!comp_reg || !comp_reg->is_predictable_loop_dictator(i)) + { + break_reduced_loop_pattern(4, reduced_loop->discard()); + break; + } + + ensure(reg->modified == 1 || i != op_rt); + + reduced_loop->cond_val_incr = static_cast(comp_reg->IMM); + reduced_loop->cond_val_incr_before_cond = reg->modified == 1; + reduced_loop->cond_val_register_idx = i; + reduced_loop->cond_val_compare = CMP_NOT_EQUAL; + reduced_loop->cond_val_is_immediate = true; + + found_loop_dictator = true; + break; + } + + auto reg_org = reduced_loop->find_reg(i); + u32 reg_index = i; + + if (reg_org && !cond_val_incr_before_cond && reg_org->modified == 0 && reg_org->regs.count() - 1u <= 1u && !reg_org->regs.test(i)) + { + for (u32 j = 0; j <= s_reg_127; j++) + { + if (reg_org->regs.test(j)) + { + if (const auto reg_found = reduced_loop->find_reg(j)) + { + if (reg_found->modified) + { + reg_org = reg_found; + reg_index = j; + break; + } + } + } + } + } + + if (!reg_org || reg_org->is_null(reg_index)) + { + // if (found_loop_dictator && !reduced_loop->cond_val_incr_is_immediate) + // { + // ensure(reduced_loop->cond_val_incr < s_reg_max); + + // } + // if (!should_have_argument_dictator) + // { + // break_reduced_loop_pattern(11, reduced_loop->discard()); + // break; + // } + + // if (found_loop_argument_for_dictator) + // { + // break_reduced_loop_pattern(6, reduced_loop->discard()); + // break; + // } + + // found_loop_argument_for_dictator = true; + // reduced_loop->cond_val_is_immediate = false; + + // if (found_loop_dictator) + // { + // ensure(i == reduced_loop->cond_val_register_argument_idx); + // } + // else + // { + // reduced_loop->cond_val_register_argument_idx = i; + // } + + // if (found_loop_dictator && reg->regs.count() == 2) + // { + // break; + // } + + null_regs_found++; + continue; + } + + if (found_loop_dictator) + { + break_reduced_loop_pattern(13, reduced_loop->discard()); + break; + } + + found_loop_dictator = true; + + if (!reg_org->is_predictable_loop_dictator(i)) + { + break_reduced_loop_pattern(7, reduced_loop->discard()); + break; + } + + u32 cond_val_incr = static_cast(reg_org->IMM); + + if (reg_org->mod1_type == spu_itype::AI || reg_org->mod1_type == spu_itype::AHI) + { + reduced_loop->cond_val_incr_is_immediate = true; + reduced_loop->cond_val_incr = static_cast(reg_org->IMM); + } + else if (reg_org->mod1_type == spu_itype::A) + { + reduced_loop->cond_val_incr_is_immediate = false; + + const u32 op_ra = spu_opcode_t{reg->IMM}.ra; + const u32 op_rb = spu_opcode_t{reg->IMM}.rb; + + if (!(op_ra == reg_index || op_rb == reg_index)) + { + break_reduced_loop_pattern(25, reduced_loop->discard()); + break; + } + + const u32 incr_arg_reg = reg_index == op_ra ? op_rb : op_ra; + + if (!reduced_loop->is_reg_null(incr_arg_reg)) + { + break_reduced_loop_pattern(26, reduced_loop->discard()); + break; + } + + reduced_loop->cond_val_incr = incr_arg_reg; + } + else + { + break_reduced_loop_pattern(28, reduced_loop->discard()); + break; + } + + reduced_loop->cond_val_incr_before_cond = cond_val_incr_before_cond; + + u64 cmp_mask = 0; + compare_direction cmp_direction{}; + + if (!ends_with_comparison) + { + if (is_jump_zero) + { + // Infinite or single-time "loop" + break_reduced_loop_pattern(3, reduced_loop->discard()); + break; + } + + cmp_mask = is_u16_jump ? u16{umax} : u32{umax}; + reduced_loop->cond_val_min = 0; + reduced_loop->cond_val_is_immediate = true; + cmp_direction = CMP_NOT_EQUAL; + } + else if (!should_have_argument_dictator) + { + reduced_loop->cond_val_min = reg->IMM; + reduced_loop->cond_val_is_immediate = true; + + const auto cmp_optype = reg->reverse1_type() == spu_itype::XSBH ? reg->reverse2_type() : reg->reverse1_type(); + + switch (cmp_optype) + { + case spu_itype::CEQI: + case spu_itype::CEQHI: + case spu_itype::CEQBI: + { + cmp_direction = CMP_EQUAL; + break; + } + case spu_itype::CGTI: + case spu_itype::CGTHI: + case spu_itype::CGTBI: + { + cmp_direction = CMP_SGREATER; + break; + } + case spu_itype::CLGTI: + case spu_itype::CLGTHI: + case spu_itype::CLGTBI: + { + cmp_direction = CMP_LGREATER; + break; + } + default: + { + break_reduced_loop_pattern(21, reduced_loop->discard()); + } + } + + switch (cmp_optype) + { + case spu_itype::CEQI: + case spu_itype::CGTI: + case spu_itype::CLGTI: + { + cmp_mask = u32{umax}; + break; + } + case spu_itype::CLGTHI: + case spu_itype::CEQHI: + case spu_itype::CGTHI: + { + cmp_mask = u16{umax}; + break; + } + case spu_itype::CEQBI: + case spu_itype::CGTBI: + case spu_itype::CLGTBI: + { + cmp_mask = u8{umax}; + break; + } + default: break_reduced_loop_pattern(21, reduced_loop->discard()); + } + + if (is_jump_zero) + { + cmp_direction = compare_direction{cmp_direction ^ CMP_NEGATE_FLAG}; + } + + if (cmp_direction == CMP_EQUAL2 || cmp_direction == CMP_NOT_EQUAL2) + { + // Fixup (no sense in remembering the turnaround for euqality comparison) + cmp_direction = compare_direction{cmp_direction & ~CMP_TURNAROUND_FLAG}; + } + } + else + { + const u32 op_ra = spu_opcode_t{reg->IMM}.ra; + const u32 op_rb = spu_opcode_t{reg->IMM}.rb; + + if (!(op_ra == reg_index || op_rb == reg_index)) + { + break_reduced_loop_pattern(20, reduced_loop->discard()); + break; + } + + const auto cmp_optype = reg->reverse1_type() == spu_itype::XSBH ? reg->reverse2_type() : reg->reverse1_type(); + + switch (cmp_optype) + { + case spu_itype::CEQ: + case spu_itype::CEQH: + case spu_itype::CEQB: + { + cmp_direction = CMP_EQUAL; + break; + } + case spu_itype::CGT: + case spu_itype::CGTH: + case spu_itype::CGTB: + { + cmp_direction = CMP_SGREATER; + break; + } + case spu_itype::CLGT: + case spu_itype::CLGTH: + case spu_itype::CLGTB: + { + cmp_direction = CMP_LGREATER; + break; + } + default: ensure(false); + } + + switch (cmp_optype) + { + case spu_itype::CEQ: + case spu_itype::CGT: + case spu_itype::CLGT: + { + cmp_mask = u32{umax}; + break; + } + case spu_itype::CLGTH: + case spu_itype::CEQH: + case spu_itype::CGTH: + { + cmp_mask = u16{umax}; + break; + } + case spu_itype::CEQB: + case spu_itype::CGTB: + case spu_itype::CLGTB: + { + cmp_mask = u8{umax}; + break; + } + default: ensure(false); + } + + if (op_ra != i) + { + // Compare is on the oppsoite direction + // This variation exists only via register mode (due to lack of SPU opcodes) + cmp_direction = compare_direction{cmp_direction ^ CMP_TURNAROUND_FLAG}; + } + + if (is_jump_zero) + { + cmp_direction = compare_direction{cmp_direction ^ CMP_NEGATE_FLAG}; + } + + if (cmp_direction == CMP_EQUAL2 || cmp_direction == CMP_NOT_EQUAL2) + { + // Fixup (no sense in remembering the turnaround for euqality comparison) + cmp_direction = compare_direction{cmp_direction & ~CMP_TURNAROUND_FLAG}; + } + + // The loop dictator is the register that is not the argument + const u32 loop_arg_reg = reg_index == op_ra ? op_rb : op_ra; + const u32 loop_dict_reg = reg_index == op_ra ? op_ra : op_rb; + reduced_loop->cond_val_is_immediate = false; + + if (found_loop_argument_for_dictator) + { + ensure(loop_arg_reg == reduced_loop->cond_val_register_argument_idx); + } + else + { + reduced_loop->cond_val_register_argument_idx = loop_arg_reg; + } + + if (!reduced_loop->is_reg_null(loop_arg_reg)) + { + break_reduced_loop_pattern(27, reduced_loop->discard()); + break; + } + + found_loop_argument_for_dictator = true; + } + + if (cmp_direction == CMP_EQUAL) + { + // Infinite or single-time "loop" + break_reduced_loop_pattern(18, reduced_loop->discard()); + break; + } + + if (cmp_mask == u16{umax} && !is_u16_jump) + { + break_reduced_loop_pattern(14, reduced_loop->discard()); + break; + } + + if (cmp_mask == u8{umax}) + { + bool instructions_ok = false; + + if (is_u16_jump) + { + // If ANDI(0xff) is used, although unlikely, it fine as well for 16-bits + instructions_ok = FN(x == spu_itype::XSBH || x == spu_itype::ANDI)(!cond_val_incr_before_cond ? reg->mod2_type : reg->mod3_type); + } + else + { + instructions_ok = FN(x == spu_itype::ANDI)(!cond_val_incr_before_cond ? reg->mod2_type : reg->mod3_type); + } + + if (!instructions_ok) + { + break_reduced_loop_pattern(15, reduced_loop->discard()); + break; + } + } + + reduced_loop->cond_val_compare = cmp_direction; + reduced_loop->cond_val_mask = cmp_mask; + reduced_loop->cond_val_register_idx = reg_index; + + // if (!should_have_argument_dictator && reg->regs.count() == 1) + // { + // break; + // } + + // if (found_loop_argument_for_dictator && reg->regs.count() == 2) + // { + // break; + // } + } + } + + if (!found_loop_dictator) + { + break_reduced_loop_pattern(16, reduced_loop->discard()); + } + + if (should_have_argument_dictator && !found_loop_argument_for_dictator) + { + break_reduced_loop_pattern(17, reduced_loop->discard()); + } + + if (reduced_loop->active) + { + ensure(reduced_loop->cond_val_register_idx != umax); + + if (reduced_loop->is_two_block_loop) + { + reduced_loop->has_cond_state = true; + break; + } + + for (const auto& [reg_num, reg] : reduced_loop->regs) + { + if (reg.is_loop_dictator(reg_num)) + { + if (reg.is_non_predictable_loop_dictator(reg_num)) + { + //break_reduced_loop_pattern(13, reduced_loop->discard()); + reduced_loop->is_constant_expression = false; + } + + reduced_loop->loop_dicts.set(reg_num); + } + } + + for (u32 i = 0; i < s_reg_max; i++) + { + const auto& b = ::at32(m_bbs, reduced_loop->loop_pc); + const auto& b2 = ::at32(m_bbs, bpc); + + if (!reduced_loop->loop_dicts.test(i)) + { + if (b.reg_use.test(i) || (!b.reg_mod.test(i) && b2.reg_use.test(i))) + { + if ((b.reg_use.test(i) && b.reg_mod.test(i)) || b2.reg_mod.test(i)) + { + reduced_loop->is_constant_expression = false; + reduced_loop->loop_writes.set(i); + } + else + { + reduced_loop->loop_args.set(i); + } + } + } + } + + reduced_loop_all.emplace(reduced_loop->loop_pc, *reduced_loop); + reduced_loop->discard(); + } + + break; + } + break; } @@ -5929,17 +7066,49 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s case spu_itype::HLGTI: case spu_itype::LNOP: case spu_itype::NOP: - case spu_itype::MTSPR: case spu_itype::FSCRWR: { // Do nothing break; } - + + case spu_itype::MTSPR: + { + break_all_patterns(99); + break; + } + case spu_itype::WRCH: { break_channel_pattern(56, rchcnt_loop->discard()); + if (reduced_loop->active) + { + switch (op.ra) + { + case MFC_EAL: + case MFC_LSA: + case MFC_TagID: + case MFC_Size: + case MFC_EAH: + case SPU_WrDec: + case SPU_WrSRR0: + case SPU_WrEventAck: + case SPU_Set_Bkmk_Tag: + case SPU_PM_Start_Ev: + case SPU_PM_Stop_Ev: + case MFC_WrTagMask: + { + break; + } + default: + { + break_reduced_loop_pattern(18, reduced_loop->discard()); + break; + } + } + } + switch (op.ra) { case MFC_EAL: @@ -6202,6 +7371,14 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s const bool is_read = type == spu_itype::RDCH; bool invalidate = true; + if (!is_read || op.ra != SPU_RdDec) + { + if (reduced_loop->active) + { + break_reduced_loop_pattern(17, reduced_loop->discard()); + } + } + const auto it = rchcnt_loop_all.find(pos); if (it != rchcnt_loop_all.end()) @@ -7121,7 +8298,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s rb = op.rb; } - if (type & spu_itype::_quadrop && m_use_rc.test(pos / 4)) + if (m_use_rc.test(pos / 4)) { rc = op.rc; } @@ -7169,6 +8346,11 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s { for (u32 next_target : ::at32(m_targets, pos)) { + if (next_target == SPU_LS_SIZE) + { + continue; + } + add_block(next_target); } @@ -7353,6 +8535,64 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } } + for (const auto& [loop_pc, pattern] : reduced_loop_all) + { + if (!pattern.active || pattern.loop_pc == SPU_LS_SIZE) + { + continue; + } + + if (inst_attr attr = m_inst_attrs[(loop_pc - entry_point) / 4]; attr == inst_attr::none) + { + add_pattern(inst_attr::reduced_loop, loop_pc - result.entry_point, 0, std::make_shared(pattern)); + + std::string regs = "{"; + + for (const auto& [reg_num, reg] : pattern.regs) + { + if (reg.is_loop_dictator(reg_num)) + { + if (regs.size() != 1) + { + regs += ","; + } + + fmt::append(regs, " r%u", reg_num); + } + } + + for (u32 i = 0; i < s_reg_max; i++) + { + if (pattern.loop_writes.test(i)) + { + if (regs.size() != 1) + { + regs += ","; + } + + fmt::append(regs, " r%u-w", i); + } + + if (pattern.loop_args.test(i)) + { + if (regs.size() != 1) + { + regs += ","; + } + + fmt::append(regs, " r%u-r", i); + } + } + + regs += " }"; + + spu_log.success("Reduced Loop Pattern Detected! (REGS: %s, DICT: r%d, ARG: %s, Incr: %s (%s), CMP/Size: %s/%u, loop_pc=0x%x, 0x%x-%s)", regs, pattern.cond_val_register_idx + , pattern.cond_val_is_immediate ? fmt::format("0x%x", pattern.cond_val_min) : fmt::format("r%d", pattern.cond_val_register_argument_idx) + , pattern.cond_val_incr_is_immediate ? fmt::format("%d", static_cast(pattern.cond_val_incr)) : fmt::format("r%d", pattern.cond_val_incr), pattern.cond_val_incr_before_cond ? "BEFORE" : "AFTER" + , pattern.cond_val_compare, std::popcount(pattern.cond_val_mask), loop_pc, entry_point, func_hash); + } + } + if (likely_putllc_loop && !had_putllc_evaluation) { spu_log.notice("Likely missed PUTLLC16 patterns. (entry=0x%x)", entry_point); @@ -7386,7 +8626,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s return result; } -void spu_recompiler_base::dump(const spu_program& result, std::string& out) +void spu_recompiler_base::dump(const spu_program& result, std::string& out, u32 block_min, u32 block_max) { SPUDisAsm dis_asm(cpu_disasm_mode::dump, reinterpret_cast(result.data.data()), result.lower_bound); @@ -7409,10 +8649,18 @@ void spu_recompiler_base::dump(const spu_program& result, std::string& out) hash = "N/A"; } - fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n\n", result.entry_point, result.data.size(), hash); + if (block_min == 0) + { + fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n\n", result.entry_point, result.data.size(), hash); + } for (auto& bb : m_bbs) { + if (bb.first < block_min || bb.first >= block_max) + { + continue; + } + if (m_block_info[bb.first / 4]) { fmt::append(out, "A: [0x%05x] %s [%s]\n", bb.first, m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block", spu_block_hash{(hash_start & -65536) + bb.first / 4}); @@ -8435,9 +9683,9 @@ std::array& block_reg_info::evaluate_start_state(const s return walkby_state; } -void spu_recompiler_base::add_pattern(inst_attr attr, u32 start, u64 info) +void spu_recompiler_base::add_pattern(inst_attr attr, u32 start, u64 info, std::shared_ptr info_ptr) { - m_patterns[start] = pattern_info{info}; + m_patterns[start] = pattern_info{info, info_ptr}; m_inst_attrs[start / 4] = attr; } diff --git a/rpcs3/Emu/Cell/SPUOpcodes.h b/rpcs3/Emu/Cell/SPUOpcodes.h index cea4513e3f..93e56e48da 100644 --- a/rpcs3/Emu/Cell/SPUOpcodes.h +++ b/rpcs3/Emu/Cell/SPUOpcodes.h @@ -24,6 +24,20 @@ union spu_opcode_t bf_t i16; // 9..24 bf_t si16; // 9..24, signed bf_t i18; // 7..24 + + // For 16-bit instructions in the context of 32-bits + u32 duplicate_si10() const + { + const u32 _16 = static_cast(static_cast(si10)); + return (_16 << 16) | _16; + } + + // For 8-bit instructions in the context of 32-bits + u32 duplicate_duplicate_si10() const + { + const u32 _8 = static_cast(si10 & 0xff); + return (_8 << 24) | (_8 << 16) | (_8 << 8) | _8; + } }; constexpr u32 spu_branch_target(u32 pc, u32 imm = 0) diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 57d842e69d..4348e7e08a 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -4,6 +4,7 @@ #include "Utilities/lockless.h" #include "Utilities/address_range.h" #include "SPUThread.h" +#include "SPUAnalyser.h" #include #include #include @@ -201,6 +202,25 @@ public: __bitset_enum_max }; + enum compare_direction : u32 + { + CMP_TURNAROUND_FLAG = 0x1, + CMP_NEGATE_FLAG = 0x100, + CMP_SLESS = 0, + CMP_SGREATER = CMP_SLESS | CMP_TURNAROUND_FLAG, + CMP_EQUAL, + CMP_EQUAL2 = CMP_EQUAL | CMP_TURNAROUND_FLAG, + CMP_LLESS, + CMP_LGREATER = CMP_LLESS | CMP_TURNAROUND_FLAG, + CMP_SGREATER_EQUAL = CMP_SLESS | CMP_NEGATE_FLAG, + CMP_SLOWER_EQUAL = CMP_SGREATER | CMP_NEGATE_FLAG, + CMP_NOT_EQUAL = CMP_EQUAL | CMP_NEGATE_FLAG, + CMP_NOT_EQUAL2 = CMP_NOT_EQUAL | CMP_TURNAROUND_FLAG, + CMP_LGREATER_EQUAL = CMP_LLESS | CMP_NEGATE_FLAG, + CMP_LLOWER_EQUAL = CMP_LGREATER | CMP_NEGATE_FLAG, + CMP_UNKNOWN, + }; + struct reg_state_t { bs_t flag{+vf::is_null}; @@ -273,6 +293,399 @@ public: static u32 alloc_tag(bool reset = false) noexcept; }; + struct reduced_loop_t + { + bool active = false; // Single block loop detected + bool failed = false; + u32 loop_pc = SPU_LS_SIZE; + u32 loop_end = SPU_LS_SIZE; + + // False: single-block loop + // True: loop with a trailing block of aftermath (iteration update) stuff (like for (u32 i = 0; i < 10; /*update*/ i++)) + bool is_two_block_loop = false; + bool has_cond_state = false; + + // Loop stay-in state requirement + u64 cond_val_mask = umax; + u64 cond_val_min = 0; + u64 cond_val_size = 0; + compare_direction cond_val_compare{}; + u64 cond_val_incr = 0; + bool cond_val_incr_is_immediate = false; + u64 cond_val_register_argument_idx = umax; + u64 cond_val_register_idx = umax; + bool cond_val_incr_before_cond = false; + bool cond_val_incr_before_cond_taken_in_account = false; + bool cond_val_is_immediate = false; + + // Loop attributes + bool is_constant_expression = false; + bool is_secret = false; + + struct supplemental_condition_t + { + u64 immediate_value = umax; + u64 type_size = 0; + compare_direction val_compare{}; + }; + + // Supplemental loop condition: + // Inner conditions that depend on extrnal values (not produced inside the loop) + // all should evaluate to false in order for the optimization to work (at the moment) + // So succeeding can be treated linearly + u64 expected_sup_conds = 0; + u64 current_sup_conds_index = 0; + std::vector sup_conds; + + void take_cond_val_incr_before_cond_into_account() + { + if (cond_val_is_immediate && cond_val_incr_before_cond_taken_in_account && !cond_val_incr_before_cond_taken_in_account) + { + cond_val_min -= cond_val_incr; + cond_val_min &= cond_val_mask; + cond_val_incr_before_cond_taken_in_account = true; + } + } + + std::bitset loop_args; + std::bitset loop_dicts; + std::bitset loop_writes; + + struct origin_t + { + std::bitset regs{}; + u32 modified = 0; + spu_itype_t mod1_type = spu_itype::UNK; + spu_itype_t mod2_type = spu_itype::UNK; + spu_itype_t mod3_type = spu_itype::UNK; + u32 IMM = 0; + +private: + // Internal, please access using fixed order + spu_itype_t access_type(u32 i) const + { + if (i > modified) + { + return spu_itype::UNK; + } + + switch (i) + { + case 1: return mod1_type; + case 2: return mod2_type; + case 3: return mod3_type; + default: return spu_itype::UNK; + } + + return spu_itype::UNK; + } +public: + + spu_itype_t reverse1_type() + { + return access_type(modified); + } + + spu_itype_t reverse2_type() + { + return access_type(modified - 1); + } + + spu_itype_t reverse3_type() + { + return access_type(modified - 2); + } + + origin_t& join_with_this(const origin_t& rhs) + { + regs |= rhs.regs; + return *this; + } + + origin_t& join_with_this(u32 rhs) + { + regs.set(rhs); + return *this; + } + + origin_t& add_register_origin(u32 reg_val) + { + regs.set(reg_val); + return *this; + } + + bool is_single_reg_access(u32 reg_val) const + { + if (!modified) + { + return true; + } + + return regs.count() == 1 && regs.test(reg_val); + } + + bool is_loop_dictator(u32 reg_val, bool test_predictable = false, bool should_predictable = true) const + { + if (!modified) + { + return false; + } + + if (regs.count() >= 1 && regs.test(reg_val)) + { + if (!test_predictable) + { + return true; + } + + if (modified > 1) + { + return should_predictable ^ true; + } + + switch (mod1_type) + { + case spu_itype::A: + { + if (regs.count() == 2) + { + return should_predictable; + } + + return should_predictable ^ true; + } + case spu_itype::AI: + case spu_itype::AHI: + { + if (IMM && regs.count() == 1) + { + return should_predictable; + } + + return should_predictable ^ true; + } + default: break; + } + + return should_predictable ^ true; + } + + return false; + } + + bool is_predictable_loop_dictator(u32 reg_val) const + { + return is_loop_dictator(reg_val, true, true); + } + + bool is_non_predictable_loop_dictator(u32 reg_val) const + { + return is_loop_dictator(reg_val, true, false); + } + + bool is_null(u32 reg_val) const noexcept + { + if (modified) + { + return false; + } + + if (regs.count() - (regs.test(reg_val) ? 1 : 0)) + { + return false; + } + + return true; + } + + origin_t& add_instruction_modifier(spu_itype_t inst_type, u32 imm = 0) + { + if (inst_type == spu_itype::UNK) + { + mod1_type = spu_itype::UNK; + mod2_type = spu_itype::UNK; + mod3_type = spu_itype::UNK; + IMM = umax; + modified = 1; + return *this; + } + + if (modified == 1) + { + if (modified == 3) + { + mod1_type = spu_itype::UNK; + mod2_type = spu_itype::UNK; + mod3_type = spu_itype::UNK; + IMM = umax; + modified = 1; + return *this; + } + + bool is_ok = false; + switch (inst_type) + { + case spu_itype::XSBH: + { + const auto prev_type = modified == 1 ? mod1_type : mod2_type; + is_ok &= mod1_type == spu_itype::CEQB || mod1_type == spu_itype::CEQBI || mod1_type == spu_itype::CGTB || mod1_type == spu_itype::CGTBI || mod1_type == spu_itype::CLGTB || mod1_type == spu_itype::CLGTBI; + break; + } + case spu_itype::ANDI: + { + const auto prev_type = modified == 1 ? mod1_type : mod2_type; + is_ok &= mod1_type == spu_itype::CEQB || mod1_type == spu_itype::CEQBI || mod1_type == spu_itype::CGTB || mod1_type == spu_itype::CGTBI || mod1_type == spu_itype::CLGTB || mod1_type == spu_itype::CLGTBI; + is_ok &= (spu_opcode_t{imm}.si10 & 0xff) == 0xff; + break; + } + case spu_itype::CEQ: + case spu_itype::CEQH: + case spu_itype::CEQB: + case spu_itype::CGT: + case spu_itype::CGTH: + case spu_itype::CGTB: + case spu_itype::CLGT: + case spu_itype::CLGTH: + case spu_itype::CLGTB: + { + is_ok = modified == 1 && (mod1_type == spu_itype::AI || mod1_type == spu_itype::AHI); + IMM = imm; + break; + } + case spu_itype::CEQI: + case spu_itype::CEQHI: + case spu_itype::CEQBI: + case spu_itype::CGTI: + case spu_itype::CGTHI: + case spu_itype::CGTBI: + case spu_itype::CLGTI: + case spu_itype::CLGTHI: + case spu_itype::CLGTBI: + { + is_ok = modified == 1 && (mod1_type == spu_itype::AI || mod1_type == spu_itype::AHI); + IMM = spu_opcode_t{imm}.si10; + break; + } + } + + if (!is_ok) + { + mod1_type = spu_itype::UNK; + mod2_type = spu_itype::UNK; + mod3_type = spu_itype::UNK; + IMM = umax; + modified = 1; + return *this; + } + + (modified == 1 ? mod2_type : mod3_type) = inst_type; + modified++; + return *this; + } + + mod1_type = inst_type; + modified = 1; + + switch (inst_type) + { + case spu_itype::AHI: + { + IMM = spu_opcode_t{imm}.duplicate_si10(); + return *this; + } + case spu_itype::AI: + case spu_itype::ORI: + case spu_itype::XORI: + case spu_itype::ANDI: + + case spu_itype::CEQI: + case spu_itype::CEQHI: + case spu_itype::CEQBI: + case spu_itype::CGTI: + case spu_itype::CGTHI: + case spu_itype::CGTBI: + case spu_itype::CLGTI: + case spu_itype::CLGTHI: + case spu_itype::CLGTBI: + { + IMM = spu_opcode_t{imm}.si10; + return *this; + } + case spu_itype::ILA: + { + IMM = spu_opcode_t{imm}.i18; + return *this; + } + case spu_itype::IOHL: + case spu_itype::ILH: + case spu_itype::ILHU: + { + IMM = spu_opcode_t{imm}.i16; + return *this; + } + default: + { + IMM = imm; + break; + } + } + + return *this; + } + }; + + static origin_t make_reg(u32 reg_val) noexcept + { + origin_t org{}; + org.add_register_origin(reg_val); + return org; + } + + const origin_t* find_reg(u32 reg_val) const noexcept + { + for (auto& pair : regs) + { + if (pair.first == reg_val) + { + return &pair.second; + } + } + + return nullptr; + } + + origin_t* find_reg(u32 reg_val) noexcept + { + return const_cast(std::as_const(*this).find_reg(reg_val)); + } + + bool is_reg_null(u32 reg_val) const noexcept + { + if (const auto reg_found = find_reg(reg_val)) + { + return reg_found->is_null(reg_val); + } + + return true; + } + + origin_t get_reg(u32 reg_val) noexcept + { + const auto org = find_reg(reg_val); + return org ? *org : regs.emplace_back(reg_val, std::remove_reference_t{}).second; + } + + std::vector> regs; + + // Return old state for error reporting + reduced_loop_t discard() + { + const reduced_loop_t old = *this; + *this = reduced_loop_t{}; + return old; + } + }; + protected: spu_runtime* m_spurt{}; @@ -391,18 +804,23 @@ protected: putllc16, putllc0, rchcnt_loop, + reduced_loop, }; std::vector m_inst_attrs; struct pattern_info { - u64 info; + // Info via integral + u64 info{}; + + // Info via additional erased-typed pointer + std::shared_ptr info_ptr; }; - std::unordered_map m_patterns; + std::map m_patterns; - void add_pattern(inst_attr attr, u32 start, u64 info); + void add_pattern(inst_attr attr, u32 start, u64 info, std::shared_ptr info_ptr = nullptr); private: // For private use @@ -435,7 +853,7 @@ public: spu_program analyse(const be_t* ls, u32 entry_point, std::map>* out_target_list = nullptr); // Print analyser internal state - void dump(const spu_program& result, std::string& out); + void dump(const spu_program& result, std::string& out, u32 block_min = 0, u32 block_max = SPU_LS_SIZE); // Get SPU Runtime spu_runtime& get_runtime() From 0a12e482f4e7c24313b4751da2eeabfbde96d641 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Tue, 10 Feb 2026 18:45:17 +0200 Subject: [PATCH 02/11] SPU LLVM: Implement reduced loop --- rpcs3/Emu/CPU/CPUTranslator.h | 9 + rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 2 + rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 576 ++++++++++++++++++++++++- rpcs3/Emu/Cell/SPUOpcodes.h | 1 + rpcs3/Emu/Cell/SPUThread.cpp | 61 ++- 5 files changed, 639 insertions(+), 10 deletions(-) diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 9b9804fd39..4a0028c571 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -3951,6 +3951,15 @@ public: erase_stores({args.value...}); } + // Debug breakpoint + void debugtrap() + { + const auto _rty = llvm::Type::getVoidTy(m_context); + const auto type = llvm::FunctionType::get(_rty, {}, false); + const auto func = llvm::cast(m_ir->GetInsertBlock()->getParent()->getParent()->getOrInsertFunction("llvm.debugtrap", type).getCallee()); + m_ir->CreateCall(func); + } + template static auto pshufb(T&& a, U&& b) { diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 4b52ce8c43..eff272753a 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -8544,6 +8544,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (inst_attr attr = m_inst_attrs[(loop_pc - entry_point) / 4]; attr == inst_attr::none) { + const u64 hash = loop_pc / 4 + read_from_ptr>(func_hash.data()); + add_pattern(inst_attr::reduced_loop, loop_pc - result.entry_point, 0, std::make_shared(pattern)); std::string regs = "{"; diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 856a039e5e..103f48085d 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -139,7 +139,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator spu_recompiler_base::block_info* bb{}; // Current block's entry block - llvm::BasicBlock* block; + llvm::BasicBlock* block{}; // Final block (for PHI nodes, set after completion) llvm::BasicBlock* block_end{}; @@ -155,6 +155,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Store instructions std::array store{}; + bool block_wide_reg_store_elimination = false; // Store reordering/elimination protection std::array store_context_last_id = fill_array(0); // Protects against illegal forward ordering @@ -364,7 +365,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { if (i != s_reg_lr && i != s_reg_sp && (i < s_reg_80 || i > s_reg_127)) { - m_block->reg[i] = m_ir->CreateLoad(get_reg_type(i), init_reg_fixed(i)); + m_block->reg[i] = get_reg_fixed(i, get_reg_type(i)); } } @@ -709,6 +710,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!reg) { + if (m_block && m_block->block_wide_reg_store_elimination) + { + fmt::throw_exception("Unexpected load: [%s] at 0x%x (gpr=r%d)", m_hash, m_pos, index); + } + // Load register value if necessary reg = m_finfo && m_finfo->load[index] ? m_finfo->load[index] : m_ir->CreateLoad(get_reg_type(index), init_reg_fixed(index)); } @@ -920,6 +926,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (m_block) { + if (m_block->block_wide_reg_store_elimination) + { + // Don't save registers for the current block iteration + // Affected optimizations: + // 1. Single-block reduced loop + return; + } + // Keep the store's location in history of gpr preservaions m_block->store_context_last_id[index] = m_block->store_context_ctr[index]; m_block->store_context_first_id[index] = std::min(m_block->store_context_first_id[index], m_block->store_context_ctr[index]); @@ -2059,6 +2073,43 @@ public: bool need_check = false; m_block->bb = &bb; + // [1gJ45f2-0x00a40]: 16.4982% (113258) + // [ZsQTud1-0x0924c]: 6.1202% (42014) + // [ZsQTud1-0x08e54]: 5.6610% (38862) + // [0000000-0x3fffc]: 4.3764% (30043) + // [Zh4tpJM-0x00bcc]: 3.7908% (26023) + // [CFt8hXu-0x063b8]: 3.6177% (24835) + // [8YJCUjv-0x0ad18]: 3.2417% (22254) + // [Try3XHn-0x0f018]: 2.3721% (16284) + // [s6ti9iu-0x07678]: 1.8464% (12675) + // [oyxkAPv-0x0c22c]: 1.7776% (12203) + // [Q0jLqH4-0x00324]: 1.6015% (10994) + static const std::array, 4> to_nop + { + { } + }; + + bool found_block = false; + + for (auto& [hash, pos] : to_nop) + { + if (m_hash.find(hash) <= 2 && baddr == pos) + { + found_block = true; + break; + } + } + + if (found_block) + { + for (u32 i = 0; i < 100; i++) + { + auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::last_getllar_lsa)); + auto mod_val = m_ir->CreateFDiv(value, llvm::ConstantFP::get(value->getType(), 1.1 + i)); + m_ir->CreateStore(value, spu_ptr(&spu_thread::last_getllar_lsa)); + } + } + if (!bb.preds.empty()) { // Initialize registers and build PHI nodes if necessary @@ -2174,6 +2225,485 @@ public: check_state(baddr); } + const bool is_reduced_loop = m_inst_attrs[(baddr - start) / 4] == inst_attr::reduced_loop; + const auto reduced_loop_info = is_reduced_loop ? std::static_pointer_cast(ensure(m_patterns.at(baddr - start).info_ptr)) : nullptr; + + BasicBlock* block_optimization_phi_parent = nullptr; + const auto block_optimization_inner = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b-loop-it-0x%x", m_pos), m_function) : nullptr; + const auto block_optimization_exit_early = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b-loop-exit-0x%x", m_pos), m_function) : nullptr; + const auto block_optimization_next = is_reduced_loop ? BasicBlock::Create(m_context, fmt::format("b2-0x%x", m_pos), m_function) : nullptr; + + std::array reduced_loop_phi_nodes{}; + std::array reduced_loop_init_regs{}; + + auto make_reduced_loop_condition = [&](llvm::BasicBlock* optimization_block, bool is_second_time, u32 reserve_iterations) + { + llvm::ICmpInst::Predicate compare{}; + + switch (reduced_loop_info->cond_val_compare) + { + case CMP_SLESS: compare = ICmpInst::ICMP_SLT; break; + case CMP_SGREATER: compare = ICmpInst::ICMP_SGT; break; + case CMP_EQUAL: compare = ICmpInst::ICMP_EQ; break; + case CMP_LLESS: compare = ICmpInst::ICMP_ULT; break; + case CMP_LGREATER: compare = ICmpInst::ICMP_UGT; break; + case CMP_SGREATER_EQUAL: compare = ICmpInst::ICMP_SGE; break; + case CMP_SLOWER_EQUAL: compare = ICmpInst::ICMP_SLE; break; + case CMP_NOT_EQUAL: compare = ICmpInst::ICMP_NE; break; + case CMP_LGREATER_EQUAL: compare = ICmpInst::ICMP_UGE; break; + case CMP_LLOWER_EQUAL: compare = ICmpInst::ICMP_ULE; break; + { + break; + } + case CMP_UNKNOWN: + case CMP_NOT_EQUAL2: + case CMP_EQUAL2: + default: + { + ensure(false); + break; + } + } + + llvm::Value* loop_dictator_before_adjustment{}; + llvm::Value* loop_dictator_after_adjustment{}; + + spu_opcode_t reg_target{}; + reg_target.rt = reduced_loop_info->cond_val_register_idx; + + if (reg_target.rt != reduced_loop_info->cond_val_register_idx) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition register index: 0x%llx", reduced_loop_info->cond_val_register_idx); + } + + if (!m_block->reg[reg_target.rt]) + { + m_block->reg[reg_target.rt] = reduced_loop_init_regs[reg_target.rt]; + } + + switch (reduced_loop_info->cond_val_mask) + { + case u8{umax}: + { + loop_dictator_before_adjustment = get_scalar(get_vr(reg_target.rt)).eval(m_ir); + break; + } + case u16{umax}: + { + loop_dictator_before_adjustment = get_scalar(get_vr(reg_target.rt)).eval(m_ir); + break; + } + case u32{umax}: + { + loop_dictator_before_adjustment = get_scalar(get_vr(reg_target.rt)).eval(m_ir); + break; + } + case u64{umax}: + { + ensure(false); // TODO + loop_dictator_before_adjustment = get_scalar(get_vr(reg_target.rt)).eval(m_ir); + break; + } + default: + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition bit mask: 0x%llx", reduced_loop_info->cond_val_mask); + } + } + + const u32 type_bits = std::popcount(reduced_loop_info->cond_val_mask); + + llvm::Value* cond_val_incr = nullptr; + + if (reduced_loop_info->cond_val_incr_is_immediate) + { + cond_val_incr = m_ir->getIntN(type_bits, reduced_loop_info->cond_val_incr & reduced_loop_info->cond_val_mask); + } + else + { + spu_opcode_t reg_incr{}; + reg_incr.rt = reduced_loop_info->cond_val_incr; + + if (reg_incr.rt != reduced_loop_info->cond_val_incr) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal increment arguemnt register index: 0x%llx", reduced_loop_info->cond_val_incr); + } + switch (reduced_loop_info->cond_val_mask) + { + case u8{umax}: + { + cond_val_incr = get_scalar(get_vr(reg_incr.rt)).eval(m_ir); + break; + } + case u16{umax}: + { + cond_val_incr = get_scalar(get_vr(reg_incr.rt)).eval(m_ir); + break; + } + case u32{umax}: + { + cond_val_incr = get_scalar(get_vr(reg_incr.rt)).eval(m_ir); + break; + } + case u64{umax}: + { + ensure(false); // TODO + cond_val_incr = get_scalar(get_vr(reg_incr.rt)).eval(m_ir); + break; + } + } + } + + if (reduced_loop_info->cond_val_incr_before_cond && !reduced_loop_info->cond_val_incr_before_cond_taken_in_account) + { + loop_dictator_after_adjustment = m_ir->CreateAdd(loop_dictator_before_adjustment, cond_val_incr); + } + else + { + loop_dictator_after_adjustment = loop_dictator_before_adjustment; + } + + llvm::Value* loop_argument = nullptr; + + if (reduced_loop_info->cond_val_is_immediate) + { + loop_argument = m_ir->CreateTrunc(m_ir->getInt64(reduced_loop_info->cond_val_min & reduced_loop_info->cond_val_mask), loop_dictator_before_adjustment->getType()); + } + else + { + spu_opcode_t reg_target2{}; + reg_target2.rt = reduced_loop_info->cond_val_register_argument_idx; + + if (reg_target2.rt != reduced_loop_info->cond_val_register_argument_idx) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Illegal condition arguemnt register index: 0x%llx", reduced_loop_info->cond_val_register_argument_idx); + } + + switch (reduced_loop_info->cond_val_mask) + { + case u8{umax}: + { + loop_argument = get_scalar(get_vr(reg_target2.rt)).eval(m_ir); + break; + } + case u16{umax}: + { + loop_argument = get_scalar(get_vr(reg_target2.rt)).eval(m_ir); + break; + } + case u32{umax}: + { + loop_argument = get_scalar(get_vr(reg_target2.rt)).eval(m_ir); + break; + } + case u64{umax}: + { + ensure(false); // TODO + loop_argument = get_scalar(get_vr(reg_target2.rt)).eval(m_ir); + break; + } + } + } + + llvm::Value* condition = nullptr; + + if (reserve_iterations == 1) + { + condition = m_ir->CreateICmp(compare, loop_dictator_after_adjustment, loop_argument); + } + // else if ((reduced_loop_info->cond_val_compare == CMP_LGREATER || (reduced_loop_info->cond_val_compare == CMP_LGREATER_EQUAL && reduced_loop_info->cond_val_is_immediate && reduced_loop_info->cond_val_incr)) && cond_val_incr->getSExtValue() < 0) + // { + // const auto cond_val_incr_multiplied = m_ir->CreateMul(cond_val_incr, reserve_iterations - 1); + // condition = m_ir->CreateICmp(compare, select(m_ir->CreateICmpUGE(cond_val_incr_multiplied, loop_dictator_after_adjustment), m_ir->CreateAdd(loop_dictator_after_adjustment, cond_val_incr_multiplied), m_ir->getIntN(type_bits, 0)), loop_argument); + // } + else + { + //debugtrap(); + + llvm::Value* prev_it = loop_dictator_after_adjustment; + + for (u32 i = 0; i < reserve_iterations; i++) + { + if (i) + { + prev_it = m_ir->CreateAdd(prev_it, cond_val_incr); + } + + const auto also_cond = m_ir->CreateICmp(compare, prev_it, loop_argument); + condition = condition ? m_ir->CreateAnd(condition, also_cond) : also_cond; + } + } + + if (!is_second_time) + { + for (u32 i = 0, count = 0, prev_i = umax;; i++) + { + const bool is_last = !(count <= 20 && i < s_reg_max); + + if (is_last || m_block->is_gpr_not_NaN_hint(i)) + { + count++; + + if (prev_i == umax) + { + if (!is_last) + { + prev_i = i; + continue; + } + + break; + } + + auto access_gpr = [&](u32 index) + { + spu_opcode_t op_arg{}; + op_arg.ra = index; + return get_vr(op_arg.ra); + }; + + // OR LSB to convert infinity to NaN + llvm::Value* arg1 = bitcast(access_gpr(prev_i) | splat(1)).eval(m_ir); + llvm::Value* arg2 = is_last ? arg1 : bitcast(access_gpr(i) | splat(1)).eval(m_ir); + + llvm::Value* acc = m_ir->CreateSExt(m_ir->CreateFCmpUNO(arg1, arg2), get_type()); + + // Pattern for PTEST + acc = m_ir->CreateBitCast(acc, get_type()); + + llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); + + for (u64 i = 1; i < 2; i++) + { + elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); + } + + // Compare result with zero + const auto cond_nans = m_ir->CreateICmpEQ(elem, m_ir->getInt64(0)); + condition = m_ir->CreateAnd(cond_nans, condition); + prev_i = umax; + } + } + } + + //condition = m_ir->getInt1(0); + + m_ir->CreateCondBr(condition, optimization_block, block_optimization_next); + }; + + if (is_reduced_loop) + { + for (u32 i = 0; i < s_reg_max; i++) + { + llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type() : get_reg_type(i); + + if (i < reduced_loop_info->loop_dicts.size() && (reduced_loop_info->loop_dicts.test(i) || reduced_loop_info->loop_writes.test(i))) + { + // Connect registers which are used and then modified by the block + auto value = m_block->reg[i]; + + if (!value || value->getType() != type) + { + value = get_reg_fixed(i, type); + } + + reduced_loop_init_regs[i] = value; + } + else if (i < reduced_loop_info->loop_dicts.size() && reduced_loop_info->loop_args.test(i)) + { + // Load registers used as arguments of the loop + if (!m_block->reg[i]) + { + m_block->reg[i] = get_reg_fixed(i, type); + } + } + } + + const auto prev_insert_block = m_ir->GetInsertBlock(); + + block_optimization_phi_parent = prev_insert_block; + + make_reduced_loop_condition(block_optimization_inner, false, 2); + m_ir->SetInsertPoint(block_optimization_inner); + + for (u32 i = 0; i < s_reg_max; i++) + { + if (auto init_val = reduced_loop_init_regs[i]) + { + llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type() : get_reg_type(i); + + const auto _phi = m_ir->CreatePHI(init_val->getType(), 2, fmt::format("reduced_0x%05x_r%u", baddr, i)); + _phi->addIncoming(init_val, prev_insert_block); + + reduced_loop_phi_nodes[i] = _phi; + m_block->reg[i] = _phi; + } + } + + m_block->block_wide_reg_store_elimination = true; + } + + // Instructions emitting optimizations: Loop iteration is not the last + m_pos = baddr; + + // Masked opcodde -> register modification times + std::map>> masked_times; + std::array reg_states{}; + u32 s_reg_state{1}; + + for (u32 iteration_emit = 0; is_reduced_loop; m_pos += 4) + { + if (m_pos != baddr && m_block_info[m_pos / 4] && reduced_loop_info->loop_end < m_pos) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(1) too early at 0x%x", m_pos); + } + + if (!(m_pos >= start && m_pos < end)) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(2) too early at 0x%x", m_pos); + } + + if (m_ir->GetInsertBlock()->getTerminator()) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: Exit(3) too early at 0x%x", m_pos); + } + + const u32 op = std::bit_cast>(func.data[(m_pos - start) / 4]); + const auto itype = g_spu_itype.decode(op); + + if (itype & spu_itype::branch) + { + bool branches_back = false; + + for (u32 dest : op_branch_targets(m_pos, spu_opcode_t{op})) + { + branches_back = branches_back || dest == baddr; + } + + if (!branches_back) + { + continue; + } + + iteration_emit++; + + if (iteration_emit < 2) + { + // Reset mpos (with fixup) + m_pos = baddr - 4; + continue; + } + + // Optimization block body + const auto block_inner = m_ir->GetInsertBlock(); + + std::array block_reg_results{}; + + for (u32 i = 0; i < s_reg_max; i++) + { + if (auto phi = reduced_loop_phi_nodes[i]) + { + const auto type = phi->getType() == get_type() ? get_type() : get_reg_type(i); + block_reg_results[i] = ensure(get_reg_fixed(i, type)); + phi->addIncoming(block_reg_results[i], block_inner); + } + } + + ensure(!!m_block->reg[reduced_loop_info->cond_val_register_idx]); + make_reduced_loop_condition(block_optimization_inner, true, 2); + m_ir->SetInsertPoint(block_optimization_next); + m_block->block_wide_reg_store_elimination = false; + + for (u32 i = 0; i < s_reg_max; i++) + { + if (const auto loop_value = block_reg_results[i]) + { + const auto phi = m_ir->CreatePHI(loop_value->getType(), 2, fmt::format("redres_0x%05x_r%u", baddr, i)); + + phi->addIncoming(loop_value, block_inner); + phi->addIncoming(reduced_loop_init_regs[i], block_optimization_phi_parent); + m_block->reg[i] = phi; + } + } + + + break; + } + + if (!op) + { + fmt::throw_exception("LLVM: Reduced Loop Pattern: [%s] Unexpected fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_hash, m_pos, m_entry, m_function_queue[0]); + } + + const auto [reg_rt, reg_access, masked_op] = op_register_targets(m_pos, spu_opcode_t{op}); + + bool erased = false; + + const auto inst_times = std::array{reg_states[reg_access[0]], reg_states[reg_access[1]], reg_states[reg_access[2]]}; + + // Try to reuse the reult of the previous iteration (if argumnent registers have not been modified) + if (reg_rt < 128 && masked_times.count(masked_op) && masked_times[masked_op].first && m_inst_attrs[(m_pos - start) / 4] == inst_attr::none) + { + auto times = masked_times[masked_op].second; + + bool is_ok = true; + for (u32 regi = 0; regi < 3; regi++) + { + if (reg_access[regi] < 128 && times[regi] != inst_times[regi]) + { + is_ok = false; + } + } + + if (is_ok) + { + m_block->reg[reg_rt] = masked_times[masked_op].first; + erased = true; + } + } + + if (reg_rt < 128) + { + reg_states[reg_rt] = s_reg_state++; + } + + if (erased) + { + continue; + } + + m_next_op = 0; + + masked_times[masked_op] = {}; + + switch (m_inst_attrs[(m_pos - start) / 4]) + { + case inst_attr::putllc0: + { + putllc0_pattern(func, m_patterns.at(m_pos - start).info); + continue; + } + case inst_attr::putllc16: + { + putllc16_pattern(func, m_patterns.at(m_pos - start).info); + continue; + } + case inst_attr::omit: + { + // TODO + continue; + } + default: break; + } + + // Execute recompiler function (TODO) + (this->*decode(op))({op}); + + if (reg_rt < 128 && itype & spu_itype::pure && reg_rt != reg_access[0] && reg_rt != reg_access[1] && reg_rt != reg_access[2]) + { + masked_times[masked_op] = {ensure(m_block->reg[reg_rt]), inst_times}; + } + } + // Emit instructions for (m_pos = baddr; m_pos >= start && m_pos < end && !m_ir->GetInsertBlock()->getTerminator(); m_pos += 4) { @@ -7824,13 +8354,51 @@ public: } } - value_t addr = eval(zext(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm(op.si10) << 4)); + const auto a = get_vr(op.ra); + + if (auto [ok, x, y] = match_expr(a, match() + match()); ok) + { + if (auto [ok1, data] = get_const_vector(x.value, m_pos + 1); ok1 && data._u32[3] % 16 == 0) + { + value_t addr = eval(zext(extract(y, 3) & 0x3fff0) + ((get_imm(op.si10) << 4) + splat(data._u32[3] & 0x3fff0))); + make_store_ls(addr, get_vr(op.rt)); + return; + } + + if (auto [ok2, data] = get_const_vector(y.value, m_pos + 2); ok2 && data._u32[3] % 16 == 0) + { + value_t addr = eval(zext(extract(x, 3) & 0x3fff0) + ((get_imm(op.si10) << 4) + splat(data._u32[3] & 0x3fff0))); + make_store_ls(addr, get_vr(op.rt)); + return; + } + } + + value_t addr = eval(zext(extract(a, 3) & 0x3fff0) + (get_imm(op.si10) << 4)); make_store_ls(addr, get_vr(op.rt)); } void LQD(spu_opcode_t op) { - value_t addr = eval(zext(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm(op.si10) << 4)); + const auto a = get_vr(op.ra); + + if (auto [ok, x1, y1] = match_expr(a, match() + match()); ok) + { + if (auto [ok1, data] = get_const_vector(x1.value, m_pos + 1); ok1 && data._u32[3] % 16 == 0) + { + value_t addr = eval(zext(extract(y1, 3) & 0x3fff0) + ((get_imm(op.si10) << 4) + splat(data._u32[3] & 0x3fff0))); + set_vr(op.rt, make_load_ls(addr)); + return; + } + + if (auto [ok2, data] = get_const_vector(y1.value, m_pos + 2); ok2 && data._u32[3] % 16 == 0) + { + value_t addr = eval(zext(extract(x1, 3) & 0x3fff0) + ((get_imm(op.si10) << 4) + splat(data._u32[3] & 0x3fff0))); + set_vr(op.rt, make_load_ls(addr)); + return; + } + } + + value_t addr = eval(zext(extract(a, 3) & 0x3fff0) + (get_imm(op.si10) << 4)); set_vr(op.rt, make_load_ls(addr)); } diff --git a/rpcs3/Emu/Cell/SPUOpcodes.h b/rpcs3/Emu/Cell/SPUOpcodes.h index 93e56e48da..42d76792a2 100644 --- a/rpcs3/Emu/Cell/SPUOpcodes.h +++ b/rpcs3/Emu/Cell/SPUOpcodes.h @@ -56,6 +56,7 @@ constexpr u32 spu_decode(u32 inst) } std::array op_branch_targets(u32 pc, spu_opcode_t op); +std::tuple, u32> op_register_targets(u32 /*pc*/, spu_opcode_t op); // SPU decoder object. D provides functions. T is function pointer type returned. template diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index b9a77d7696..e4346bc3df 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -495,7 +495,8 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write); extern thread_local u64 g_tls_fault_spu; -const spu_decoder s_spu_itype; +const extern spu_decoder g_spu_itype; +const extern spu_decoder g_spu_iflag; namespace vm { @@ -598,7 +599,7 @@ std::array op_branch_targets(u32 pc, spu_opcode_t op) { std::array res{spu_branch_target(pc + 4), umax}; - switch (const auto type = s_spu_itype.decode(op.opcode)) + switch (const auto type = g_spu_itype.decode(op.opcode)) { case spu_itype::BR: case spu_itype::BRA: @@ -639,6 +640,54 @@ std::array op_branch_targets(u32 pc, spu_opcode_t op) return res; } +std::tuple, u32> op_register_targets(u32 /*pc*/, spu_opcode_t op) +{ + std::tuple, u32> result{u32{umax}, std::array{128, 128, 128}, op.opcode}; + + const auto type = g_spu_itype.decode(op.opcode); + + if (type & spu_itype::zregmod) + { + std::get<2>(result) = 0; + return result; + } + + std::get<0>(result) = type & spu_itype::_quadrop ? op.rt4 : op.rt; + + spu_opcode_t op_masked = op; + + if (type & spu_itype::_quadrop) + { + op_masked.rt4 = 0; + } + else + { + op_masked.rt = 0; + } + + std::get<2>(result) = op_masked.opcode; + + if (auto iflags = g_spu_iflag.decode(op.opcode)) + { + if (+iflags & +spu_iflag::use_ra) + { + std::get<1>(result)[0] = op.ra; + } + + if (+iflags & +spu_iflag::use_rb) + { + std::get<1>(result)[1] = op.rb; + } + + if (+iflags & +spu_iflag::use_rc) + { + std::get<1>(result)[2] = op.rc; + } + } + + return result; +} + void spu_int_ctrl_t::set(u64 ints) { // leave only enabled interrupts @@ -988,7 +1037,7 @@ std::vector> spu_thread::dump_callstack_list() const passed[i / 4] = true; const spu_opcode_t op{_ref(i)}; - const auto type = s_spu_itype.decode(op.opcode); + const auto type = g_spu_itype.decode(op.opcode); if (start == 0 && type == spu_itype::STQD && op.ra == 1u && op.rt == 0u) { @@ -3761,7 +3810,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span ls_ptr, u32 base_add const u32 addr0 = spu_branch_target(addr); const spu_opcode_t op{read_from_ptr>(ls_ptr, addr0 - base_addr)}; - const auto type = s_spu_itype.decode(op.opcode); + const auto type = g_spu_itype.decode(op.opcode); if (type == spu_itype::UNK || !op.opcode) { @@ -3907,7 +3956,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span ls_ptr, u32 base_add // Test the validity of a single instruction of the optional target // This function can't be too slow and is unlikely to improve results by a great deal const u32 op0 = read_from_ptr>(ls_ptr, route_pc - base_addr); - const spu_itype::type type0 = s_spu_itype.decode(op0); + const spu_itype::type type0 = g_spu_itype.decode(op0); if (type0 == spu_itype::UNK || !op0) { @@ -6878,7 +6927,7 @@ spu_exec_object spu_thread::capture_memory_as_elf(std::span>(all_data, pc0 - 4); // Try to find function entry (if they are placed sequentially search for BI $LR of previous function) - if (!op || op == 0x35000000u || s_spu_itype.decode(op) == spu_itype::UNK) + if (!op || op == 0x35000000u || g_spu_itype.decode(op) == spu_itype::UNK) { if (is_exec_code(pc0, { all_data.data(), SPU_LS_SIZE })) break; From 9fb8ee7e2af5932a27c0b6a73f336a7d545b3640 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Tue, 24 Feb 2026 15:16:25 +0200 Subject: [PATCH 03/11] SPU LLVM: CEQHI pattern --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 48 ++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 103f48085d..59f43cd6dd 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -6286,11 +6286,59 @@ public: void CEQI(spu_opcode_t op) { + // CEQHI following a comparison instruction (compare-equal negation) + if (!m_interp_magn && !op.si10 && match_vr(op.ra, [&](auto c, auto MT) + { + using VT = typename decltype(MT)::type; + using VT_HALF = s16[8]; + + if (auto [ok, a, b] = match_expr(c, bitcast(sext(match() == match())) << 16 >> 16); ok && m_block->block_wide_reg_store_elimination) + { + set_vr(op.rt, bitcast(sext(a != b)) << 16 >> 16); + return true; + } + + if (auto [ok, a, b] = match_expr(c, sext(MT == MT)); ok) + { + set_vr(op.rt, sext(a != b)); + return true; + } + + return false; + })) + { + return; + } + set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); } void CEQHI(spu_opcode_t op) { + // CEQHI following a comparison instruction (compare-equal negation) + if (!m_interp_magn && !op.si10 && match_vr(op.ra, [&](auto c, auto MT) + { + using VT = typename decltype(MT)::type; + using VT_HALF = s8[16]; + + if (auto [ok, a, b] = match_expr(c, bitcast(sext(match() == match())) << 8 >> 8); ok && m_block->block_wide_reg_store_elimination) + { + set_vr(op.rt, bitcast(sext(a != b)) << 8 >> 8); + return true; + } + + if (auto [ok, a, b] = match_expr(c, sext(match() == match())); ok) + { + set_vr(op.rt, sext(a != b)); + return true; + } + + return false; + })) + { + return; + } + set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); } From af98aedce53b908c9579651ea1757190bf4db65c Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Wed, 25 Feb 2026 16:56:35 +0200 Subject: [PATCH 04/11] SPU LLVM: Allow the reuse of address from LQX/STQX --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 59f43cd6dd..71217fb0bf 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -8319,12 +8319,18 @@ public: { data._u32[3] %= SPU_LS_SIZE; - if (data._u32[3] % 0x10 == 0) + if (const u32 remainder = data._u32[3] % 0x10; remainder == 0) { value_t addr = eval(splat(data._u32[3]) + zext(extract(pair.second, 3) & 0x3fff0)); make_store_ls(addr, get_vr(op.rt)); return; } + else + { + value_t addr = eval(splat(data._u32[3] - remainder) + zext((extract(pair.second, 3) + remainder) & 0x3fff0)); + make_store_ls(addr, get_vr(op.rt)); + return; + } } } @@ -8343,12 +8349,18 @@ public: { data._u32[3] %= SPU_LS_SIZE; - if (data._u32[3] % 0x10 == 0) + if (const u32 remainder = data._u32[3] % 0x10; remainder == 0) { value_t addr = eval(splat(data._u32[3]) + zext(extract(pair.second, 3) & 0x3fff0)); set_vr(op.rt, make_load_ls(addr)); return; } + else + { + value_t addr = eval(splat(data._u32[3] - remainder) + zext((extract(pair.second, 3) + remainder) & 0x3fff0)); + set_vr(op.rt, make_load_ls(addr)); + return; + } } } From 4d2537a6f4cb533a3d4826c56d129e3e85a11097 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Sun, 8 Mar 2026 14:54:58 +0200 Subject: [PATCH 05/11] Add floating ppoint hint --- rpcs3/Emu/Cell/SPUAnalyser.h | 24 ++++++++++++------------ rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 21 +++++++++++++-------- rpcs3/Emu/Cell/SPURecompiler.h | 7 +++++-- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUAnalyser.h b/rpcs3/Emu/Cell/SPUAnalyser.h index 59560cd182..a6c788faad 100644 --- a/rpcs3/Emu/Cell/SPUAnalyser.h +++ b/rpcs3/Emu/Cell/SPUAnalyser.h @@ -158,6 +158,15 @@ struct spu_itype CUFLT, FRDS, // xfloat_tag last + CFLTS, + CFLTU, + FCEQ, + FCMEQ, + FCGT, + FCMGT, // floating_tag last + FSCRWR, + FSCRRD, + DFA, DFS, DFM, @@ -167,20 +176,11 @@ struct spu_itype DFNMA, FESD, - CFLTS, - CFLTU, - FCEQ, - FCMEQ, - FCGT, - FCMGT, - FSCRWR, - FSCRRD, - DFCEQ, DFCMEQ, DFCGT, DFCMGT, - DFTSV, // floating_tag last + DFTSV, SHLH, // shiftrot_tag first SHLHI, @@ -248,10 +248,10 @@ struct spu_itype return value >= BR && value <= BISL; } - // Test for floating point instruction + // Test for floating point instruction (32-bit float) friend constexpr bool operator &(type value, floating_tag) { - return value >= FMA && value <= DFTSV; + return value >= FMA && value <= FCMGT; } // Test for 4-op instruction diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index eff272753a..9da576f009 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -3933,7 +3933,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s const auto type = g_spu_itype.decode(op.opcode); - u8 reg_save = 255; + u8 reg_save = s_reg_max; if (type == spu_itype::STQD && op.ra == s_reg_sp && !block.reg_mod[op.rt] && !block.reg_use[op.rt]) { @@ -3953,7 +3953,12 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s // Register reg use only if it happens before reg mod if (!block.reg_mod[reg]) { - block.reg_use.set(reg); + if (type & spu_itype::floating) + { + block.reg_maybe_float.set(reg); + } + + block.reg_use[reg]++; if (reg_save != reg && block.reg_save_dom[reg]) { @@ -3970,7 +3975,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s for (u8 reg : {s_reg_mfc_lsa, s_reg_mfc_tag, s_reg_mfc_size}) { if (!block.reg_mod[reg]) - block.reg_use.set(reg); + block.reg_use[reg]++; } } @@ -4024,7 +4029,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (i == s_reg_lr || (i >= 2 && i < s_reg_80) || i > s_reg_127) { if (!block.reg_mod[i]) - block.reg_use.set(i); + block.reg_use[i]++; if (!is_tail) { @@ -6353,9 +6358,9 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (!reduced_loop->loop_dicts.test(i)) { - if (b.reg_use.test(i) || (!b.reg_mod.test(i) && b2.reg_use.test(i))) + if (b.reg_use[i] || (!b.reg_mod.test(i) && b2.reg_use[i])) { - if ((b.reg_use.test(i) && b.reg_mod.test(i)) || b2.reg_mod.test(i)) + if ((b.reg_use[i] && b.reg_mod.test(i)) || b2.reg_mod.test(i)) { reduced_loop->is_constant_expression = false; reduced_loop->loop_writes.set(i); @@ -7032,9 +7037,9 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (!reduced_loop->loop_dicts.test(i)) { - if (b.reg_use.test(i) || (!b.reg_mod.test(i) && b2.reg_use.test(i))) + if (b.reg_use[i] || (!b.reg_mod.test(i) && b2.reg_use[i])) { - if ((b.reg_use.test(i) && b.reg_mod.test(i)) || b2.reg_mod.test(i)) + if ((b.reg_use[i] && b.reg_mod.test(i)) || b2.reg_mod.test(i)) { reduced_loop->is_constant_expression = false; reduced_loop->loop_writes.set(i); diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 4348e7e08a..00d89127d6 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -739,8 +739,11 @@ protected: // Set if the initial register value in this block may be xfloat std::bitset reg_maybe_xf{}; - // Bit mask of the registers used (before modified) - std::bitset reg_use{}; + // Set if register is used in floating pont instruction + std::bitset reg_maybe_float{}; + + // Number of times registers are used (before modified) + std::array reg_use{}; // Bit mask of the trivial (u32 x 4) constant value resulting in this block std::bitset reg_const{}; From eb4c4d3d22bf79bfd725bbcfe225a14d04e1f9a0 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Sun, 8 Mar 2026 15:36:58 +0200 Subject: [PATCH 06/11] SPU LLVM: Optimize FM, FMA and FCGT in Reduced Loop --- rpcs3/Emu/CPU/CPUTranslator.h | 32 ++++++ rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 5 + rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 130 ++++++++++++++++--------- rpcs3/Emu/Cell/SPURecompiler.h | 3 + 4 files changed, 124 insertions(+), 46 deletions(-) diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 4a0028c571..738932808d 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -567,6 +567,32 @@ struct llvm_placeholder_t } }; +template >> +struct llvm_place_stealer_t +{ + // TODO: placeholder extracting actual constant values (u64, f64, vector, etc) + + using type = T; + + static constexpr bool is_ok = true; + + llvm::Value* eval(llvm::IRBuilder<>*) const + { + return nullptr; + } + + std::tuple<> match(llvm::Value*& value, llvm::Module*) const + { + if (value && value->getType() == llvm_value_t::get_type(value->getContext())) + { + return {}; + } + + value = nullptr; + return {}; + } +}; + template struct llvm_const_int { @@ -3227,6 +3253,12 @@ public: return {}; } + template + static llvm_place_stealer_t match_stealer() + { + return {}; + } + template requires requires { typename llvm_common_t; } static auto match_expr(llvm::Value* v, llvm::Module* _m, T&& expr) diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 9da576f009..96d4afa95e 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -3958,6 +3958,11 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s block.reg_maybe_float.set(reg); } + if (type == spu_itype::SHUFB && reg == op.rc) + { + block.reg_maybe_shuffle_mask.set(reg); + } + block.reg_use[reg]++; if (reg_save != reg && block.reg_save_dom[reg]) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 71217fb0bf..baecbd3135 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -175,6 +175,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const usz first_id = store_context_first_id[i]; return counter != 1 && first_id != umax && counter < first_id; } + + bool is_gpr_not_NaN_hint(u32 i) const noexcept + { + return block_wide_reg_store_elimination && bb->reg_maybe_float[i] && bb->reg_use[i] >= 3 && !bb->reg_mod[i]; + } }; struct function_info @@ -1692,7 +1697,7 @@ public: // Emit state check const auto pstate = spu_ptr(&spu_thread::state); - m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pstate), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); + m_ir->CreateCondBr(m_ir->CreateICmpNE(spu_context_attr(m_ir->CreateLoad(get_type(), pstate)), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); // Emit code check u32 check_iterations = 0; @@ -6915,8 +6920,13 @@ public: return eval(bitcast(min(bitcast(v),splat(0xff7fffff)))); } - value_t clamp_smax(value_t v) + value_t clamp_smax(value_t v, u32 gpr = s_reg_max) { + if (m_block && gpr < s_reg_max && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(gpr)) + { + return v; + } + if (m_use_avx512) { if (is_input_positive(v)) @@ -6936,16 +6946,6 @@ public: return eval(clamp_positive_smax(clamp_negative_smax(v))); } - // FMA favouring zeros - value_t xmuladd(value_t a, value_t b, value_t c) - { - const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); - const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); - const auto ca = eval(bitcast(bitcast(a) & mb)); - const auto cb = eval(bitcast(bitcast(b) & ma)); - return eval(fmuladd(ca, cb, c)); - } - // Checks for postive and negative zero, or Denormal (treated as zero) // If sign is +-1 check equality againts all sign bits bool is_spu_float_zero(v128 a, int sign = 0) @@ -7032,12 +7032,6 @@ public: set_vr(op.rt, frsqest(get_vr(op.ra))); } - template - static llvm_calli fcgt(T&& a, U&& b) - { - return {"spu_fcgt", {std::forward(a), std::forward(b)}}; - } - void FCGT(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) @@ -7046,11 +7040,8 @@ public: return; } - register_intrinsic("spu_fcgt", [&](llvm::CallInst* ci) + const auto fcgt = [&](value_t a, value_t b) { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - const value_t ab[2]{a, b}; std::bitset<2> safe_int_compare(0); @@ -7082,6 +7073,16 @@ public: } } + if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra)) + { + safe_finite_compare.set(0); + } + + if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb)) + { + safe_finite_compare.set(1); + } + if (safe_int_compare.any()) { return eval(sext(bitcast(a) > bitcast(b))); @@ -7101,7 +7102,7 @@ public: const auto bi = eval(bitcast(b)); return eval(sext(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi))); - }); + }; set_vr(op.rt, fcgt(get_vr(op.ra), get_vr(op.rb))); } @@ -7198,12 +7199,6 @@ public: set_vr(op.rt, fa(get_vr(op.ra), get_vr(op.rb))); } - template - static llvm_calli fs(T&& a, U&& b) - { - return {"spu_fs", {std::forward(a), std::forward(b)}}; - } - void FS(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) @@ -7212,29 +7207,26 @@ public: return; } - register_intrinsic("spu_fs", [&](llvm::CallInst* ci) + const auto fs = [&](value_t a, value_t b) { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { - const auto bc = clamp_smax(b); // for #4478 + const auto bc = clamp_smax(b, op.rb); // for #4478 return eval(a - bc); } else { return eval(a - b); } - }); + }; set_vr(op.rt, fs(get_vr(op.ra), get_vr(op.rb))); } - template - static llvm_calli fm(T&& a, U&& b) + template , typename W = llvm_place_stealer_t> + static auto fm(T&& a, U&& b, V&& a_not_nan = match_stealer(), W&& b_not_nan = match_stealer()) { - return llvm_calli{"spu_fm", {std::forward(a), std::forward(b)}}.set_order_equality_hint(1, 1); + return llvm_calli{"spu_fm", {std::forward(a), std::forward(b), a_not_nan, b_not_nan}}.set_order_equality_hint(1, 1, 2, 3); } void FM(spu_opcode_t op) @@ -7249,14 +7241,27 @@ public: { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); + const bool a_notnan = llvm::cast(ci->getOperand(2))->getZExtValue() != 0; + const bool b_notnan = llvm::cast(ci->getOperand(3))->getZExtValue() != 0; if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { - if (a.value == b.value) + if (a.value == b.value || (a_notnan && b_notnan)) { return eval(a * b); } + if (a_notnan) + { + const auto ma = sext(fcmp_uno(a != fsplat(0.))); + return eval(bitcast(bitcast(a * b) & ma)); + } + else if (b_notnan) + { + const auto mb = sext(fcmp_uno(b != fsplat(0.))); + return eval(bitcast(bitcast(a * b) & mb)); + } + const auto ma = sext(fcmp_uno(a != fsplat(0.))); const auto mb = sext(fcmp_uno(b != fsplat(0.))); return eval(bitcast(bitcast(a * b) & ma & mb)); @@ -7267,10 +7272,13 @@ public: } }); + const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0; + const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0; + if (op.ra == op.rb && !m_interp_magn) { const auto a = get_vr(op.ra); - set_vr(op.rt, fm(a, a)); + set_vr(op.rt, fm(a, a, splat(a_notnan), splat(a_notnan))); return; } @@ -7309,7 +7317,7 @@ public: } } - set_vr(op.rt, fm(a, b)); + set_vr(op.rt, fm(a, b, splat(a_notnan), splat(b_notnan))); } template @@ -7602,10 +7610,10 @@ public: set_vr(op.rt4, fnms(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); } - template - static llvm_calli fma(T&& a, U&& b, V&& c) + template , typename X = llvm_place_stealer_t> + static llvm_calli fma(T&& a, U&& b, V&& c, W&& d = match_stealer(), X&& e = match_stealer()) { - return llvm_calli{"spu_fma", {std::forward(a), std::forward(b), std::forward(c)}}.set_order_equality_hint(1, 1, 0); + return llvm_calli{"spu_fma", {std::forward(a), std::forward(b), std::forward(c), std::forward(d), std::forward(e)}}.set_order_equality_hint(1, 1, 2, 3, 4); } template @@ -7624,14 +7632,35 @@ public: return; } + register_intrinsic("spu_fma", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); const auto c = value(ci->getOperand(2)); - + const bool a_notnan = llvm::cast(ci->getOperand(3))->getZExtValue() != 0; + const bool b_notnan = llvm::cast(ci->getOperand(4))->getZExtValue() != 0; + if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { + if (a.value == b.value || (a_notnan && b_notnan)) + { + return fma32x4(a, b, c); + } + + if (a_notnan) + { + const auto ma = sext(fcmp_uno(a != fsplat(0.))); + const auto cb = bitcast(bitcast(b) & ma); + return fma32x4(a, eval(cb), c); + } + else if (b_notnan) + { + const auto mb = sext(fcmp_uno(b != fsplat(0.))); + const auto ca = bitcast(bitcast(a) & mb); + return fma32x4(eval(ca), b, c); + } + const auto ma = sext(fcmp_uno(a != fsplat(0.))); const auto mb = sext(fcmp_uno(b != fsplat(0.))); const auto ca = bitcast(bitcast(a) & mb); @@ -7680,6 +7709,9 @@ public: const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); static const auto MT = match(); + const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0; + const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0; + auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool { auto match_fnms = [&](f32 float_value) @@ -7875,7 +7907,13 @@ public: spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(c) found in FMA", m_hash, m_pos); } - set_vr(op.rt4, fma(a, b, c)); + if (!m_interp_magn && op.ra == op.rb) + { + set_vr(op.rt4, fma(a, a, c, splat(a_notnan), splat(a_notnan))); + return; + } + + set_vr(op.rt4, fma(a, b, c, splat(a_notnan), splat(b_notnan))); } template diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 00d89127d6..1fd295f2f6 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -742,6 +742,9 @@ protected: // Set if register is used in floating pont instruction std::bitset reg_maybe_float{}; + // Set if register is used as shuffle mask + std::bitset reg_maybe_shuffle_mask{}; + // Number of times registers are used (before modified) std::array reg_use{}; From 2f701019dd32b00fa9af40f3b84d0ef6b4274fd1 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Tue, 10 Mar 2026 17:13:44 +0200 Subject: [PATCH 07/11] SPU LLVM: Classify SPU memory and context memory instructions --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 65 +++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index baecbd3135..9c543915bf 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -132,6 +132,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator llvm::MDNode* m_md_unlikely; llvm::MDNode* m_md_likely; + llvm::MDNode* m_md_spu_memory_domain; + llvm::MDNode* m_md_spu_context_domain; struct block_info { @@ -555,6 +557,40 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return _ptr(m_thread, ::offset32(offset_args...)); } + template + T* spu_mem_attr(T* inst) + { + if (auto load_inst = llvm::dyn_cast(inst)) + { + load_inst->setMetadata(llvm::LLVMContext::MD_noalias, m_md_spu_context_domain); + load_inst->setMetadata(llvm::LLVMContext::MD_alias_scope, m_md_spu_memory_domain); + } + else if (auto store_inst = llvm::dyn_cast(inst)) + { + store_inst->setMetadata(llvm::LLVMContext::MD_noalias, m_md_spu_context_domain); + store_inst->setMetadata(llvm::LLVMContext::MD_alias_scope, m_md_spu_memory_domain); + } + + return inst; + } + + template + T* spu_context_attr(T* inst) + { + if (auto load_inst = llvm::dyn_cast(inst)) + { + load_inst->setMetadata(llvm::LLVMContext::MD_alias_scope, m_md_spu_context_domain); + load_inst->setMetadata(llvm::LLVMContext::MD_noalias, m_md_spu_memory_domain); + } + else if (auto store_inst = llvm::dyn_cast(inst)) + { + store_inst->setMetadata(llvm::LLVMContext::MD_alias_scope, m_md_spu_context_domain); + store_inst->setMetadata(llvm::LLVMContext::MD_noalias, m_md_spu_memory_domain); + } + + return inst; + } + // Return default register type llvm::Type* get_reg_type(u32 index) { @@ -722,6 +758,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Load register value if necessary reg = m_finfo && m_finfo->load[index] ? m_finfo->load[index] : m_ir->CreateLoad(get_reg_type(index), init_reg_fixed(index)); + spu_context_attr(reg); } if (reg->getType() == get_type()) @@ -955,6 +992,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Write register to the context _store = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, get_reg_type(index)), addr); + + spu_context_attr(_store); } template @@ -1065,7 +1104,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Update PC for current or explicitly specified instruction address void update_pc(u32 target = -1) { - m_ir->CreateStore(m_ir->CreateAnd(get_pc(target + 1 ? target : m_pos), 0x3fffc), spu_ptr(&spu_thread::pc))->setVolatile(true); + spu_context_attr(m_ir->CreateStore(m_ir->CreateAnd(get_pc(target + 1 ? target : m_pos), 0x3fffc), spu_ptr(&spu_thread::pc)))->setVolatile(true); } // Call cpu_thread::check_state if necessary and return or continue (full check) @@ -1074,7 +1113,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto pstate = spu_ptr(&spu_thread::state); const auto _body = llvm::BasicBlock::Create(m_context, "", m_function); const auto check = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(get_type(), pstate, true), m_ir->getInt32(0)), _body, check, m_md_likely); + m_ir->CreateCondBr(m_ir->CreateICmpEQ(spu_context_attr(m_ir->CreateLoad(get_type(), pstate, true)), m_ir->getInt32(0)), _body, check, m_md_likely); m_ir->SetInsertPoint(check); update_pc(addr); @@ -1085,14 +1124,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (may_be_unsafe_for_savestate) { - m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable))->setVolatile(true); + spu_context_attr(m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable)))->setVolatile(true); } m_ir->CreateCall(m_test_state, {m_thread}); if (may_be_unsafe_for_savestate) { - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable))->setVolatile(true); + spu_context_attr(m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable)))->setVolatile(true); } m_ir->CreateBr(_body); @@ -1528,6 +1567,16 @@ public: m_md_likely = llvm::MDTuple::get(m_context, {md_name, md_high, md_low}); m_md_unlikely = llvm::MDTuple::get(m_context, {md_name, md_low, md_high}); + const auto domain = llvm::MDNode::getDistinct(m_context, {llvm::MDString::get(m_context, "SPU_mem")}); + const auto scope = llvm::MDNode::get(m_context, {llvm::MDString::get(m_context, "SPU_mem_scope"), domain}); + + m_md_spu_memory_domain = llvm::MDNode::get(m_context, scope); + + const auto domain2 = llvm::MDNode::getDistinct(m_context, {llvm::MDString::get(m_context, "SPU_ctx")}); + const auto scope2 = llvm::MDNode::get(m_context, {llvm::MDString::get(m_context, "SPU_ctx_scope"), domain2}); + + m_md_spu_context_domain = llvm::MDNode::get(m_context, scope2); + // Initialize transform passes clear_transforms(); #ifdef ARCH_ARM64 @@ -3175,6 +3224,8 @@ public: m_ir->SetInsertPoint(ins); auto si = llvm::cast(m_ir->Insert(bs->clone())); + spu_context_attr(si); + if (b2->store[i] == nullptr) { // Protect against backwards ordering now @@ -3240,7 +3291,7 @@ public: continue; m_ir->SetInsertPoint(ins); - m_ir->Insert(bs->clone()); + m_ir->Insert(spu_context_attr(bs->clone())); } bs->eraseFromParent(); @@ -8336,13 +8387,13 @@ public: void make_store_ls(value_t addr, value_t data) { const auto bswapped = byteswap(data); - m_ir->CreateStore(bswapped.eval(m_ir), _ptr(m_lsptr, addr.value)); + spu_mem_attr(m_ir->CreateStore(bswapped.eval(m_ir), _ptr(m_lsptr, addr.value))); } auto make_load_ls(value_t addr) { value_t data; - data.value = m_ir->CreateLoad(get_type(), _ptr(m_lsptr, addr.value)); + data.value = spu_mem_attr(m_ir->CreateLoad(get_type(), _ptr(m_lsptr, addr.value))); return byteswap(data); } From b088dcbb95839345719ad105ed3121814b2712f5 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Wed, 11 Mar 2026 16:11:45 +0200 Subject: [PATCH 08/11] SPU Debugger: Measure rate of block ran --- rpcs3/Emu/CPU/CPUThread.cpp | 7 ++-- rpcs3/Emu/CPU/CPUThread.h | 2 +- rpcs3/Emu/Cell/PPUThread.cpp | 9 ++--- rpcs3/Emu/Cell/PPUThread.h | 2 +- rpcs3/Emu/Cell/SPUThread.cpp | 59 +++++++++++++++++++++++++++++--- rpcs3/Emu/Cell/SPUThread.h | 2 +- rpcs3/Emu/RSX/RSXThread.cpp | 6 ++-- rpcs3/Emu/RSX/RSXThread.h | 2 +- rpcs3/rpcs3qt/debugger_frame.cpp | 5 ++- rpcs3/rpcs3qt/debugger_frame.h | 2 ++ 10 files changed, 73 insertions(+), 23 deletions(-) diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp index 3ab011aa04..eb6f3498a2 100644 --- a/rpcs3/Emu/CPU/CPUThread.cpp +++ b/rpcs3/Emu/CPU/CPUThread.cpp @@ -1322,8 +1322,9 @@ extern std::shared_ptr make_disasm(const cpu_thread* cpu, shared_ptr< void cpu_thread::dump_all(std::string& ret) const { std::any func_data; + std::any misc_data; - ret += dump_misc(); + dump_misc(ret, misc_data); ret += '\n'; dump_regs(ret, func_data); ret += '\n'; @@ -1371,9 +1372,9 @@ std::vector> cpu_thread::dump_callstack_list() const return {}; } -std::string cpu_thread::dump_misc() const +void cpu_thread::dump_misc(std::string& ret, std::any& /*custom_data*/) const { - return fmt::format("%s[0x%x]; State: %s\n", get_class() == thread_class::ppu ? "PPU" : get_class() == thread_class::spu ? "SPU" : "RSX", id, state.load()); + fmt::append(ret, "%s[0x%x]; State: %s\n", get_class() == thread_class::ppu ? "PPU" : get_class() == thread_class::spu ? "SPU" : "RSX", id, state.load()); } bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept diff --git a/rpcs3/Emu/CPU/CPUThread.h b/rpcs3/Emu/CPU/CPUThread.h index 5e3484f7f5..88995f5e3b 100644 --- a/rpcs3/Emu/CPU/CPUThread.h +++ b/rpcs3/Emu/CPU/CPUThread.h @@ -176,7 +176,7 @@ public: virtual std::vector> dump_callstack_list() const; // Get CPU dump of misc information - virtual std::string dump_misc() const; + virtual void dump_misc(std::string& ret, std::any& /*custom_data*/) const; // Thread entry point function virtual void cpu_task() = 0; diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 4d690b344d..0062bc2825 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1364,9 +1364,7 @@ void ppu_thread::dump_regs(std::string& ret, std::any& custom_data) const u32 preferred_cr_field_index = 7; }; - dump_registers_data_t* func_data = nullptr; - - func_data = std::any_cast(&custom_data); + dump_registers_data_t* func_data = std::any_cast(&custom_data); if (!func_data) { @@ -2039,9 +2037,9 @@ std::vector> ppu_thread::dump_callstack_list() const return call_stack_list; } -std::string ppu_thread::dump_misc() const +void ppu_thread::dump_misc(std::string& ret, std::any& custom_data) const { - std::string ret = cpu_thread::dump_misc(); + cpu_thread::dump_misc(ret, custom_data); if (ack_suspend) { @@ -2096,7 +2094,6 @@ std::string ppu_thread::dump_misc() const { ret += '\n'; } - return ret; } void ppu_thread::dump_all(std::string& ret) const diff --git a/rpcs3/Emu/Cell/PPUThread.h b/rpcs3/Emu/Cell/PPUThread.h index 97c705aed5..cf5b91c487 100644 --- a/rpcs3/Emu/Cell/PPUThread.h +++ b/rpcs3/Emu/Cell/PPUThread.h @@ -145,7 +145,7 @@ public: virtual void dump_regs(std::string&, std::any& custom_data) const override; virtual std::string dump_callstack() const override; virtual std::vector> dump_callstack_list() const override; - virtual std::string dump_misc() const override; + virtual void dump_misc(std::string& ret, std::any& custom_data) const override; virtual void dump_all(std::string&) const override; virtual void cpu_task() override final; virtual void cpu_sleep() override; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index e4346bc3df..60e0f99cca 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -1139,11 +1139,62 @@ std::vector> spu_thread::dump_callstack_list() const return call_stack_list; } -std::string spu_thread::dump_misc() const +void spu_thread::dump_misc(std::string& ret, std::any& custom_data) const { - std::string ret = cpu_thread::dump_misc(); + cpu_thread::dump_misc(ret, custom_data); - fmt::append(ret, "Block Weight: %u (Retreats: %u)", block_counter, block_failure); + struct dump_misc_data_t + { + u32 cpu_id = umax; + u64 last_read_time = umax; + u64 last_block_counter = umax; + u64 update_count = 0; + + std::pair update(u64 current_block_counter, u64 current_timestamp = get_system_time()) + { + const u64 diff_time = current_timestamp <= last_read_time ? 0 : current_timestamp - last_read_time; + const u64 diff_block = current_block_counter <= last_block_counter ? 0 : current_block_counter - last_block_counter; + + if (last_read_time == umax || update_count >= 1000) + { + last_read_time = current_timestamp; + last_block_counter = current_block_counter; + update_count = 0; + } + else if (diff_time >= 100000 && diff_block >= 100) + { + // Update values to measure rate (but not fully so rate can be measured later) + last_read_time += diff_time / 10 * 9; + last_block_counter += diff_block / 10 * 9; + update_count++; + } + + return {diff_time, diff_block}; + } + }; + + dump_misc_data_t* func_data = std::any_cast(&custom_data); + + if (!func_data) + { + custom_data.reset(); + custom_data = std::make_any(); + func_data = ensure(std::any_cast(&custom_data)); + } + + if (func_data->cpu_id != this->id) + { + *func_data = {}; + func_data->cpu_id = this->id; + } + + const u64 current_block_counter = atomic_storage::load(block_counter); + + const auto [diff_time, diff_block] = func_data->update(current_block_counter); + + const u64 rate_of_diff = diff_block ? std::max(1, utils::rational_mul(diff_block, 1'000'000, std::max(diff_time, 1))) : 0; + + fmt::append(ret, "Block Weight: log10(%u/second): %.1f (Retreats: %u)", rate_of_diff, std::log10(std::max(rate_of_diff, 10)), block_failure); if (u64 hash = atomic_storage::load(block_hash)) { @@ -1194,8 +1245,6 @@ std::string spu_thread::dump_misc() const break; } } - - return ret; } void spu_thread::cpu_on_stop() diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index 9596f7b006..889d6f291c 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -630,7 +630,7 @@ public: virtual void dump_regs(std::string&, std::any& custom_data) const override; virtual std::string dump_callstack() const override; virtual std::vector> dump_callstack_list() const override; - virtual std::string dump_misc() const override; + virtual void dump_misc(std::string& ret, std::any& custom_data) const override; virtual void cpu_task() override final; virtual void cpu_on_stop() override; virtual void cpu_return() override; diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 7d634de677..89506095a5 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -2768,9 +2768,9 @@ namespace rsx recovered_fifo_cmds_history.push({fifo_ctrl->last_cmd(), current_time}); } - std::string thread::dump_misc() const + void thread::dump_misc(std::string& ret, std::any& custom_data) const { - std::string ret = cpu_thread::dump_misc(); + cpu_thread::dump_misc(ret, custom_data); const auto flags = +state; @@ -2783,8 +2783,6 @@ namespace rsx { fmt::append(ret, "\n"); } - - return ret; } std::vector> thread::dump_callstack_list() const diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index 4b2de0acc4..f20e27ceae 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -122,7 +122,7 @@ namespace rsx std::unique_ptr fifo_ctrl; atomic_t rsx_thread_running{ false }; std::vector> dump_callstack_list() const override; - std::string dump_misc() const override; + void dump_misc(std::string& ret, std::any& custom_data) const override; protected: FIFO::flattening_helper m_flattener; diff --git a/rpcs3/rpcs3qt/debugger_frame.cpp b/rpcs3/rpcs3qt/debugger_frame.cpp index 0859e19856..db4efa1aae 100644 --- a/rpcs3/rpcs3qt/debugger_frame.cpp +++ b/rpcs3/rpcs3qt/debugger_frame.cpp @@ -1469,8 +1469,11 @@ void debugger_frame::WritePanels(cpu_thread* cpu) int loc = m_misc_state->verticalScrollBar()->value(); int hloc = m_misc_state->horizontalScrollBar()->value(); + + m_last_misc_state.clear(); + cpu->dump_misc(m_last_misc_state, m_dump_misc_func_data); m_misc_state->clear(); - m_misc_state->setPlainText(QString::fromStdString(cpu->dump_misc())); + m_misc_state->setPlainText(QString::fromStdString(m_last_misc_state)); m_misc_state->verticalScrollBar()->setValue(loc); m_misc_state->horizontalScrollBar()->setValue(hloc); diff --git a/rpcs3/rpcs3qt/debugger_frame.h b/rpcs3/rpcs3qt/debugger_frame.h index eb22d74680..abe7ec11cc 100644 --- a/rpcs3/rpcs3qt/debugger_frame.h +++ b/rpcs3/rpcs3qt/debugger_frame.h @@ -69,7 +69,9 @@ class debugger_frame : public custom_dock_widget u32 m_last_pc = -1; std::vector m_last_query_state; std::string m_last_reg_state; + std::string m_last_misc_state; std::any m_dump_reg_func_data; + std::any m_dump_misc_func_data; std::vector> m_threads_info; u32 m_last_step_over_breakpoint = -1; u64 m_ui_update_ctr = 0; From e866e3555dd598451340e46bed8c5d04c1ef71e0 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Mon, 16 Mar 2026 08:18:53 +0200 Subject: [PATCH 09/11] LLVM: Try to reuse BitCasts --- rpcs3/Emu/CPU/CPUTranslator.cpp | 58 +++++++++++++++++++++++++++- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 2 +- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/rpcs3/Emu/CPU/CPUTranslator.cpp b/rpcs3/Emu/CPU/CPUTranslator.cpp index 08e8e9ad30..22413f62b8 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.cpp +++ b/rpcs3/Emu/CPU/CPUTranslator.cpp @@ -225,12 +225,66 @@ llvm::Value* cpu_translator::bitcast(llvm::Value* val, llvm::Type* type) const fmt::throw_exception("cpu_translator::bitcast(): incompatible type sizes (%u vs %u)", s1, s2); } - if (const auto c1 = llvm::dyn_cast(val)) + if (val->getType() == type) + { + return val; + } + + llvm::CastInst* i; + llvm::Value* source_val = val; + + // Try to reuse older bitcasts + while ((i = llvm::dyn_cast_or_null(source_val)) && i->getOpcode() == llvm::Instruction::BitCast) + { + source_val = i->getOperand(0); + + if (source_val->getType() == type) + { + return source_val; + } + } + + for (auto it = source_val->use_begin(); it != source_val->use_end(); ++it) + { + llvm::Value* it_val = *it; + + if (!it_val) + { + continue; + } + + llvm::CastInst* bci = llvm::dyn_cast_or_null(it_val); + + // Walk through bitcasts + while (bci && bci->getOpcode() == llvm::Instruction::BitCast) + { + if (bci->getParent() != m_ir->GetInsertBlock()) + { + break; + } + + if (bci->getType() == type) + { + return bci; + } + + if (bci->use_begin() == bci->use_end()) + { + break; + } + + bci = llvm::dyn_cast_or_null(*bci->use_begin()); + } + } + + // Do bitcast on the source + + if (const auto c1 = llvm::dyn_cast(source_val)) { return ensure(llvm::ConstantFoldCastOperand(llvm::Instruction::BitCast, c1, type, m_module->getDataLayout())); } - return m_ir->CreateBitCast(val, type); + return m_ir->CreateBitCast(source_val, type); } template <> diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 9c543915bf..7511b2ae7b 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -991,7 +991,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } // Write register to the context - _store = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, get_reg_type(index)), addr); + _store = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : bitcast(value, get_reg_type(index)), addr); spu_context_attr(_store); } From 91f4ce12c5ff67da628138f9157d13c7e9e8ab2c Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Thu, 19 Mar 2026 10:49:55 +0200 Subject: [PATCH 10/11] SPU Analyzer: Add pure opcode tag --- rpcs3/Emu/Cell/SPUAnalyser.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/rpcs3/Emu/Cell/SPUAnalyser.h b/rpcs3/Emu/Cell/SPUAnalyser.h index a6c788faad..1598551c7d 100644 --- a/rpcs3/Emu/Cell/SPUAnalyser.h +++ b/rpcs3/Emu/Cell/SPUAnalyser.h @@ -13,6 +13,7 @@ struct spu_itype static constexpr struct quadrop_tag{} _quadrop{}; // 4-op Instructions static constexpr struct xfloat_tag{} xfloat{}; // Instructions producing xfloat values static constexpr struct zregmod_tag{} zregmod{}; // Instructions not modifying any GPR + static constexpr struct pure_tag{} pure{}; // Instructions that always produce the same values as long as arguments are equal enum class type : unsigned char { @@ -301,6 +302,12 @@ struct spu_itype { return (value >= HEQ && value <= STQR) || (value >= BR && value <= BIHNZ); } + + // Test for instructions which always produce the same values as long as arguments and immediate values are equal + friend constexpr bool operator &(type value, pure_tag) + { + return (value >= ILH && value <= CLGTI); + } }; using spu_itype_t = spu_itype::type; From f41770f57a58d044fd730229a095e98bd6f0bc41 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Tue, 24 Mar 2026 19:47:32 +0200 Subject: [PATCH 11/11] Thread.cpp: Fix game exit on access violation Fixes game closing in Twisted Metal. This commit also aloows to debug such states by using cpu_flag::req_exit as a broker --- Utilities/Thread.cpp | 71 +++++++++++++++++++++++++----- rpcs3/Emu/CPU/CPUThread.cpp | 12 ++++- rpcs3/Emu/CPU/CPUThread.h | 3 +- rpcs3/Emu/Cell/Modules/cellGem.cpp | 6 +++ rpcs3/Emu/System.cpp | 14 ++++-- rpcs3/rpcs3qt/debugger_frame.cpp | 11 +++-- rpcs3/rpcs3qt/main_window.cpp | 4 +- 7 files changed, 99 insertions(+), 22 deletions(-) diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index 6395c32505..3458a2ac1f 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -1269,7 +1269,7 @@ namespace rsx extern std::function g_access_violation_handler; } -bool handle_access_violation(u32 addr, bool is_writing, ucontext_t* context) noexcept +bool handle_access_violation(u32 addr, bool is_writing, bool is_exec, ucontext_t* context) noexcept { g_tls_fault_all++; @@ -1503,7 +1503,9 @@ bool handle_access_violation(u32 addr, bool is_writing, ucontext_t* context) noe static_cast(context); #endif /* ARCH_ */ - if (vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable)) + const auto requred_page_perms = (is_writing ? vm::page_writable : vm::page_readable) + (is_exec ? vm::page_executable : 0); + + if (vm::check_addr(addr, requred_page_perms)) { return true; } @@ -1511,9 +1513,7 @@ bool handle_access_violation(u32 addr, bool is_writing, ucontext_t* context) noe // Hack: allocate memory in case the emulator is stopping const auto hack_alloc = [&]() { - g_tls_access_violation_recovered = true; - - if (vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable)) + if (vm::check_addr(addr, requred_page_perms)) { return true; } @@ -1525,14 +1525,42 @@ bool handle_access_violation(u32 addr, bool is_writing, ucontext_t* context) noe return false; } + extern void ppu_register_range(u32 addr, u32 size); + + bool reprotected = false; + if (vm::writer_lock mlock; area->flags & vm::preallocated || vm::check_addr(addr, 0)) { // For allocated memory with protection lower than required (such as protection::no or read-only while writing to it) utils::memory_protect(vm::base(addr & -0x1000), 0x1000, utils::protection::rw); + reprotected = true; + } + + if (reprotected) + { + if (is_exec && !vm::check_addr(addr, vm::page_executable)) + { + ppu_register_range(addr & -0x10000, 0x10000); + } + + g_tls_access_violation_recovered = true; return true; } - return area->falloc(addr & -0x10000, 0x10000) || vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable); + const bool allocated = area->falloc(addr & -0x10000, 0x10000); + + if (allocated) + { + if (is_exec && !vm::check_addr(addr, vm::page_executable)) + { + ppu_register_range(addr & -0x10000, 0x10000); + } + + g_tls_access_violation_recovered = true; + return true; + } + + return false; }; if (cpu && (cpu->get_class() == thread_class::ppu || cpu->get_class() == thread_class::spu)) @@ -1766,8 +1794,13 @@ bool handle_access_violation(u32 addr, bool is_writing, ucontext_t* context) noe } } - if (Emu.IsStopped() && !hack_alloc()) + if (Emu.IsStopped()) { + while (!hack_alloc()) + { + thread_ctrl::wait_for(1000); + } + return false; } @@ -1806,6 +1839,7 @@ static LONG exception_handler(PEXCEPTION_POINTERS pExp) noexcept if (pExp->ExceptionRecord->ExceptionCode == EXCEPTION_ACCESS_VIOLATION && !is_executing) { u32 addr = 0; + bool is_exec = false; if (auto [addr0, ok] = vm::try_get_addr(ptr); ok) { @@ -1813,14 +1847,21 @@ static LONG exception_handler(PEXCEPTION_POINTERS pExp) noexcept } else if (const usz exec64 = (ptr - vm::g_exec_addr) / 2; exec64 <= u32{umax}) { + is_exec = true; addr = static_cast(exec64); } - else + else if (const usz exec64 = (ptr - vm::g_exec_addr - vm::g_exec_addr_seg_offset); exec64 <= u32{umax}) { + is_exec = true; + addr = static_cast(exec64); + } + else + { + std::this_thread::sleep_for(1ms); return EXCEPTION_CONTINUE_SEARCH; } - if (thread_ctrl::get_current() && handle_access_violation(addr, is_writing, pExp->ContextRecord)) + if (thread_ctrl::get_current() && handle_access_violation(addr, is_writing, is_exec, pExp->ContextRecord)) { return EXCEPTION_CONTINUE_EXECUTION; } @@ -2027,12 +2068,13 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept #endif const u64 exec64 = (reinterpret_cast(info->si_addr) - reinterpret_cast(vm::g_exec_addr)) / 2; + const u64 exec64_2 = (reinterpret_cast(info->si_addr) - reinterpret_cast(vm::g_exec_addr)) - vm::g_exec_addr_seg_offset; const auto cause = is_executing ? "executing" : is_writing ? "writing" : "reading"; if (auto [addr, ok] = vm::try_get_addr(info->si_addr); ok && !is_executing) { // Try to process access violation - if (thread_ctrl::get_current() && handle_access_violation(addr, is_writing, context)) + if (thread_ctrl::get_current() && handle_access_violation(addr, is_writing, false, context)) { return; } @@ -2040,7 +2082,14 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept if (exec64 < 0x100000000ull && !is_executing) { - if (thread_ctrl::get_current() && handle_access_violation(static_cast(exec64), is_writing, context)) + if (thread_ctrl::get_current() && handle_access_violation(static_cast(exec64), is_writing, true, context)) + { + return; + } + } + else if (exec64_2 < 0x100000000ull && !is_executing) + { + if (thread_ctrl::get_current() && handle_access_violation(static_cast(exec64_2), is_writing, true, context)) { return; } diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp index eb6f3498a2..4bd5fc9157 100644 --- a/rpcs3/Emu/CPU/CPUThread.cpp +++ b/rpcs3/Emu/CPU/CPUThread.cpp @@ -888,6 +888,14 @@ bool cpu_thread::check_state() noexcept store = true; } + if (flags & cpu_flag::req_exit) + { + // A request for the thread to quit has been made + flags -= cpu_flag::req_exit; + flags += cpu_flag::exit; + store = true; + } + // Can't process dbg_step if we only paused temporarily if (cpu_can_stop && flags & cpu_flag::dbg_step) { @@ -1157,13 +1165,13 @@ void cpu_thread::notify() cpu_thread& cpu_thread::operator=(thread_state) { - if (state & cpu_flag::exit) + if (state & (cpu_flag::exit + cpu_flag::req_exit)) { // Must be notified elsewhere or self-raised return *this; } - const auto old = state.fetch_add(cpu_flag::exit); + const auto old = state.fetch_add(cpu_flag::req_exit); if (old & cpu_flag::wait && old.none_of(cpu_flag::again + cpu_flag::exit)) { diff --git a/rpcs3/Emu/CPU/CPUThread.h b/rpcs3/Emu/CPU/CPUThread.h index 88995f5e3b..e723fd2d4b 100644 --- a/rpcs3/Emu/CPU/CPUThread.h +++ b/rpcs3/Emu/CPU/CPUThread.h @@ -29,6 +29,7 @@ enum class cpu_flag : u32 yield, // Thread is being requested to yield its execution time if it's running preempt, // Thread is being requested to preempt the execution of all CPU threads + req_exit, // Request the thread to exit dbg_global_pause, // Emulation paused dbg_pause, // Thread paused dbg_step, // Thread forced to pause after one step (one instruction, etc) @@ -39,7 +40,7 @@ enum class cpu_flag : u32 // Test stopped state constexpr bool is_stopped(bs_t state) { - return !!(state & (cpu_flag::stop + cpu_flag::exit + cpu_flag::again)); + return !!(state & (cpu_flag::stop + cpu_flag::exit + cpu_flag::again + cpu_flag::req_exit)); } // Test paused state diff --git a/rpcs3/Emu/Cell/Modules/cellGem.cpp b/rpcs3/Emu/Cell/Modules/cellGem.cpp index d45dace1ca..f9f5ea4100 100644 --- a/rpcs3/Emu/Cell/Modules/cellGem.cpp +++ b/rpcs3/Emu/Cell/Modules/cellGem.cpp @@ -1774,6 +1774,12 @@ public: shared_mutex mutex; + gem_tracker& operator=(thread_state) noexcept + { + wake_up_tracker(); + return *this; + } + private: atomic_t m_wake_up_tracker = 0; atomic_t m_tracker_done = 0; diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index 42854504f7..7818bd20d5 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -49,6 +49,7 @@ #include #include +#include #include "Utilities/JIT.h" @@ -3075,7 +3076,7 @@ void Emulator::GracefulShutdown(bool allow_autoexit, bool async_op, bool savesta std::vector>> ppu_thread_list; // If EXITGAME signal is not read, force kill after a second. - constexpr int loop_timeout_ms = 50; + constexpr int loop_timeout_ms = 16; int kill_timeout_ms = 1000; int elapsed_ms = 0; @@ -3092,8 +3093,10 @@ void Emulator::GracefulShutdown(bool allow_autoexit, bool async_op, bool savesta Resume(); }, nullptr, true, read_counter); + std::shared_lock rlock(id_manager::g_mutex, std::defer_lock); + // Check if the EXITGAME signal was read. We allow the game to terminate itself if that's the case. - if (!read_sysutil_signal && read_counter != get_sysutil_cb_manager_read_count()) + if (!read_sysutil_signal && read_counter != get_sysutil_cb_manager_read_count() && rlock.try_lock()) { sys_log.notice("The game received the exit request. Waiting for it to terminate itself..."); kill_timeout_ms += 5000; // Grant a couple more seconds @@ -3103,7 +3106,12 @@ void Emulator::GracefulShutdown(bool allow_autoexit, bool async_op, bool savesta idm::select>([&](u32 id, cpu_thread&) { ppu_thread_list.emplace_back(idm::get_unlocked>(id)); - }); + }, idm::unlocked); + } + + if (rlock) + { + rlock.unlock(); } if (static_cast(info) != m_stop_ctr || Emu.IsStopped()) diff --git a/rpcs3/rpcs3qt/debugger_frame.cpp b/rpcs3/rpcs3qt/debugger_frame.cpp index db4efa1aae..b8ed1d642f 100644 --- a/rpcs3/rpcs3qt/debugger_frame.cpp +++ b/rpcs3/rpcs3qt/debugger_frame.cpp @@ -872,7 +872,7 @@ std::function debugger_frame::make_check_cpu(cpu_thread* cpu, boo { constexpr cpu_thread* null_cpu = nullptr; - if (Emu.IsStopped()) + if (Emu.IsStopped(true)) { return []() { return null_cpu; }; } @@ -921,7 +921,7 @@ std::function debugger_frame::make_check_cpu(cpu_thread* cpu, boo return [cpu, type, shared = std::move(shared), emulation_id = Emu.GetEmulationIdentifier()]() mutable -> cpu_thread* { - if (emulation_id != Emu.GetEmulationIdentifier() || Emu.IsStopped()) + if (emulation_id != Emu.GetEmulationIdentifier() || Emu.IsStopped(true)) { // Invalidate all data after Emu.Kill() shared.reset(); @@ -1040,7 +1040,7 @@ void debugger_frame::UpdateUnitList() const u64 emulation_id = static_cast>(Emu.GetEmulationIdentifier()); const u64 threads_created = cpu_thread::g_threads_created; const u64 threads_deleted = cpu_thread::g_threads_deleted; - const system_state emu_state = Emu.GetStatus(); + const system_state emu_state = Emu.GetStatus(false); std::unique_lock lock{id_manager::g_mutex, std::defer_lock}; @@ -1077,6 +1077,11 @@ void debugger_frame::UpdateUnitList() { std::function func_cpu = make_check_cpu(std::addressof(cpu), true); + if (cpu.state & cpu_flag::exit) + { + return; + } + // Space at the end is to pad a gap on the right cpu_list.emplace_back(QString::fromStdString((id >> 24 == 0x55 ? "RSX[0x55555555]" : cpu.get_name()) + ' '), std::move(func_cpu)); diff --git a/rpcs3/rpcs3qt/main_window.cpp b/rpcs3/rpcs3qt/main_window.cpp index 8f9324d73e..94157571e8 100644 --- a/rpcs3/rpcs3qt/main_window.cpp +++ b/rpcs3/rpcs3qt/main_window.cpp @@ -128,7 +128,7 @@ extern void qt_events_aware_op(int repeat_duration_ms, std::function wra } else { - QTimer::singleShot(repeat_duration_ms, *check_iteration); + QTimer::singleShot(repeat_duration_ms, event_loop, *check_iteration); } }); @@ -138,7 +138,7 @@ extern void qt_events_aware_op(int repeat_duration_ms, std::function wra event_loop = new QEventLoop(); // Queue event initially - QTimer::singleShot(0, *check_iteration); + QTimer::singleShot(0, event_loop, *check_iteration); // Event loop event_loop->exec();