diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 4a0028c571..738932808d 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -567,6 +567,32 @@ struct llvm_placeholder_t } }; +template >> +struct llvm_place_stealer_t +{ + // TODO: placeholder extracting actual constant values (u64, f64, vector, etc) + + using type = T; + + static constexpr bool is_ok = true; + + llvm::Value* eval(llvm::IRBuilder<>*) const + { + return nullptr; + } + + std::tuple<> match(llvm::Value*& value, llvm::Module*) const + { + if (value && value->getType() == llvm_value_t::get_type(value->getContext())) + { + return {}; + } + + value = nullptr; + return {}; + } +}; + template struct llvm_const_int { @@ -3227,6 +3253,12 @@ public: return {}; } + template + static llvm_place_stealer_t match_stealer() + { + return {}; + } + template requires requires { typename llvm_common_t; } static auto match_expr(llvm::Value* v, llvm::Module* _m, T&& expr) diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 9da576f009..96d4afa95e 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -3958,6 +3958,11 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s block.reg_maybe_float.set(reg); } + if (type == spu_itype::SHUFB && reg == op.rc) + { + block.reg_maybe_shuffle_mask.set(reg); + } + block.reg_use[reg]++; if (reg_save != reg && block.reg_save_dom[reg]) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 71217fb0bf..baecbd3135 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -175,6 +175,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const usz first_id = store_context_first_id[i]; return counter != 1 && first_id != umax && counter < first_id; } + + bool is_gpr_not_NaN_hint(u32 i) const noexcept + { + return block_wide_reg_store_elimination && bb->reg_maybe_float[i] && bb->reg_use[i] >= 3 && !bb->reg_mod[i]; + } }; struct function_info @@ -1692,7 +1697,7 @@ public: // Emit state check const auto pstate = spu_ptr(&spu_thread::state); - m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pstate), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); + m_ir->CreateCondBr(m_ir->CreateICmpNE(spu_context_attr(m_ir->CreateLoad(get_type(), pstate)), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); // Emit code check u32 check_iterations = 0; @@ -6915,8 +6920,13 @@ public: return eval(bitcast(min(bitcast(v),splat(0xff7fffff)))); } - value_t clamp_smax(value_t v) + value_t clamp_smax(value_t v, u32 gpr = s_reg_max) { + if (m_block && gpr < s_reg_max && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(gpr)) + { + return v; + } + if (m_use_avx512) { if (is_input_positive(v)) @@ -6936,16 +6946,6 @@ public: return eval(clamp_positive_smax(clamp_negative_smax(v))); } - // FMA favouring zeros - value_t xmuladd(value_t a, value_t b, value_t c) - { - const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); - const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); - const auto ca = eval(bitcast(bitcast(a) & mb)); - const auto cb = eval(bitcast(bitcast(b) & ma)); - return eval(fmuladd(ca, cb, c)); - } - // Checks for postive and negative zero, or Denormal (treated as zero) // If sign is +-1 check equality againts all sign bits bool is_spu_float_zero(v128 a, int sign = 0) @@ -7032,12 +7032,6 @@ public: set_vr(op.rt, frsqest(get_vr(op.ra))); } - template - static llvm_calli fcgt(T&& a, U&& b) - { - return {"spu_fcgt", {std::forward(a), std::forward(b)}}; - } - void FCGT(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) @@ -7046,11 +7040,8 @@ public: return; } - register_intrinsic("spu_fcgt", [&](llvm::CallInst* ci) + const auto fcgt = [&](value_t a, value_t b) { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - const value_t ab[2]{a, b}; std::bitset<2> safe_int_compare(0); @@ -7082,6 +7073,16 @@ public: } } + if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra)) + { + safe_finite_compare.set(0); + } + + if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb)) + { + safe_finite_compare.set(1); + } + if (safe_int_compare.any()) { return eval(sext(bitcast(a) > bitcast(b))); @@ -7101,7 +7102,7 @@ public: const auto bi = eval(bitcast(b)); return eval(sext(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi))); - }); + }; set_vr(op.rt, fcgt(get_vr(op.ra), get_vr(op.rb))); } @@ -7198,12 +7199,6 @@ public: set_vr(op.rt, fa(get_vr(op.ra), get_vr(op.rb))); } - template - static llvm_calli fs(T&& a, U&& b) - { - return {"spu_fs", {std::forward(a), std::forward(b)}}; - } - void FS(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) @@ -7212,29 +7207,26 @@ public: return; } - register_intrinsic("spu_fs", [&](llvm::CallInst* ci) + const auto fs = [&](value_t a, value_t b) { - const auto a = value(ci->getOperand(0)); - const auto b = value(ci->getOperand(1)); - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { - const auto bc = clamp_smax(b); // for #4478 + const auto bc = clamp_smax(b, op.rb); // for #4478 return eval(a - bc); } else { return eval(a - b); } - }); + }; set_vr(op.rt, fs(get_vr(op.ra), get_vr(op.rb))); } - template - static llvm_calli fm(T&& a, U&& b) + template , typename W = llvm_place_stealer_t> + static auto fm(T&& a, U&& b, V&& a_not_nan = match_stealer(), W&& b_not_nan = match_stealer()) { - return llvm_calli{"spu_fm", {std::forward(a), std::forward(b)}}.set_order_equality_hint(1, 1); + return llvm_calli{"spu_fm", {std::forward(a), std::forward(b), a_not_nan, b_not_nan}}.set_order_equality_hint(1, 1, 2, 3); } void FM(spu_opcode_t op) @@ -7249,14 +7241,27 @@ public: { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); + const bool a_notnan = llvm::cast(ci->getOperand(2))->getZExtValue() != 0; + const bool b_notnan = llvm::cast(ci->getOperand(3))->getZExtValue() != 0; if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { - if (a.value == b.value) + if (a.value == b.value || (a_notnan && b_notnan)) { return eval(a * b); } + if (a_notnan) + { + const auto ma = sext(fcmp_uno(a != fsplat(0.))); + return eval(bitcast(bitcast(a * b) & ma)); + } + else if (b_notnan) + { + const auto mb = sext(fcmp_uno(b != fsplat(0.))); + return eval(bitcast(bitcast(a * b) & mb)); + } + const auto ma = sext(fcmp_uno(a != fsplat(0.))); const auto mb = sext(fcmp_uno(b != fsplat(0.))); return eval(bitcast(bitcast(a * b) & ma & mb)); @@ -7267,10 +7272,13 @@ public: } }); + const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0; + const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0; + if (op.ra == op.rb && !m_interp_magn) { const auto a = get_vr(op.ra); - set_vr(op.rt, fm(a, a)); + set_vr(op.rt, fm(a, a, splat(a_notnan), splat(a_notnan))); return; } @@ -7309,7 +7317,7 @@ public: } } - set_vr(op.rt, fm(a, b)); + set_vr(op.rt, fm(a, b, splat(a_notnan), splat(b_notnan))); } template @@ -7602,10 +7610,10 @@ public: set_vr(op.rt4, fnms(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); } - template - static llvm_calli fma(T&& a, U&& b, V&& c) + template , typename X = llvm_place_stealer_t> + static llvm_calli fma(T&& a, U&& b, V&& c, W&& d = match_stealer(), X&& e = match_stealer()) { - return llvm_calli{"spu_fma", {std::forward(a), std::forward(b), std::forward(c)}}.set_order_equality_hint(1, 1, 0); + return llvm_calli{"spu_fma", {std::forward(a), std::forward(b), std::forward(c), std::forward(d), std::forward(e)}}.set_order_equality_hint(1, 1, 2, 3, 4); } template @@ -7624,14 +7632,35 @@ public: return; } + register_intrinsic("spu_fma", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); const auto c = value(ci->getOperand(2)); - + const bool a_notnan = llvm::cast(ci->getOperand(3))->getZExtValue() != 0; + const bool b_notnan = llvm::cast(ci->getOperand(4))->getZExtValue() != 0; + if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { + if (a.value == b.value || (a_notnan && b_notnan)) + { + return fma32x4(a, b, c); + } + + if (a_notnan) + { + const auto ma = sext(fcmp_uno(a != fsplat(0.))); + const auto cb = bitcast(bitcast(b) & ma); + return fma32x4(a, eval(cb), c); + } + else if (b_notnan) + { + const auto mb = sext(fcmp_uno(b != fsplat(0.))); + const auto ca = bitcast(bitcast(a) & mb); + return fma32x4(eval(ca), b, c); + } + const auto ma = sext(fcmp_uno(a != fsplat(0.))); const auto mb = sext(fcmp_uno(b != fsplat(0.))); const auto ca = bitcast(bitcast(a) & mb); @@ -7680,6 +7709,9 @@ public: const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); static const auto MT = match(); + const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0; + const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0; + auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool { auto match_fnms = [&](f32 float_value) @@ -7875,7 +7907,13 @@ public: spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(c) found in FMA", m_hash, m_pos); } - set_vr(op.rt4, fma(a, b, c)); + if (!m_interp_magn && op.ra == op.rb) + { + set_vr(op.rt4, fma(a, a, c, splat(a_notnan), splat(a_notnan))); + return; + } + + set_vr(op.rt4, fma(a, b, c, splat(a_notnan), splat(b_notnan))); } template diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 00d89127d6..1fd295f2f6 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -742,6 +742,9 @@ protected: // Set if register is used in floating pont instruction std::bitset reg_maybe_float{}; + // Set if register is used as shuffle mask + std::bitset reg_maybe_shuffle_mask{}; + // Number of times registers are used (before modified) std::array reg_use{};