SPU LLVM: Optimize FM, FMA and FCGT in Reduced Loop

This commit is contained in:
Elad 2026-03-08 15:36:58 +02:00
parent e1eae3ee49
commit 37a07ae545
4 changed files with 124 additions and 46 deletions

View File

@ -567,6 +567,32 @@ struct llvm_placeholder_t
} }
}; };
// Expression node exposing the eval()/match() pair while carrying no value of its own.
// NOTE(review): appears intended as a "don't care" operand for pattern matching - confirm against callers.
template <typename T, typename U = llvm_common_t<llvm_value_t<T>>>
struct llvm_place_stealer_t
{
	// TODO: placeholder extracting actual constant values (u64, f64, vector, etc)
	using type = T;

	static constexpr bool is_ok = true;

	// Nothing is ever emitted for this node: the result is always null
	llvm::Value* eval(llvm::IRBuilder<>*) const
	{
		return nullptr;
	}

	// Keeps `value` as-is only when it is non-null and its LLVM type equals the one derived from T;
	// in every other case `value` is reset to null. No sub-values are captured (empty tuple).
	std::tuple<> match(llvm::Value*& value, llvm::Module*) const
	{
		const bool type_matches = value && value->getType() == llvm_value_t<T>::get_type(value->getContext());

		if (!type_matches)
		{
			value = nullptr;
		}

		return {};
	}
};
template <typename T, bool ForceSigned = false> template <typename T, bool ForceSigned = false>
struct llvm_const_int struct llvm_const_int
{ {
@ -3227,6 +3253,12 @@ public:
return {}; return {};
} }
// Factory for a default-constructed llvm_place_stealer_t<T>
template <typename T>
static llvm_place_stealer_t<T> match_stealer()
{
	return llvm_place_stealer_t<T>{};
}
template <typename T> template <typename T>
requires requires { typename llvm_common_t<T>; } requires requires { typename llvm_common_t<T>; }
static auto match_expr(llvm::Value* v, llvm::Module* _m, T&& expr) static auto match_expr(llvm::Value* v, llvm::Module* _m, T&& expr)

View File

@ -3958,6 +3958,11 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
block.reg_maybe_float.set(reg); block.reg_maybe_float.set(reg);
} }
if (type == spu_itype::SHUFB && reg == op.rc)
{
block.reg_maybe_shuffle_mask.set(reg);
}
block.reg_use[reg]++; block.reg_use[reg]++;
if (reg_save != reg && block.reg_save_dom[reg]) if (reg_save != reg && block.reg_save_dom[reg])

View File

@ -175,6 +175,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
const usz first_id = store_context_first_id[i]; const usz first_id = store_context_first_id[i];
return counter != 1 && first_id != umax && counter < first_id; return counter != 1 && first_id != umax && counter < first_id;
} }
// Heuristic hint for register `i`. True only when all of the following hold:
//  - block_wide_reg_store_elimination is set,
//  - the block flags the register in reg_maybe_float,
//  - the block counts at least 3 uses of it in reg_use,
//  - reg_mod[i] is not set (presumably: the register is not modified in this block - TODO confirm).
// `i` is not range-checked here; callers are expected to pass a valid register index.
bool is_gpr_not_NaN_hint(u32 i) const noexcept
{
	// Checked first so that `bb` is only dereferenced when the optimization is active
	if (!block_wide_reg_store_elimination)
	{
		return false;
	}

	if (!bb->reg_maybe_float[i])
	{
		return false;
	}

	if (bb->reg_use[i] < 3)
	{
		return false;
	}

	return !bb->reg_mod[i];
}
}; };
struct function_info struct function_info
@ -1692,7 +1697,7 @@ public:
// Emit state check // Emit state check
const auto pstate = spu_ptr(&spu_thread::state); const auto pstate = spu_ptr(&spu_thread::state);
m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type<u32>(), pstate), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); m_ir->CreateCondBr(m_ir->CreateICmpNE(spu_context_attr(m_ir->CreateLoad(get_type<u32>(), pstate)), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely);
// Emit code check // Emit code check
u32 check_iterations = 0; u32 check_iterations = 0;
@ -6915,8 +6920,13 @@ public:
return eval(bitcast<f32[4]>(min(bitcast<u32[4]>(v),splat<u32[4]>(0xff7fffff)))); return eval(bitcast<f32[4]>(min(bitcast<u32[4]>(v),splat<u32[4]>(0xff7fffff))));
} }
value_t<f32[4]> clamp_smax(value_t<f32[4]> v) value_t<f32[4]> clamp_smax(value_t<f32[4]> v, u32 gpr = s_reg_max)
{ {
if (m_block && gpr < s_reg_max && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(gpr))
{
return v;
}
if (m_use_avx512) if (m_use_avx512)
{ {
if (is_input_positive(v)) if (is_input_positive(v))
@ -6936,16 +6946,6 @@ public:
return eval(clamp_positive_smax(clamp_negative_smax(v))); return eval(clamp_positive_smax(clamp_negative_smax(v)));
} }
// FMA favouring zeros
value_t<f32[4]> xmuladd(value_t<f32[4]> a, value_t<f32[4]> b, value_t<f32[4]> c)
{
const auto ma = eval(sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))));
const auto mb = eval(sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))));
const auto ca = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & mb));
const auto cb = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & ma));
return eval(fmuladd(ca, cb, c));
}
// Checks for positive and negative zero, or Denormal (treated as zero) // Checks for positive and negative zero, or Denormal (treated as zero)
// If sign is +-1 check equality against all sign bits // If sign is +-1 check equality against all sign bits
bool is_spu_float_zero(v128 a, int sign = 0) bool is_spu_float_zero(v128 a, int sign = 0)
@ -7032,12 +7032,6 @@ public:
set_vr(op.rt, frsqest(get_vr<f32[4]>(op.ra))); set_vr(op.rt, frsqest(get_vr<f32[4]>(op.ra)));
} }
template <typename T, typename U>
static llvm_calli<s32[4], T, U> fcgt(T&& a, U&& b)
{
return {"spu_fcgt", {std::forward<T>(a), std::forward<U>(b)}};
}
void FCGT(spu_opcode_t op) void FCGT(spu_opcode_t op)
{ {
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate)
@ -7046,11 +7040,8 @@ public:
return; return;
} }
register_intrinsic("spu_fcgt", [&](llvm::CallInst* ci) const auto fcgt = [&](value_t<f32[4]> a, value_t<f32[4]> b)
{ {
const auto a = value<f32[4]>(ci->getOperand(0));
const auto b = value<f32[4]>(ci->getOperand(1));
const value_t<f32[4]> ab[2]{a, b}; const value_t<f32[4]> ab[2]{a, b};
std::bitset<2> safe_int_compare(0); std::bitset<2> safe_int_compare(0);
@ -7082,6 +7073,16 @@ public:
} }
} }
if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra))
{
safe_finite_compare.set(0);
}
if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb))
{
safe_finite_compare.set(1);
}
if (safe_int_compare.any()) if (safe_int_compare.any())
{ {
return eval(sext<s32[4]>(bitcast<s32[4]>(a) > bitcast<s32[4]>(b))); return eval(sext<s32[4]>(bitcast<s32[4]>(a) > bitcast<s32[4]>(b)));
@ -7101,7 +7102,7 @@ public:
const auto bi = eval(bitcast<s32[4]>(b)); const auto bi = eval(bitcast<s32[4]>(b));
return eval(sext<s32[4]>(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi))); return eval(sext<s32[4]>(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi)));
}); };
set_vr(op.rt, fcgt(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb))); set_vr(op.rt, fcgt(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
} }
@ -7198,12 +7199,6 @@ public:
set_vr(op.rt, fa(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb))); set_vr(op.rt, fa(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
} }
template <typename T, typename U>
static llvm_calli<f32[4], T, U> fs(T&& a, U&& b)
{
return {"spu_fs", {std::forward<T>(a), std::forward<U>(b)}};
}
void FS(spu_opcode_t op) void FS(spu_opcode_t op)
{ {
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate)
@ -7212,29 +7207,26 @@ public:
return; return;
} }
register_intrinsic("spu_fs", [&](llvm::CallInst* ci) const auto fs = [&](value_t<f32[4]> a, value_t<f32[4]> b)
{ {
const auto a = value<f32[4]>(ci->getOperand(0));
const auto b = value<f32[4]>(ci->getOperand(1));
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
{ {
const auto bc = clamp_smax(b); // for #4478 const auto bc = clamp_smax(b, op.rb); // for #4478
return eval(a - bc); return eval(a - bc);
} }
else else
{ {
return eval(a - b); return eval(a - b);
} }
}); };
set_vr(op.rt, fs(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb))); set_vr(op.rt, fs(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
} }
template <typename T, typename U> template <typename T, typename U, typename V = llvm_place_stealer_t<u32>, typename W = llvm_place_stealer_t<u32>>
static llvm_calli<f32[4], T, U> fm(T&& a, U&& b) static auto fm(T&& a, U&& b, V&& a_not_nan = match_stealer<u32>(), W&& b_not_nan = match_stealer<u32>())
{ {
return llvm_calli<f32[4], T, U>{"spu_fm", {std::forward<T>(a), std::forward<U>(b)}}.set_order_equality_hint(1, 1); return llvm_calli<f32[4], T, U, V, W>{"spu_fm", {std::forward<T>(a), std::forward<U>(b), a_not_nan, b_not_nan}}.set_order_equality_hint(1, 1, 2, 3);
} }
void FM(spu_opcode_t op) void FM(spu_opcode_t op)
@ -7249,14 +7241,27 @@ public:
{ {
const auto a = value<f32[4]>(ci->getOperand(0)); const auto a = value<f32[4]>(ci->getOperand(0));
const auto b = value<f32[4]>(ci->getOperand(1)); const auto b = value<f32[4]>(ci->getOperand(1));
const bool a_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(2))->getZExtValue() != 0;
const bool b_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(3))->getZExtValue() != 0;
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
{ {
if (a.value == b.value) if (a.value == b.value || (a_notnan && b_notnan))
{ {
return eval(a * b); return eval(a * b);
} }
if (a_notnan)
{
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & ma));
}
else if (b_notnan)
{
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & mb));
}
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))); const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))); const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & ma & mb)); return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & ma & mb));
@ -7267,10 +7272,13 @@ public:
} }
}); });
const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0;
const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0;
if (op.ra == op.rb && !m_interp_magn) if (op.ra == op.rb && !m_interp_magn)
{ {
const auto a = get_vr<f32[4]>(op.ra); const auto a = get_vr<f32[4]>(op.ra);
set_vr(op.rt, fm(a, a)); set_vr(op.rt, fm(a, a, splat<u32>(a_notnan), splat<u32>(a_notnan)));
return; return;
} }
@ -7309,7 +7317,7 @@ public:
} }
} }
set_vr(op.rt, fm(a, b)); set_vr(op.rt, fm(a, b, splat<u32>(a_notnan), splat<u32>(b_notnan)));
} }
template <typename T> template <typename T>
@ -7602,10 +7610,10 @@ public:
set_vr(op.rt4, fnms(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb), get_vr<f32[4]>(op.rc))); set_vr(op.rt4, fnms(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb), get_vr<f32[4]>(op.rc)));
} }
template <typename T, typename U, typename V> template <typename T, typename U, typename V, typename W = llvm_place_stealer_t<u32>, typename X = llvm_place_stealer_t<u32>>
static llvm_calli<f32[4], T, U, V> fma(T&& a, U&& b, V&& c) static llvm_calli<f32[4], T, U, V, W, X> fma(T&& a, U&& b, V&& c, W&& d = match_stealer<u32>(), X&& e = match_stealer<u32>())
{ {
return llvm_calli<f32[4], T, U, V>{"spu_fma", {std::forward<T>(a), std::forward<U>(b), std::forward<V>(c)}}.set_order_equality_hint(1, 1, 0); return llvm_calli<f32[4], T, U, V, W, X>{"spu_fma", {std::forward<T>(a), std::forward<U>(b), std::forward<V>(c), std::forward<W>(d), std::forward<X>(e)}}.set_order_equality_hint(1, 1, 2, 3, 4);
} }
template <typename T, typename U> template <typename T, typename U>
@ -7624,14 +7632,35 @@ public:
return; return;
} }
register_intrinsic("spu_fma", [&](llvm::CallInst* ci) register_intrinsic("spu_fma", [&](llvm::CallInst* ci)
{ {
const auto a = value<f32[4]>(ci->getOperand(0)); const auto a = value<f32[4]>(ci->getOperand(0));
const auto b = value<f32[4]>(ci->getOperand(1)); const auto b = value<f32[4]>(ci->getOperand(1));
const auto c = value<f32[4]>(ci->getOperand(2)); const auto c = value<f32[4]>(ci->getOperand(2));
const bool a_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(3))->getZExtValue() != 0;
const bool b_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(4))->getZExtValue() != 0;
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
{ {
if (a.value == b.value || (a_notnan && b_notnan))
{
return fma32x4(a, b, c);
}
if (a_notnan)
{
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
const auto cb = bitcast<f32[4]>(bitcast<s32[4]>(b) & ma);
return fma32x4(a, eval(cb), c);
}
else if (b_notnan)
{
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
const auto ca = bitcast<f32[4]>(bitcast<s32[4]>(a) & mb);
return fma32x4(eval(ca), b, c);
}
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))); const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))); const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
const auto ca = bitcast<f32[4]>(bitcast<s32[4]>(a) & mb); const auto ca = bitcast<f32[4]>(bitcast<s32[4]>(a) & mb);
@ -7680,6 +7709,9 @@ public:
const auto [a, b, c] = get_vrs<f32[4]>(op.ra, op.rb, op.rc); const auto [a, b, c] = get_vrs<f32[4]>(op.ra, op.rb, op.rc);
static const auto MT = match<f32[4]>(); static const auto MT = match<f32[4]>();
const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0;
const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0;
auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool
{ {
auto match_fnms = [&](f32 float_value) auto match_fnms = [&](f32 float_value)
@ -7875,7 +7907,13 @@ public:
spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(c) found in FMA", m_hash, m_pos); spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(c) found in FMA", m_hash, m_pos);
} }
set_vr(op.rt4, fma(a, b, c)); if (!m_interp_magn && op.ra == op.rb)
{
set_vr(op.rt4, fma(a, a, c, splat<u32>(a_notnan), splat<u32>(a_notnan)));
return;
}
set_vr(op.rt4, fma(a, b, c, splat<u32>(a_notnan), splat<u32>(b_notnan)));
} }
template <typename T, typename U, typename V> template <typename T, typename U, typename V>

View File

@ -742,6 +742,9 @@ protected:
// Set if register is used in floating point instruction // Set if register is used in floating point instruction
std::bitset<s_reg_max> reg_maybe_float{}; std::bitset<s_reg_max> reg_maybe_float{};
// Set if register is used as shuffle mask
std::bitset<s_reg_max> reg_maybe_shuffle_mask{};
// Number of times registers are used (before modified) // Number of times registers are used (before modified)
std::array<u32, s_reg_max> reg_use{}; std::array<u32, s_reg_max> reg_use{};