SPU LLVM: Optimize FM, FMA and FCGT in Reduced Loop

This commit is contained in:
Elad 2026-03-08 15:36:58 +02:00
parent 4d2537a6f4
commit eb4c4d3d22
4 changed files with 124 additions and 46 deletions

View File

@ -567,6 +567,32 @@ struct llvm_placeholder_t
}
};
template <typename T, typename U = llvm_common_t<llvm_value_t<T>>>
struct llvm_place_stealer_t
{
	// TODO: placeholder extracting actual constant values (u64, f64, vector, etc)
	using type = T;

	static constexpr bool is_ok = true;

	// Placeholder node: emits no IR of its own.
	llvm::Value* eval(llvm::IRBuilder<>*) const
	{
		return nullptr;
	}

	// Accepts the candidate only when its LLVM type matches T; otherwise resets it to null.
	std::tuple<> match(llvm::Value*& value, llvm::Module*) const
	{
		const bool type_matches = value && value->getType() == llvm_value_t<T>::get_type(value->getContext());

		if (!type_matches)
		{
			value = nullptr;
		}

		return {};
	}
};
template <typename T, bool ForceSigned = false>
struct llvm_const_int
{
@ -3227,6 +3253,12 @@ public:
return {};
}
// Construct a type-only placeholder matcher for T (see llvm_place_stealer_t)
template <typename T>
static llvm_place_stealer_t<T> match_stealer()
{
	return llvm_place_stealer_t<T>{};
}
template <typename T>
requires requires { typename llvm_common_t<T>; }
static auto match_expr(llvm::Value* v, llvm::Module* _m, T&& expr)

View File

@ -3958,6 +3958,11 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
block.reg_maybe_float.set(reg);
}
if (type == spu_itype::SHUFB && reg == op.rc)
{
block.reg_maybe_shuffle_mask.set(reg);
}
block.reg_use[reg]++;
if (reg_save != reg && block.reg_save_dom[reg])

View File

@ -175,6 +175,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
const usz first_id = store_context_first_id[i];
return counter != 1 && first_id != umax && counter < first_id;
}
// Heuristic hint that GPR i holds a non-NaN float throughout this block:
// it is marked as float-typed, read frequently, and never modified here.
bool is_gpr_not_NaN_hint(u32 i) const noexcept
{
	if (!block_wide_reg_store_elimination)
	{
		return false;
	}

	// reg_use >= 3 — presumably a threshold so the hint fires only on hot registers; TODO confirm
	return bb->reg_maybe_float[i] && bb->reg_use[i] >= 3 && !bb->reg_mod[i];
}
};
struct function_info
@ -1692,7 +1697,7 @@ public:
// Emit state check
const auto pstate = spu_ptr(&spu_thread::state);
m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type<u32>(), pstate), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely);
m_ir->CreateCondBr(m_ir->CreateICmpNE(spu_context_attr(m_ir->CreateLoad(get_type<u32>(), pstate)), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely);
// Emit code check
u32 check_iterations = 0;
@ -6915,8 +6920,13 @@ public:
return eval(bitcast<f32[4]>(min(bitcast<u32[4]>(v),splat<u32[4]>(0xff7fffff))));
}
value_t<f32[4]> clamp_smax(value_t<f32[4]> v)
value_t<f32[4]> clamp_smax(value_t<f32[4]> v, u32 gpr = s_reg_max)
{
if (m_block && gpr < s_reg_max && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(gpr))
{
return v;
}
if (m_use_avx512)
{
if (is_input_positive(v))
@ -6936,16 +6946,6 @@ public:
return eval(clamp_positive_smax(clamp_negative_smax(v)));
}
// FMA favouring zeros
value_t<f32[4]> xmuladd(value_t<f32[4]> a, value_t<f32[4]> b, value_t<f32[4]> c)
{
	// Per-lane all-ones mask where the operand compares unordered-or-unequal to 0
	const auto nonzero_a = eval(sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))));
	const auto nonzero_b = eval(sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))));

	// Zero each multiplicand in lanes where the other operand is zero,
	// so such lanes multiply as 0 * 0 before the fused add.
	const auto masked_a = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & nonzero_b));
	const auto masked_b = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & nonzero_a));

	return eval(fmuladd(masked_a, masked_b, c));
}
// Checks for positive and negative zero, or denormal (treated as zero)
// If sign is +-1, check equality against all sign bits
bool is_spu_float_zero(v128 a, int sign = 0)
@ -7032,12 +7032,6 @@ public:
set_vr(op.rt, frsqest(get_vr<f32[4]>(op.ra)));
}
// Deferred call expression for the "spu_fcgt" intrinsic (resolved via register_intrinsic)
template <typename T, typename U>
static llvm_calli<s32[4], T, U> fcgt(T&& a, U&& b)
{
	return llvm_calli<s32[4], T, U>{"spu_fcgt", {std::forward<T>(a), std::forward<U>(b)}};
}
void FCGT(spu_opcode_t op)
{
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate)
@ -7046,11 +7040,8 @@ public:
return;
}
register_intrinsic("spu_fcgt", [&](llvm::CallInst* ci)
const auto fcgt = [&](value_t<f32[4]> a, value_t<f32[4]> b)
{
const auto a = value<f32[4]>(ci->getOperand(0));
const auto b = value<f32[4]>(ci->getOperand(1));
const value_t<f32[4]> ab[2]{a, b};
std::bitset<2> safe_int_compare(0);
@ -7082,6 +7073,16 @@ public:
}
}
if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra))
{
safe_finite_compare.set(0);
}
if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb))
{
safe_finite_compare.set(1);
}
if (safe_int_compare.any())
{
return eval(sext<s32[4]>(bitcast<s32[4]>(a) > bitcast<s32[4]>(b)));
@ -7101,7 +7102,7 @@ public:
const auto bi = eval(bitcast<s32[4]>(b));
return eval(sext<s32[4]>(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi)));
});
};
set_vr(op.rt, fcgt(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
}
@ -7198,12 +7199,6 @@ public:
set_vr(op.rt, fa(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
}
// Deferred call expression for the "spu_fs" intrinsic (resolved via register_intrinsic)
template <typename T, typename U>
static llvm_calli<f32[4], T, U> fs(T&& a, U&& b)
{
	return llvm_calli<f32[4], T, U>{"spu_fs", {std::forward<T>(a), std::forward<U>(b)}};
}
void FS(spu_opcode_t op)
{
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate)
@ -7212,29 +7207,26 @@ public:
return;
}
register_intrinsic("spu_fs", [&](llvm::CallInst* ci)
const auto fs = [&](value_t<f32[4]> a, value_t<f32[4]> b)
{
const auto a = value<f32[4]>(ci->getOperand(0));
const auto b = value<f32[4]>(ci->getOperand(1));
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
{
const auto bc = clamp_smax(b); // for #4478
const auto bc = clamp_smax(b, op.rb); // for #4478
return eval(a - bc);
}
else
{
return eval(a - b);
}
});
};
set_vr(op.rt, fs(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
}
template <typename T, typename U>
static llvm_calli<f32[4], T, U> fm(T&& a, U&& b)
template <typename T, typename U, typename V = llvm_place_stealer_t<u32>, typename W = llvm_place_stealer_t<u32>>
static auto fm(T&& a, U&& b, V&& a_not_nan = match_stealer<u32>(), W&& b_not_nan = match_stealer<u32>())
{
return llvm_calli<f32[4], T, U>{"spu_fm", {std::forward<T>(a), std::forward<U>(b)}}.set_order_equality_hint(1, 1);
return llvm_calli<f32[4], T, U, V, W>{"spu_fm", {std::forward<T>(a), std::forward<U>(b), a_not_nan, b_not_nan}}.set_order_equality_hint(1, 1, 2, 3);
}
void FM(spu_opcode_t op)
@ -7249,14 +7241,27 @@ public:
{
const auto a = value<f32[4]>(ci->getOperand(0));
const auto b = value<f32[4]>(ci->getOperand(1));
const bool a_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(2))->getZExtValue() != 0;
const bool b_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(3))->getZExtValue() != 0;
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
{
if (a.value == b.value)
if (a.value == b.value || (a_notnan && b_notnan))
{
return eval(a * b);
}
if (a_notnan)
{
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & ma));
}
else if (b_notnan)
{
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & mb));
}
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & ma & mb));
@ -7267,10 +7272,13 @@ public:
}
});
const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0;
const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0;
if (op.ra == op.rb && !m_interp_magn)
{
const auto a = get_vr<f32[4]>(op.ra);
set_vr(op.rt, fm(a, a));
set_vr(op.rt, fm(a, a, splat<u32>(a_notnan), splat<u32>(a_notnan)));
return;
}
@ -7309,7 +7317,7 @@ public:
}
}
set_vr(op.rt, fm(a, b));
set_vr(op.rt, fm(a, b, splat<u32>(a_notnan), splat<u32>(b_notnan)));
}
template <typename T>
@ -7602,10 +7610,10 @@ public:
set_vr(op.rt4, fnms(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb), get_vr<f32[4]>(op.rc)));
}
template <typename T, typename U, typename V>
static llvm_calli<f32[4], T, U, V> fma(T&& a, U&& b, V&& c)
template <typename T, typename U, typename V, typename W = llvm_place_stealer_t<u32>, typename X = llvm_place_stealer_t<u32>>
static llvm_calli<f32[4], T, U, V, W, X> fma(T&& a, U&& b, V&& c, W&& d = match_stealer<u32>(), X&& e = match_stealer<u32>())
{
return llvm_calli<f32[4], T, U, V>{"spu_fma", {std::forward<T>(a), std::forward<U>(b), std::forward<V>(c)}}.set_order_equality_hint(1, 1, 0);
return llvm_calli<f32[4], T, U, V, W, X>{"spu_fma", {std::forward<T>(a), std::forward<U>(b), std::forward<V>(c), std::forward<W>(d), std::forward<X>(e)}}.set_order_equality_hint(1, 1, 2, 3, 4);
}
template <typename T, typename U>
@ -7624,14 +7632,35 @@ public:
return;
}
register_intrinsic("spu_fma", [&](llvm::CallInst* ci)
{
const auto a = value<f32[4]>(ci->getOperand(0));
const auto b = value<f32[4]>(ci->getOperand(1));
const auto c = value<f32[4]>(ci->getOperand(2));
const bool a_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(3))->getZExtValue() != 0;
const bool b_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(4))->getZExtValue() != 0;
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
{
if (a.value == b.value || (a_notnan && b_notnan))
{
return fma32x4(a, b, c);
}
if (a_notnan)
{
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
const auto cb = bitcast<f32[4]>(bitcast<s32[4]>(b) & ma);
return fma32x4(a, eval(cb), c);
}
else if (b_notnan)
{
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
const auto ca = bitcast<f32[4]>(bitcast<s32[4]>(a) & mb);
return fma32x4(eval(ca), b, c);
}
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
const auto ca = bitcast<f32[4]>(bitcast<s32[4]>(a) & mb);
@ -7680,6 +7709,9 @@ public:
const auto [a, b, c] = get_vrs<f32[4]>(op.ra, op.rb, op.rc);
static const auto MT = match<f32[4]>();
const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0;
const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0;
auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool
{
auto match_fnms = [&](f32 float_value)
@ -7875,7 +7907,13 @@ public:
spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(c) found in FMA", m_hash, m_pos);
}
set_vr(op.rt4, fma(a, b, c));
if (!m_interp_magn && op.ra == op.rb)
{
set_vr(op.rt4, fma(a, a, c, splat<u32>(a_notnan), splat<u32>(a_notnan)));
return;
}
set_vr(op.rt4, fma(a, b, c, splat<u32>(a_notnan), splat<u32>(b_notnan)));
}
template <typename T, typename U, typename V>

View File

@ -742,6 +742,9 @@ protected:
// Set if register is used in a floating point instruction
std::bitset<s_reg_max> reg_maybe_float{};
// Set if register is used as shuffle mask
std::bitset<s_reg_max> reg_maybe_shuffle_mask{};
// Number of times registers are used (before being modified)
std::array<u32, s_reg_max> reg_use{};