mirror of
https://github.com/RPCS3/rpcs3.git
synced 2026-03-26 20:38:33 -06:00
SPU LLVM: Optimize FM, FMA and FCGT in Reduced Loop
This commit is contained in:
parent
4d2537a6f4
commit
eb4c4d3d22
@ -567,6 +567,32 @@ struct llvm_placeholder_t
|
||||
}
|
||||
};
|
||||
|
||||
// Matcher/expression stand-in for an operand of type T that is accepted but not captured.
// match() succeeds for any non-null value whose LLVM type equals llvm_value_t<T>'s type and
// yields an empty tuple; eval() never produces a value (it returns nullptr).
// NOTE(review): U is not referenced in the body; presumably it only requires that
// llvm_common_t<llvm_value_t<T>> is well-formed - confirm.
template <typename T, typename U = llvm_common_t<llvm_value_t<T>>>
|
||||
struct llvm_place_stealer_t
|
||||
{
|
||||
// TODO: placeholder extracting actual constant values (u64, f64, vector, etc)
|
||||
|
||||
using type = T;
|
||||
|
||||
static constexpr bool is_ok = true;
|
||||
|
||||
// Always returns nullptr: this object carries no value of its own to emit.
llvm::Value* eval(llvm::IRBuilder<>*) const
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Type-only match. On success `value` is left untouched; on failure `value` is reset to
// nullptr. The returned tuple is empty in both cases (nothing is captured).
std::tuple<> match(llvm::Value*& value, llvm::Module*) const
|
||||
{
|
||||
if (value && value->getType() == llvm_value_t<T>::get_type(value->getContext()))
|
||||
{
|
||||
return {};
|
||||
}
|
||||
|
||||
// Null input or type mismatch: report failure through the in/out parameter
value = nullptr;
|
||||
return {};
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, bool ForceSigned = false>
|
||||
struct llvm_const_int
|
||||
{
|
||||
@ -3227,6 +3253,12 @@ public:
|
||||
return {};
|
||||
}
|
||||
|
||||
// Factory returning a value-initialized llvm_place_stealer_t<T>.
// In this file it supplies the default for the trailing hint operands of fm()/fma().
template <typename T>
|
||||
static llvm_place_stealer_t<T> match_stealer()
|
||||
{
|
||||
return {};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
requires requires { typename llvm_common_t<T>; }
|
||||
static auto match_expr(llvm::Value* v, llvm::Module* _m, T&& expr)
|
||||
|
||||
@ -3958,6 +3958,11 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
|
||||
block.reg_maybe_float.set(reg);
|
||||
}
|
||||
|
||||
if (type == spu_itype::SHUFB && reg == op.rc)
|
||||
{
|
||||
block.reg_maybe_shuffle_mask.set(reg);
|
||||
}
|
||||
|
||||
block.reg_use[reg]++;
|
||||
|
||||
if (reg_save != reg && block.reg_save_dom[reg])
|
||||
|
||||
@ -175,6 +175,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
||||
const usz first_id = store_context_first_id[i];
|
||||
return counter != 1 && first_id != umax && counter < first_id;
|
||||
}
|
||||
|
||||
// Heuristic hint for register index i. Returns true only when all of the following hold:
//  - block_wide_reg_store_elimination is set for this block,
//  - bb->reg_maybe_float[i] is set,
//  - bb->reg_use[i] is at least 3,
//  - bb->reg_mod[i] is not set.
// Callers in this file use a true result to skip float clamping / zero-masking for the register.
// NOTE(review): the check only inspects usage bookkeeping, not the register contents;
// confirm these conditions are sufficient wherever the hint is consumed.
bool is_gpr_not_NaN_hint(u32 i) const noexcept
|
||||
{
|
||||
return block_wide_reg_store_elimination && bb->reg_maybe_float[i] && bb->reg_use[i] >= 3 && !bb->reg_mod[i];
|
||||
}
|
||||
};
|
||||
|
||||
struct function_info
|
||||
@ -1692,7 +1697,7 @@ public:
|
||||
|
||||
// Emit state check
|
||||
const auto pstate = spu_ptr(&spu_thread::state);
|
||||
m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type<u32>(), pstate), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely);
|
||||
m_ir->CreateCondBr(m_ir->CreateICmpNE(spu_context_attr(m_ir->CreateLoad(get_type<u32>(), pstate)), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely);
|
||||
|
||||
// Emit code check
|
||||
u32 check_iterations = 0;
|
||||
@ -6915,8 +6920,13 @@ public:
|
||||
return eval(bitcast<f32[4]>(min(bitcast<u32[4]>(v),splat<u32[4]>(0xff7fffff))));
|
||||
}
|
||||
|
||||
value_t<f32[4]> clamp_smax(value_t<f32[4]> v)
|
||||
value_t<f32[4]> clamp_smax(value_t<f32[4]> v, u32 gpr = s_reg_max)
|
||||
{
|
||||
if (m_block && gpr < s_reg_max && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(gpr))
|
||||
{
|
||||
return v;
|
||||
}
|
||||
|
||||
if (m_use_avx512)
|
||||
{
|
||||
if (is_input_positive(v))
|
||||
@ -6936,16 +6946,6 @@ public:
|
||||
return eval(clamp_positive_smax(clamp_negative_smax(v)));
|
||||
}
|
||||
|
||||
// FMA favouring zeros
// Computes fmuladd(a', b', c) where each multiplicand lane is cleared to zero bits whenever
// the OTHER multiplicand's lane compares equal to zero, so both product inputs become zero
// in that lane before the multiply-add.
// NOTE(review): fcmp_uno presumably builds an unordered comparison (NaN lanes count as
// "not equal to zero") - confirm against its definition.
|
||||
value_t<f32[4]> xmuladd(value_t<f32[4]> a, value_t<f32[4]> b, value_t<f32[4]> c)
|
||||
{
|
||||
// Sign-extended per-lane masks: set where the operand is not equal to zero
const auto ma = eval(sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))));
|
||||
const auto mb = eval(sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))));
|
||||
// Cross-masking: a is gated by b's mask and b is gated by a's mask
const auto ca = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & mb));
|
||||
const auto cb = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & ma));
|
||||
return eval(fmuladd(ca, cb, c));
|
||||
}
|
||||
|
||||
// Checks for positive and negative zero, or Denormal (treated as zero)
|
||||
// If sign is +-1 check equality against all sign bits
|
||||
bool is_spu_float_zero(v128 a, int sign = 0)
|
||||
@ -7032,12 +7032,6 @@ public:
|
||||
set_vr(op.rt, frsqest(get_vr<f32[4]>(op.ra)));
|
||||
}
|
||||
|
||||
// Builds an llvm_calli expression named "spu_fcgt" with result type s32[4], forwarding
// operands a and b. The name matches the handler that FCGT registers through
// register_intrinsic("spu_fcgt", ...) in this file.
template <typename T, typename U>
|
||||
static llvm_calli<s32[4], T, U> fcgt(T&& a, U&& b)
|
||||
{
|
||||
return {"spu_fcgt", {std::forward<T>(a), std::forward<U>(b)}};
|
||||
}
|
||||
|
||||
void FCGT(spu_opcode_t op)
|
||||
{
|
||||
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate)
|
||||
@ -7046,11 +7040,8 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
register_intrinsic("spu_fcgt", [&](llvm::CallInst* ci)
|
||||
const auto fcgt = [&](value_t<f32[4]> a, value_t<f32[4]> b)
|
||||
{
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
const auto b = value<f32[4]>(ci->getOperand(1));
|
||||
|
||||
const value_t<f32[4]> ab[2]{a, b};
|
||||
|
||||
std::bitset<2> safe_int_compare(0);
|
||||
@ -7082,6 +7073,16 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra))
|
||||
{
|
||||
safe_finite_compare.set(0);
|
||||
}
|
||||
|
||||
if (m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb))
|
||||
{
|
||||
safe_finite_compare.set(1);
|
||||
}
|
||||
|
||||
if (safe_int_compare.any())
|
||||
{
|
||||
return eval(sext<s32[4]>(bitcast<s32[4]>(a) > bitcast<s32[4]>(b)));
|
||||
@ -7101,7 +7102,7 @@ public:
|
||||
const auto bi = eval(bitcast<s32[4]>(b));
|
||||
|
||||
return eval(sext<s32[4]>(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi)));
|
||||
});
|
||||
};
|
||||
|
||||
set_vr(op.rt, fcgt(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
|
||||
}
|
||||
@ -7198,12 +7199,6 @@ public:
|
||||
set_vr(op.rt, fa(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
|
||||
}
|
||||
|
||||
// Builds an llvm_calli expression named "spu_fs" with result type f32[4], forwarding
// operands a and b. The name matches the handler that FS registers through
// register_intrinsic("spu_fs", ...) in this file.
template <typename T, typename U>
|
||||
static llvm_calli<f32[4], T, U> fs(T&& a, U&& b)
|
||||
{
|
||||
return {"spu_fs", {std::forward<T>(a), std::forward<U>(b)}};
|
||||
}
|
||||
|
||||
void FS(spu_opcode_t op)
|
||||
{
|
||||
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate)
|
||||
@ -7212,29 +7207,26 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
register_intrinsic("spu_fs", [&](llvm::CallInst* ci)
|
||||
const auto fs = [&](value_t<f32[4]> a, value_t<f32[4]> b)
|
||||
{
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
const auto b = value<f32[4]>(ci->getOperand(1));
|
||||
|
||||
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
|
||||
{
|
||||
const auto bc = clamp_smax(b); // for #4478
|
||||
const auto bc = clamp_smax(b, op.rb); // for #4478
|
||||
return eval(a - bc);
|
||||
}
|
||||
else
|
||||
{
|
||||
return eval(a - b);
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
set_vr(op.rt, fs(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
static llvm_calli<f32[4], T, U> fm(T&& a, U&& b)
|
||||
template <typename T, typename U, typename V = llvm_place_stealer_t<u32>, typename W = llvm_place_stealer_t<u32>>
|
||||
static auto fm(T&& a, U&& b, V&& a_not_nan = match_stealer<u32>(), W&& b_not_nan = match_stealer<u32>())
|
||||
{
|
||||
return llvm_calli<f32[4], T, U>{"spu_fm", {std::forward<T>(a), std::forward<U>(b)}}.set_order_equality_hint(1, 1);
|
||||
return llvm_calli<f32[4], T, U, V, W>{"spu_fm", {std::forward<T>(a), std::forward<U>(b), a_not_nan, b_not_nan}}.set_order_equality_hint(1, 1, 2, 3);
|
||||
}
|
||||
|
||||
void FM(spu_opcode_t op)
|
||||
@ -7249,14 +7241,27 @@ public:
|
||||
{
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
const auto b = value<f32[4]>(ci->getOperand(1));
|
||||
const bool a_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(2))->getZExtValue() != 0;
|
||||
const bool b_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(3))->getZExtValue() != 0;
|
||||
|
||||
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
|
||||
{
|
||||
if (a.value == b.value)
|
||||
if (a.value == b.value || (a_notnan && b_notnan))
|
||||
{
|
||||
return eval(a * b);
|
||||
}
|
||||
|
||||
if (a_notnan)
|
||||
{
|
||||
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
|
||||
return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & ma));
|
||||
}
|
||||
else if (b_notnan)
|
||||
{
|
||||
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
|
||||
return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & mb));
|
||||
}
|
||||
|
||||
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
|
||||
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
|
||||
return eval(bitcast<f32[4]>(bitcast<s32[4]>(a * b) & ma & mb));
|
||||
@ -7267,10 +7272,13 @@ public:
|
||||
}
|
||||
});
|
||||
|
||||
const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0;
|
||||
const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0;
|
||||
|
||||
if (op.ra == op.rb && !m_interp_magn)
|
||||
{
|
||||
const auto a = get_vr<f32[4]>(op.ra);
|
||||
set_vr(op.rt, fm(a, a));
|
||||
set_vr(op.rt, fm(a, a, splat<u32>(a_notnan), splat<u32>(a_notnan)));
|
||||
return;
|
||||
}
|
||||
|
||||
@ -7309,7 +7317,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
set_vr(op.rt, fm(a, b));
|
||||
set_vr(op.rt, fm(a, b, splat<u32>(a_notnan), splat<u32>(b_notnan)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -7602,10 +7610,10 @@ public:
|
||||
set_vr(op.rt4, fnms(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb), get_vr<f32[4]>(op.rc)));
|
||||
}
|
||||
|
||||
template <typename T, typename U, typename V>
|
||||
static llvm_calli<f32[4], T, U, V> fma(T&& a, U&& b, V&& c)
|
||||
template <typename T, typename U, typename V, typename W = llvm_place_stealer_t<u32>, typename X = llvm_place_stealer_t<u32>>
|
||||
static llvm_calli<f32[4], T, U, V, W, X> fma(T&& a, U&& b, V&& c, W&& d = match_stealer<u32>(), X&& e = match_stealer<u32>())
|
||||
{
|
||||
return llvm_calli<f32[4], T, U, V>{"spu_fma", {std::forward<T>(a), std::forward<U>(b), std::forward<V>(c)}}.set_order_equality_hint(1, 1, 0);
|
||||
return llvm_calli<f32[4], T, U, V, W, X>{"spu_fma", {std::forward<T>(a), std::forward<U>(b), std::forward<V>(c), std::forward<W>(d), std::forward<X>(e)}}.set_order_equality_hint(1, 1, 2, 3, 4);
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
@ -7624,14 +7632,35 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
register_intrinsic("spu_fma", [&](llvm::CallInst* ci)
|
||||
{
|
||||
const auto a = value<f32[4]>(ci->getOperand(0));
|
||||
const auto b = value<f32[4]>(ci->getOperand(1));
|
||||
const auto c = value<f32[4]>(ci->getOperand(2));
|
||||
|
||||
const bool a_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(3))->getZExtValue() != 0;
|
||||
const bool b_notnan = llvm::cast<llvm::ConstantInt>(ci->getOperand(4))->getZExtValue() != 0;
|
||||
|
||||
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate)
|
||||
{
|
||||
if (a.value == b.value || (a_notnan && b_notnan))
|
||||
{
|
||||
return fma32x4(a, b, c);
|
||||
}
|
||||
|
||||
if (a_notnan)
|
||||
{
|
||||
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
|
||||
const auto cb = bitcast<f32[4]>(bitcast<s32[4]>(b) & ma);
|
||||
return fma32x4(a, eval(cb), c);
|
||||
}
|
||||
else if (b_notnan)
|
||||
{
|
||||
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
|
||||
const auto ca = bitcast<f32[4]>(bitcast<s32[4]>(a) & mb);
|
||||
return fma32x4(eval(ca), b, c);
|
||||
}
|
||||
|
||||
const auto ma = sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)));
|
||||
const auto mb = sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)));
|
||||
const auto ca = bitcast<f32[4]>(bitcast<s32[4]>(a) & mb);
|
||||
@ -7680,6 +7709,9 @@ public:
|
||||
const auto [a, b, c] = get_vrs<f32[4]>(op.ra, op.rb, op.rc);
|
||||
static const auto MT = match<f32[4]>();
|
||||
|
||||
const u32 a_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.ra) ? 1 : 0;
|
||||
const u32 b_notnan = m_block && m_block->block_wide_reg_store_elimination && m_block->is_gpr_not_NaN_hint(op.rb) ? 1 : 0;
|
||||
|
||||
auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool
|
||||
{
|
||||
auto match_fnms = [&](f32 float_value)
|
||||
@ -7875,7 +7907,13 @@ public:
|
||||
spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(c) found in FMA", m_hash, m_pos);
|
||||
}
|
||||
|
||||
set_vr(op.rt4, fma(a, b, c));
|
||||
if (!m_interp_magn && op.ra == op.rb)
|
||||
{
|
||||
set_vr(op.rt4, fma(a, a, c, splat<u32>(a_notnan), splat<u32>(a_notnan)));
|
||||
return;
|
||||
}
|
||||
|
||||
set_vr(op.rt4, fma(a, b, c, splat<u32>(a_notnan), splat<u32>(b_notnan)));
|
||||
}
|
||||
|
||||
template <typename T, typename U, typename V>
|
||||
|
||||
@ -742,6 +742,9 @@ protected:
|
||||
// Set if register is used in floating point instruction
|
||||
std::bitset<s_reg_max> reg_maybe_float{};
|
||||
|
||||
// Set if register is used as shuffle mask
|
||||
std::bitset<s_reg_max> reg_maybe_shuffle_mask{};
|
||||
|
||||
// Number of times registers are used (before modified)
|
||||
std::array<u32, s_reg_max> reg_use{};
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user