mirror of
https://github.com/RPCS3/rpcs3.git
synced 2026-06-02 04:36:57 -06:00
SPU LLVM: Optimize SPU multiplies for ARM
- Saves 2 instructions in MPY, 1 instruction in MPYU, 2 instructions in MPYS, 2 instructions in MPYA, 1 instruction in MPYI, and 2 instructions in MPYUI
This commit is contained in:
parent
2613d7eee7
commit
7e436f9bf1
@ -3720,6 +3720,30 @@ template <typename T1, typename T2, typename T3>
|
||||
result.value = m_ir->CreateCall(get_intrinsic<u32[4], u8[16]>(llvm::Intrinsic::aarch64_neon_sdot), {data0, data1, data2});
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
value_t<s32[4]> smull(T1 a, T2 b)
|
||||
{
|
||||
value_t<s32[4]> result;
|
||||
|
||||
const auto data0 = a.eval(m_ir);
|
||||
const auto data1 = b.eval(m_ir);
|
||||
|
||||
result.value = m_ir->CreateCall(get_intrinsic<s32[4]>(llvm::Intrinsic::aarch64_neon_smull), {data0, data1});
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
value_t<u32[4]> umull(T1 a, T2 b)
|
||||
{
|
||||
value_t<u32[4]> result;
|
||||
|
||||
const auto data0 = a.eval(m_ir);
|
||||
const auto data1 = b.eval(m_ir);
|
||||
|
||||
result.value = m_ir->CreateCall(get_intrinsic<u32[4]>(llvm::Intrinsic::aarch64_neon_umull), {data0, data1});
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
auto addp(T1 a, T2 b)
|
||||
|
||||
@ -6218,7 +6218,12 @@ public:
|
||||
|
||||
// MPY: rt = product of the sign-extended 16-bit parts of ra and rb, per 32-bit lane.
void MPY(spu_opcode_t op)
{
#ifdef ARCH_ARM64
	// Pick the even-indexed 16-bit elements of each source (the counterpart of the
	// `<< 16 >> 16` sign-extension in the generic path) and multiply them with smull.
	const auto [a, b] = get_vrs<s32[4]>(op.ra, op.rb);
	const auto half_a = zshuffle(bitcast<s16[8]>(a), 0, 2, 4, 6);
	const auto half_b = zshuffle(bitcast<s16[8]>(b), 0, 2, 4, 6);
	set_vr(op.rt, smull(half_a, half_b));
#else
	// Shift left then arithmetic-right by 16 to sign-extend the low 16 bits of each lane.
	const auto lhs = get_vr<s32[4]>(op.ra) << 16 >> 16;
	const auto rhs = get_vr<s32[4]>(op.rb) << 16 >> 16;
	set_vr(op.rt, lhs * rhs);
#endif
}
|
||||
|
||||
void MPYH(spu_opcode_t op)
|
||||
@ -6233,7 +6238,12 @@ public:
|
||||
|
||||
// MPYS: same 16x16 signed product as MPY, then arithmetically shifted right by 16.
void MPYS(spu_opcode_t op)
{
#ifdef ARCH_ARM64
	// Even-indexed 16-bit elements of each source feed the smull helper;
	// the 32-bit product is then shifted right by 16.
	const auto [a, b] = get_vrs<s32[4]>(op.ra, op.rb);
	const auto half_a = zshuffle(bitcast<s16[8]>(a), 0, 2, 4, 6);
	const auto half_b = zshuffle(bitcast<s16[8]>(b), 0, 2, 4, 6);
	const auto product = smull(half_a, half_b);
	set_vr(op.rt, product >> 16);
#else
	// Sign-extend the low 16 bits of each lane, multiply, then shift the product right by 16.
	const auto lhs = get_vr<s32[4]>(op.ra) << 16 >> 16;
	const auto rhs = get_vr<s32[4]>(op.rb) << 16 >> 16;
	set_vr(op.rt, lhs * rhs >> 16);
#endif
}
|
||||
|
||||
void CEQH(spu_opcode_t op)
|
||||
@ -6243,7 +6253,12 @@ public:
|
||||
|
||||
// MPYU: unsigned 16x16 multiply of ra and rb per 32-bit lane, written to rt.
void MPYU(spu_opcode_t op)
{
#ifdef ARCH_ARM64
	// Even-indexed 16-bit elements of each source are multiplied with the umull helper.
	const auto [a, b] = get_vrs<u32[4]>(op.ra, op.rb);
	const auto half_a = zshuffle(bitcast<u16[8]>(a), 0, 2, 4, 6);
	const auto half_b = zshuffle(bitcast<u16[8]>(b), 0, 2, 4, 6);
	set_vr(op.rt, umull(half_a, half_b));
#else
	// Generic path defers to the shared mpyu helper.
	set_vr(op.rt, mpyu(get_vr(op.ra), get_vr(op.rb)));
#endif
}
|
||||
|
||||
void CEQB(spu_opcode_t op)
|
||||
@ -6385,12 +6400,20 @@ public:
|
||||
|
||||
// MPYI: signed multiply of the 16-bit part of ra by the si10 immediate, per 32-bit lane.
void MPYI(spu_opcode_t op)
{
#ifdef ARCH_ARM64
	// Even-indexed 16-bit elements of ra times the immediate materialized as s16[4], via smull.
	const auto half_a = zshuffle(bitcast<s16[8]>(get_vr<s32[4]>(op.ra)), 0, 2, 4, 6);
	const auto imm = get_imm<s16[4]>(op.si10);
	set_vr(op.rt, smull(half_a, imm));
#else
	// Sign-extend the low 16 bits of each lane, then multiply by the 32-bit immediate vector.
	const auto lhs = get_vr<s32[4]>(op.ra) << 16 >> 16;
	const auto imm = get_imm<s32[4]>(op.si10);
	set_vr(op.rt, lhs * imm);
#endif
}
|
||||
|
||||
// MPYUI: unsigned multiply of the 16-bit part of ra by the si10 immediate
// truncated to 16 bits, per 32-bit lane.
void MPYUI(spu_opcode_t op)
{
#ifdef ARCH_ARM64
	// Even-indexed 16-bit elements of ra times the immediate materialized as u16[4], via umull.
	const auto half_a = zshuffle(bitcast<u16[8]>(get_vr<u32[4]>(op.ra)), 0, 2, 4, 6);
	const auto imm = get_imm<u16[4]>(op.si10);
	set_vr(op.rt, umull(half_a, imm));
#else
	// Shift ra left then right by 16 and mask the immediate to its low 16 bits before multiplying.
	const auto lhs = get_vr(op.ra) << 16 >> 16;
	const auto imm = get_imm(op.si10) & 0xffff;
	set_vr(op.rt, lhs * imm);
#endif
}
|
||||
|
||||
void CEQI(spu_opcode_t op)
|
||||
@ -6904,7 +6927,12 @@ public:
|
||||
|
||||
// MPYA: signed 16x16 product of ra and rb plus rc, per 32-bit lane, written to rt4.
void MPYA(spu_opcode_t op)
{
#ifdef ARCH_ARM64
	// Even-indexed 16-bit elements of each source go through smull; rc is then added.
	const auto [a, b] = get_vrs<s32[4]>(op.ra, op.rb);
	const auto half_a = zshuffle(bitcast<s16[8]>(a), 0, 2, 4, 6);
	const auto half_b = zshuffle(bitcast<s16[8]>(b), 0, 2, 4, 6);
	const auto product = smull(half_a, half_b);
	set_vr(op.rt4, product + get_vr<s32[4]>(op.rc));
#else
	// Sign-extend the low 16 bits of each lane, multiply, then accumulate rc.
	const auto lhs = get_vr<s32[4]>(op.ra) << 16 >> 16;
	const auto rhs = get_vr<s32[4]>(op.rb) << 16 >> 16;
	set_vr(op.rt4, lhs * rhs + get_vr<s32[4]>(op.rc));
#endif
}
|
||||
|
||||
void FSCRRD(spu_opcode_t op) //
|
||||
|
||||
Loading…
Reference in New Issue
Block a user