From 7e436f9bf136ad00321a97c09fb371fbd4eafe6b Mon Sep 17 00:00:00 2001 From: Malcolm Date: Mon, 18 May 2026 16:13:05 -0400 Subject: [PATCH] SPU LLVM: Optimize SPU multiplies for ARM - Saves 2 instructions in MPY, 1 instruction in MPYU, 2 instructions in MPYS, 2 instructions in MPYA, 1 instruction in MPYI, and 2 instructions in MPYUI --- rpcs3/Emu/CPU/CPUTranslator.h | 24 ++++++++++++++++++++++++ rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index ab2aed8156..bf8c8c4556 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -3720,6 +3720,30 @@ template result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_sdot), {data0, data1, data2}); return result; } + + /* smull: evaluates operands a and b against m_ir and emits a call to the llvm::Intrinsic::aarch64_neon_smull intrinsic with them; the call's value is returned wrapped in a value_t. NOTE(review): presumably a signed widening multiply, but the template parameter list and the value_t element type are not visible in this text - confirm operand and result vector types. */ template value_t smull(T1 a, T2 b) + { + value_t result; + + const auto data0 = a.eval(m_ir); + const auto data1 = b.eval(m_ir); + + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_smull), {data0, data1}); + return result; + } + + /* umull: same shape as smull above, but emits llvm::Intrinsic::aarch64_neon_umull. NOTE(review): presumably the unsigned counterpart; element types are not visible in this text - confirm. */ template value_t umull(T1 a, T2 b) + { + value_t result; + + const auto data0 = a.eval(m_ir); + const auto data1 = b.eval(m_ir); + + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_umull), {data0, data1}); + return result; + } template auto addp(T1 a, T2 b) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 4140942e2e..1ba19c739b 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -6218,7 +6218,12 @@ public: void MPY(spu_opcode_t op) { +#ifdef ARCH_ARM64 + /* ARM64 path: pick elements 0, 2, 4, 6 of each bitcast operand and multiply them with smull, storing the product in rt. Presumably these are the low halfwords that the generic path isolates with << 16 >> 16 - TODO confirm the bitcast element type, which is not visible here. */ const auto [a, b] = get_vrs(op.ra, op.rb); + set_vr(op.rt, smull(zshuffle(bitcast(a), 0, 2, 4, 6), zshuffle(bitcast(b), 0, 2, 4, 6))); +#else set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_vr(op.rb) << 16 >> 16)); +#endif } void MPYH(spu_opcode_t op) @@ -6233,7 +6238,12 @@ public: void MPYS(spu_opcode_t op) {
+#ifdef ARCH_ARM64 + /* ARM64 path: smull of elements 0, 2, 4, 6 of both operands, then the same >> 16 on the product that the generic path applies. */ const auto [a, b] = get_vrs(op.ra, op.rb); + set_vr(op.rt, smull(zshuffle(bitcast(a), 0, 2, 4, 6), zshuffle(bitcast(b), 0, 2, 4, 6)) >> 16); +#else set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_vr(op.rb) << 16 >> 16) >> 16); +#endif } void CEQH(spu_opcode_t op) @@ -6243,7 +6253,12 @@ public: void MPYU(spu_opcode_t op) { +#ifdef ARCH_ARM64 + /* ARM64 path: umull of elements 0, 2, 4, 6 of both operands replaces the mpyu() helper used by the generic path. */ const auto [a, b] = get_vrs(op.ra, op.rb); + set_vr(op.rt, umull(zshuffle(bitcast(a), 0, 2, 4, 6), zshuffle(bitcast(b), 0, 2, 4, 6))); +#else set_vr(op.rt, mpyu(get_vr(op.ra), get_vr(op.rb))); +#endif } void CEQB(spu_opcode_t op) @@ -6385,12 +6400,20 @@ public: void MPYI(spu_opcode_t op) { +#ifdef ARCH_ARM64 + /* ARM64 path: smull of elements 0, 2, 4, 6 of ra with the si10 immediate vector. NOTE(review): the get_imm element type is not visible here - confirm it matches the shuffled operand. */ set_vr(op.rt, smull(zshuffle(bitcast(get_vr(op.ra)), 0, 2, 4, 6), get_imm(op.si10))); +#else set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * get_imm(op.si10)); +#endif } void MPYUI(spu_opcode_t op) { +#ifdef ARCH_ARM64 + /* ARM64 path: umull of elements 0, 2, 4, 6 of ra with the si10 immediate vector. NOTE(review): the generic path masks the immediate with 0xffff before multiplying while this path passes get_imm(op.si10) straight to umull - confirm the immediate's element type gives the same truncation. */ set_vr(op.rt, umull(zshuffle(bitcast(get_vr(op.ra)), 0, 2, 4, 6), get_imm(op.si10))); +#else set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_imm(op.si10) & 0xffff)); +#endif } void CEQI(spu_opcode_t op) @@ -6904,7 +6927,12 @@ public: void MPYA(spu_opcode_t op) { +#ifdef ARCH_ARM64 + /* ARM64 path: smull of elements 0, 2, 4, 6 of ra and rb, then add rc; the result is written to rt4, as in the generic path. */ const auto [a, b] = get_vrs(op.ra, op.rb); + set_vr(op.rt4, smull(zshuffle(bitcast(a), 0, 2, 4, 6), zshuffle(bitcast(b), 0, 2, 4, 6)) + get_vr(op.rc)); +#else set_vr(op.rt4, (get_vr(op.ra) << 16 >> 16) * (get_vr(op.rb) << 16 >> 16) + get_vr(op.rc)); +#endif } void FSCRRD(spu_opcode_t op) //