iR5900: Faster FTOI

2025-12-16 04:08:48 +00:00 · 2025-05-23 02:02:58 -05:00 · 2025-05-23 02:02:58 -05:00 · c72e894fc7
commit c72e894fc7
parent 5bc2342d47
5 changed files with 22 additions and 77 deletions
--- a/pcsx2/FPU.cpp
+++ b/pcsx2/FPU.cpp
@ -249,7 +249,6 @@ void CTC1() {
 // Interpreter handler for CVT.S: stores (float)_FsValSl_ into _FdValf_.
 // NOTE(review): _FsValSl_ is presumably the signed 32-bit integer view of FPR[Fs] - confirm against the macro definition.
 void CVT_S() {
 	_FdValf_ = (float)_FsValSl_;
 	// Feed the raw bits of the result (_FdValUl_) back through fpuDouble() and store what it returns.
 	// fpuDouble() is defined elsewhere; NOTE(review): presumably it normalises the value - confirm.
 	_FdValf_ = fpuDouble( _FdValUl_ );
 }
 void CVT_W() {
--- a/pcsx2/x86/iFPU.cpp
+++ b/pcsx2/x86/iFPU.cpp
@ -26,8 +26,6 @@ namespace DOUBLE
 	void recC_EQ_xmm(int info);
 	void recC_LE_xmm(int info);
 	void recC_LT_xmm(int info);
 	void recCVT_S_xmm(int info);
 	void recCVT_W();
 	void recDIV_S_xmm(int info);
 	void recMADD_S_xmm(int info);
 	void recMADDA_S_xmm(int info);
@ -993,15 +991,16 @@ void recCVT_S_xmm(int info)
 	}
 }
-FPURECOMPILE_CONSTCODE(CVT_S, XMMINFO_WRITED | XMMINFO_READS);
+void recCVT_S()
 {
 	// Float version is fully accurate, no double version
 	eeFPURecompileCode(recCVT_S_xmm, R5900::Interpreter::OpcodeImpl::COP1::CVT_S, XMMINFO_WRITED | XMMINFO_READS);
 }
 void recCVT_W()
 {
-	if (CHECK_FPU_FULL)
+	// Float version is fully accurate, no double version
-	{
+
 		DOUBLE::recCVT_W();
 		return;
 	}
 	// If we have the following EmitOP() on the top then it'll get calculated twice when CHECK_FPU_FULL is true
 	// as we also have an EmitOP() at recCVT_W() on iFPUd.cpp.  hence we have it below the possible return.
 	EE::Profiler.EmitOp(eeOpcode::CVTW);
@ -1010,26 +1009,23 @@ void recCVT_W()
 	if (regs >= 0)
 	{
 		if (CHECK_FPU_EXTRA_OVERFLOW)
 			fpuFloat2(regs);
 		xCVTTSS2SI(eax, xRegisterSSE(regs));
-		xMOVMSKPS(edx, xRegisterSSE(regs)); //extract the signs
+		xMOVD(edx, xRegisterSSE(regs));
 		xAND(edx, 1); // keep only LSB
 	}
 	else
 	{
 		xCVTTSS2SI(eax, ptr32[&fpuRegs.fpr[_Fs_]]);
 		xMOV(edx, ptr[&fpuRegs.fpr[_Fs_]]);
 		xSHR(edx, 31); // mov sign to lsb
 	}
 	//kill register allocation for dst because we write directly to fpuRegs.fpr[_Fd_]
 	_deleteFPtoXMMreg(_Fd_, DELETE_REG_FREE_NO_WRITEBACK);
-	xADD(edx, 0x7FFFFFFF); // 0x7FFFFFFF if positive, 0x8000 0000 if negative
+	// cvttss2si converts unrepresentable values to 0x80000000, so negative values are already handled.
-
+	// So we just need to handle positive values.
-	xCMP(eax, 0x80000000); // If the result is indefinitive
+	xCMP(edx, 0x4f000000); // If the input is greater than INT_MAX
-	xCMOVE(eax, edx);      // Saturate it
+	xMOV(edx, 0x7fffffff);
 	xCMOVGE(eax, edx);     // Saturate it
 	//Write the result
 	xMOV(ptr[&fpuRegs.fpr[_Fd_]], eax);
--- a/pcsx2/x86/iFPUd.cpp
+++ b/pcsx2/x86/iFPUd.cpp
@ -540,57 +540,10 @@ FPURECOMPILE_CONSTCODE(C_LT, XMMINFO_READS | XMMINFO_READT);
 //------------------------------------------------------------------
 // CVT.x XMM
 //------------------------------------------------------------------
 void recCVT_S_xmm(int info)
 {
 	EE::Profiler.EmitOp(eeOpcode::CVTS_F);
-	if (info & PROCESS_EE_D)
+// CVT.S: Identical to non-double variant, omitted
-	{
+// CVT.W: Identical to non-double variant, omitted
 		if (info & PROCESS_EE_S)
 			xCVTDQ2PS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
 		else
 			xCVTSI2SS(xRegisterSSE(EEREC_D), ptr32[&fpuRegs.fpr[_Fs_]]);
 	}
 	else
 	{
 		const int temp = _allocTempXMMreg(XMMT_FPS);
 		xCVTSI2SS(xRegisterSSE(temp), ptr32[&fpuRegs.fpr[_Fs_]]);
 		xMOVSS(ptr32[&fpuRegs.fpr[_Fd_]], xRegisterSSE(temp));
 		_freeXMMreg(temp);
 	}
 }
 FPURECOMPILE_CONSTCODE(CVT_S, XMMINFO_WRITED | XMMINFO_READS);
 // Emits the code for CVT.W (float -> 32-bit integer with saturation) and stores the
 // result directly into fpuRegs.fpr[_Fd_]. Called from iFPU.cpp's recCVT_W.
 //
 // Sequence: eax receives the truncated conversion of Fs, edx receives the sign bit of Fs.
 // If the conversion produced 0x80000000, eax is replaced by 0x7FFFFFFF for a positive
 // input or 0x80000000 for a negative one.
 void recCVT_W() //called from iFPU.cpp's recCVT_W
 {
 	EE::Profiler.EmitOp(eeOpcode::CVTW);
 	// A non-negative result is used below as the XMM register index holding Fs.
 	int regs = _checkXMMreg(XMMTYPE_FPREG, _Fs_, MODE_READ);
 	if (regs >= 0)
 	{
 		// Source is in an XMM register: convert from it and take its sign from the move-mask.
 		xCVTTSS2SI(eax, xRegisterSSE(regs));
 		xMOVMSKPS(edx, xRegisterSSE(regs)); // extract the sign bits
 		xAND(edx, 1);                       // keep only the LSB
 	}
 	else
 	{
 		// Otherwise convert straight from fpuRegs.fpr[_Fs_] in memory and read the sign from its raw bits.
 		xCVTTSS2SI(eax, ptr32[&fpuRegs.fpr[_Fs_]]);
 		xMOV(edx, ptr[&fpuRegs.fpr[_Fs_]]);
 		xSHR(edx, 31); // move the sign bit to the LSB
 	}
 	// Kill the register allocation for dst because we write directly to fpuRegs.fpr[_Fd_]
 	_deleteFPtoXMMreg(_Fd_, DELETE_REG_FREE_NO_WRITEBACK);
 	xADD(edx, 0x7FFFFFFF); // 0x7FFFFFFF if positive, 0x80000000 if negative
 	xCMP(eax, 0x80000000); // If the result is the integer indefinite value
 	xCMOVE(eax, edx);      // Saturate it
 	// Write the result
 	xMOV(ptr[&fpuRegs.fpr[_Fd_]], eax);
 }
 //------------------------------------------------------------------
--- a/pcsx2/x86/microVU_Misc.h
+++ b/pcsx2/x86/microVU_Misc.h
@ -42,6 +42,7 @@ struct mVU_Globals
 	u32   E4      [4] = __four(0x3933e553);
 	u32   E5      [4] = __four(0x36b63510);
 	u32   E6      [4] = __four(0x353961ac);
 	u32   I32MAXF [4] = __four(0x4effffff);
 	float FTOI_4  [4] = __four(16.0);
 	float FTOI_12 [4] = __four(4096.0);
 	float FTOI_15 [4] = __four(32768.0);
--- a/pcsx2/x86/microVU_Upper.inl
+++ b/pcsx2/x86/microVU_Upper.inl
@ -484,23 +484,19 @@ static void mVU_FTOIx(mP, const float* addr, microOpcode opEnum)
 			return;
 		const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
 		const xmm& t1 = mVU.regAlloc->allocReg();
 		const xmm& t2 = mVU.regAlloc->allocReg();
-		// Note: For help understanding this algorithm see recVUMI_FTOI_Saturate()
+		// cvttps2dq returns 0x80000000 for any unrepresentable values.
-		xMOVAPS(t1, Fs);
+		// We want it to return 0x80000000 for negative and 0x7fffffff for positive.
 		// So for unrepresentable positive values, xor with 0xffffffff to turn 0x80000000 into 0x7fffffff.
 		if (addr)
 			xMUL.PS(Fs, ptr128[addr]);
 		xMOVAPS(t1, Fs);
 		xPCMP.GTD(t1, ptr128[mVUglob.I32MAXF]);
 		xCVTTPS2DQ(Fs, Fs);
-		xPXOR(t1, ptr128[mVUglob.signbit]);
+		xPXOR(Fs, t1);
 		xPSRA.D(t1, 31);
 		xMOVAPS(t2, Fs);
 		xPCMP.EQD(t2, ptr128[mVUglob.signbit]);
 		xAND.PS(t1, t2);
 		xPADD.D(Fs, t1);
 		mVU.regAlloc->clearNeeded(Fs);
 		mVU.regAlloc->clearNeeded(t1);
 		mVU.regAlloc->clearNeeded(t2);
 		mVU.profiler.EmitOp(opEnum);
 	}
 	pass3