iR5900: Faster FTOI

This commit is contained in:
TellowKrinkle 2025-05-23 02:02:58 -05:00 committed by lightningterror
parent 5bc2342d47
commit c72e894fc7
5 changed files with 22 additions and 77 deletions

View File

@ -249,7 +249,6 @@ void CTC1() {
// Interpreter handler for CVT.S: writes the float conversion of Fs's integer value into Fd.
void CVT_S() {
// Numeric (not bitwise) conversion: the _FsValSl_ view of Fs is cast to float.
// NOTE(review): _FsValSl_ is presumably the signed 32-bit view of the register — confirm against the macro.
_FdValf_ = (float)_FsValSl_;
// Re-reads the Fd bits that were just written, passes them through fpuDouble(), and stores the result back.
// NOTE(review): fpuDouble() is defined outside this view; its exact effect on the value should be confirmed there.
_FdValf_ = fpuDouble( _FdValUl_ );
}
void CVT_W() {

View File

@ -26,8 +26,6 @@ namespace DOUBLE
void recC_EQ_xmm(int info);
void recC_LE_xmm(int info);
void recC_LT_xmm(int info);
void recCVT_S_xmm(int info);
void recCVT_W();
void recDIV_S_xmm(int info);
void recMADD_S_xmm(int info);
void recMADDA_S_xmm(int info);
@ -993,15 +991,16 @@ void recCVT_S_xmm(int info)
}
}
FPURECOMPILE_CONSTCODE(CVT_S, XMMINFO_WRITED | XMMINFO_READS);
void recCVT_S()
{
	// CVT.S always goes through the single-precision emitter: that version is
	// already fully accurate, so no double-precision variant exists to dispatch to.
	const int xmmInfo = XMMINFO_WRITED | XMMINFO_READS;
	eeFPURecompileCode(recCVT_S_xmm, R5900::Interpreter::OpcodeImpl::COP1::CVT_S, xmmInfo);
}
void recCVT_W()
{
if (CHECK_FPU_FULL)
{
DOUBLE::recCVT_W();
return;
}
// Float version is fully accurate, no double version
// If the following EmitOp() were placed at the top, the op would be counted twice when CHECK_FPU_FULL is true,
// because recCVT_W() in iFPUd.cpp also calls EmitOp(). Hence it is placed below the possible early return.
EE::Profiler.EmitOp(eeOpcode::CVTW);
@ -1010,26 +1009,23 @@ void recCVT_W()
if (regs >= 0)
{
if (CHECK_FPU_EXTRA_OVERFLOW)
fpuFloat2(regs);
xCVTTSS2SI(eax, xRegisterSSE(regs));
xMOVMSKPS(edx, xRegisterSSE(regs)); //extract the signs
xAND(edx, 1); // keep only LSB
xMOVD(edx, xRegisterSSE(regs));
}
else
{
xCVTTSS2SI(eax, ptr32[&fpuRegs.fpr[_Fs_]]);
xMOV(edx, ptr[&fpuRegs.fpr[_Fs_]]);
xSHR(edx, 31); // mov sign to lsb
}
//kill register allocation for dst because we write directly to fpuRegs.fpr[_Fd_]
_deleteFPtoXMMreg(_Fd_, DELETE_REG_FREE_NO_WRITEBACK);
xADD(edx, 0x7FFFFFFF); // 0x7FFFFFFF if positive, 0x8000 0000 if negative
xCMP(eax, 0x80000000); // If the result is indefinite
xCMOVE(eax, edx); // Saturate it
// cvttss2si converts unrepresentable values to 0x80000000, so negative values are already handled.
// So we just need to handle positive values.
xCMP(edx, 0x4f000000); // If the input is greater than INT_MAX
xMOV(edx, 0x7fffffff);
xCMOVGE(eax, edx); // Saturate it
//Write the result
xMOV(ptr[&fpuRegs.fpr[_Fd_]], eax);

View File

@ -540,57 +540,10 @@ FPURECOMPILE_CONSTCODE(C_LT, XMMINFO_READS | XMMINFO_READT);
//------------------------------------------------------------------
// CVT.x XMM
//------------------------------------------------------------------
// Emits the code for CVT.S: integer value of Fs converted to float and placed in Fd.
// `info` carries the PROCESS_EE_* flags that select which operands use XMM registers
// (EEREC_D / EEREC_S) and which are accessed through fpuRegs.fpr[] in memory.
void recCVT_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::CVTS_F);

	const bool dstFlagged = (info & PROCESS_EE_D) != 0;
	if (!dstFlagged)
	{
		// No destination register flagged: convert from memory into a scratch
		// XMM register, store the result straight to fpr[_Fd_], then release it.
		const int scratch = _allocTempXMMreg(XMMT_FPS);
		xCVTSI2SS(xRegisterSSE(scratch), ptr32[&fpuRegs.fpr[_Fs_]]);
		xMOVSS(ptr32[&fpuRegs.fpr[_Fd_]], xRegisterSSE(scratch));
		_freeXMMreg(scratch);
		return;
	}

	// Destination register is available; the source is either a register or memory.
	const bool srcFlagged = (info & PROCESS_EE_S) != 0;
	if (srcFlagged)
		xCVTDQ2PS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
	else
		xCVTSI2SS(xRegisterSSE(EEREC_D), ptr32[&fpuRegs.fpr[_Fs_]]);
}
// CVT.S: Identical to non-double variant, omitted
// CVT.W: Identical to non-double variant, omitted
FPURECOMPILE_CONSTCODE(CVT_S, XMMINFO_WRITED | XMMINFO_READS);
// Emits the code for CVT.W: truncating float -> int32 conversion of Fs, saturated,
// with the result written directly to fpuRegs.fpr[_Fd_].
void recCVT_W() //called from iFPU.cpp's recCVT_W
{
	EE::Profiler.EmitOp(eeOpcode::CVTW);

	// eax receives the truncated conversion; edx receives the input's sign bit (0 or 1).
	const int fsXmm = _checkXMMreg(XMMTYPE_FPREG, _Fs_, MODE_READ);
	if (fsXmm < 0)
	{
		// Fs is not in an XMM register: convert and fetch the sign from memory.
		xCVTTSS2SI(eax, ptr32[&fpuRegs.fpr[_Fs_]]);
		xMOV(edx, ptr[&fpuRegs.fpr[_Fs_]]);
		xSHR(edx, 31); // shift the sign bit down to the LSB
	}
	else
	{
		// Fs is held in an XMM register: convert and take the sign from it.
		xCVTTSS2SI(eax, xRegisterSSE(fsXmm));
		xMOVMSKPS(edx, xRegisterSSE(fsXmm)); // gather the sign bits
		xAND(edx, 1); // only the lowest lane's sign is wanted
	}

	// The result goes straight to fpuRegs.fpr[_Fd_], so drop any register
	// allocation for the destination without writing it back.
	_deleteFPtoXMMreg(_Fd_, DELETE_REG_FREE_NO_WRITEBACK);

	xADD(edx, 0x7FFFFFFF); // 0x7FFFFFFF for positive input, 0x80000000 for negative
	xCMP(eax, 0x80000000); // did the conversion yield the indefinite value?
	xCMOVE(eax, edx); // if so, replace it with the saturated value

	// Store the final integer result
	xMOV(ptr[&fpuRegs.fpr[_Fd_]], eax);
}
//------------------------------------------------------------------

View File

@ -42,6 +42,7 @@ struct mVU_Globals
u32 E4 [4] = __four(0x3933e553);
u32 E5 [4] = __four(0x36b63510);
u32 E6 [4] = __four(0x353961ac);
// Bit pattern of 2147483520.0f, the largest float below 2^31 (2^31 itself is 0x4f000000).
// mVU_FTOIx does a signed integer compare of the raw float bits against this value
// to flag positive inputs that are too large to convert to a 32-bit integer.
u32 I32MAXF [4] = __four(0x4effffff);
// Powers of two: 16 = 2^4, 4096 = 2^12, 32768 = 2^15.
// NOTE(review): presumably the multipliers passed as `addr` to mVU_FTOIx for the fixed-point FTOI variants — confirm at the call sites.
float FTOI_4 [4] = __four(16.0);
float FTOI_12 [4] = __four(4096.0);
float FTOI_15 [4] = __four(32768.0);

View File

@ -484,23 +484,19 @@ static void mVU_FTOIx(mP, const float* addr, microOpcode opEnum)
return;
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W, !((_Fs_ == _Ft_) && (_X_Y_Z_W == 0xf)));
const xmm& t1 = mVU.regAlloc->allocReg();
const xmm& t2 = mVU.regAlloc->allocReg();
// Note: For help understanding this algorithm see recVUMI_FTOI_Saturate()
xMOVAPS(t1, Fs);
// cvttps2dq returns 0x80000000 for any unrepresentable values.
// We want it to return 0x80000000 for negative and 0x7fffffff for positive.
// So for unrepresentable positive values, xor with 0xffffffff to turn 0x80000000 into 0x7fffffff.
if (addr)
xMUL.PS(Fs, ptr128[addr]);
xMOVAPS(t1, Fs);
xPCMP.GTD(t1, ptr128[mVUglob.I32MAXF]);
xCVTTPS2DQ(Fs, Fs);
xPXOR(t1, ptr128[mVUglob.signbit]);
xPSRA.D(t1, 31);
xMOVAPS(t2, Fs);
xPCMP.EQD(t2, ptr128[mVUglob.signbit]);
xAND.PS(t1, t2);
xPADD.D(Fs, t1);
xPXOR(Fs, t1);
mVU.regAlloc->clearNeeded(Fs);
mVU.regAlloc->clearNeeded(t1);
mVU.regAlloc->clearNeeded(t2);
mVU.profiler.EmitOp(opEnum);
}
pass3