diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 1f172537b3..2d787e3893 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -1180,6 +1180,13 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) IntializeSpeculativeConstants(); } + BitSet32 previous_op_gpr_will_be_written = code_block.m_gpr_outputs; + BitSet32 previous_op_gpr_will_be_used = code_block.m_gpr_inputs | previous_op_gpr_will_be_written; + BitSet32 previous_op_fpr_will_be_written = code_block.m_fpr_outputs; + BitSet32 previous_op_fpr_will_be_used = code_block.m_fpr_inputs | previous_op_fpr_will_be_written; + BitSet8 previous_op_cr_will_be_written = code_block.m_cr_outputs; + BitSet8 previous_op_cr_will_be_used = code_block.m_cr_inputs | previous_op_cr_will_be_written; + // Translate instructions for (u32 i = 0; i < code_block.m_num_instructions; i++) { @@ -1390,22 +1397,34 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) // // To improve JIT-time performance, we use some extra bitwise math to skip trying to flush // registers that can't have changed state during the current PPC instruction. + if (!bJITRegisterCacheOff) { gpr.DiscardRegisters(op.gprDiscardable); fpr.DiscardRegisters(op.fprDiscardable); gpr.DiscardCRRegisters(op.crDiscardable); } - gpr.FlushRegisters(~(op.gprWillBeRead | op.gprWillBeWritten) & (op.regsIn | op.regsOut), - FlushMode::Full); - fpr.FlushRegisters(~(op.fprWillBeRead | op.fprWillBeWritten) & - (op.fregsIn | op.GetFregsOut()), - FlushMode::Full); - gpr.FlushCRRegisters(~(op.crWillBeRead | op.crWillBeWritten) & (op.crIn | op.crOut), - FlushMode::Full); - gpr.FlushRegisters(~op.gprWillBeWritten & op.regsOut, FlushMode::Undirty); - fpr.FlushRegisters(~op.fprWillBeWritten & op.GetFregsOut(), FlushMode::Undirty); - gpr.FlushCRRegisters(~op.crWillBeWritten & op.crOut, FlushMode::Undirty); + + const BitSet32 gpr_will_be_used = op.gprWillBeRead | op.gprWillBeWritten; + const BitSet32 fpr_will_be_used = op.fprWillBeRead | op.fprWillBeWritten; + const BitSet8 cr_will_be_used = op.crWillBeRead | op.crWillBeWritten; + + gpr.FlushRegisters(~op.gprWillBeWritten & previous_op_gpr_will_be_written, + FlushMode::Undirty); + fpr.FlushRegisters(~op.fprWillBeWritten & previous_op_fpr_will_be_written, + FlushMode::Undirty); + gpr.FlushCRRegisters(~op.crWillBeWritten & previous_op_cr_will_be_written, + FlushMode::Undirty); + gpr.FlushRegisters(~gpr_will_be_used & previous_op_gpr_will_be_used, FlushMode::Full); + fpr.FlushRegisters(~fpr_will_be_used & previous_op_fpr_will_be_used, FlushMode::Full); + gpr.FlushCRRegisters(~cr_will_be_used & previous_op_cr_will_be_used, FlushMode::Full); + + previous_op_gpr_will_be_written = op.gprWillBeWritten; + previous_op_gpr_will_be_used = gpr_will_be_used; + previous_op_fpr_will_be_written = op.fprWillBeWritten; + previous_op_fpr_will_be_used = fpr_will_be_used; + previous_op_cr_will_be_written = op.crWillBeWritten; + previous_op_cr_will_be_used = cr_will_be_used; if (opinfo->flags & FL_LOADSTORE) ++js.numLoadStoreInst; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 54f5d3b9bc..3a95f54d7b 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -559,7 +559,7 @@ void JitArm64::lmw(UGeckoInstruction inst) } } - BitSet32 gprs_to_undirty = ~js.op->gprWillBeWritten & BitSet32(0xFFFFFFFFU << d); + const BitSet32 gprs_to_undirty = ~js.op->gprWillBeWritten; BitSet32 gprs_to_flush = ~(js.op->gprWillBeWritten | js.op->gprWillBeRead); if (a_is_addr_base_reg) @@ -602,17 +602,26 @@ void JitArm64::lmw(UGeckoInstruction inst) else if (gprs_to_undirty[i]) { BitSet32 gprs_to_undirty_this_time{}; - if (i != 0 && gprs_to_undirty[i - 1]) + if (i != 0 && js.gpa.store_pairs[i - 1]) + { + // This is the second half of a pair. Store both registers using a single STP instruction. gprs_to_undirty_this_time = BitSet32{int(i - 1), int(i)}; - else if (i == 31 || !gprs_to_undirty[i + 1]) + } + else if (!js.gpa.store_pairs[i]) + { + // This isn't the first half of a pair, and we also know from earlier that it isn't the + // second half of a pair. Just store the register on its own. gprs_to_undirty_this_time = BitSet32{int(i)}; + } else + { + // This must be the first half of a pair. It will be flushed the next loop iteration. continue; + } gpr.FlushRegisters(gprs_to_undirty_this_time, FlushMode::Undirty, ARM64Reg::INVALID_REG); gpr.FlushRegisters(gprs_to_undirty_this_time & gprs_to_flush, FlushMode::Full, ARM64Reg::INVALID_REG); - gprs_to_undirty &= ~gprs_to_undirty_this_time; } } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 22bacc9271..8b04db164e 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -1186,6 +1186,12 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, } #ifdef _M_ARM_64 + // Make JitArm64 wait with storing one half of a pair until the other half is ready to be stored + op.gprWillBeWritten |= (op.gprWillBeWritten & block->m_gpa->store_pairs) << 1 | + ((op.gprWillBeWritten >> 1) & block->m_gpa->store_pairs); + // Equivalent calculations for fprWillBeWritten and crWillBeWritten are left out because + // JitArm64 isn't able to use STP when flushing those + // As a tie-break for odd-length runs of registers to assign load pairs for, if an instruction // that's early in a block has two adjacent registers as inputs, prefer putting those registers // in the same load pair. This is intended to let the host CPU start doing useful work as soon @@ -1208,6 +1214,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, block->m_gqr_used = gqrUsed; block->m_gqr_modified = gqrModified; block->m_gpr_inputs = gprWillBeRead; + block->m_gpr_outputs = gprWillBeWritten; + block->m_fpr_inputs = fprWillBeRead; + block->m_fpr_outputs = fprWillBeWritten; + block->m_cr_inputs = crWillBeRead; + block->m_cr_outputs = crWillBeWritten; #ifdef _M_ARM_64 OddLengthRunsToEvenLengthRuns(&fpr_load_pair_candidates); diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 71cc2d316c..3b45f21beb 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -139,6 +139,21 @@ struct CodeBlock // Which GPRs this block reads from before writing to, if any. BitSet32 m_gpr_inputs; + // Which GPRs this block writes to, if any. + BitSet32 m_gpr_outputs; + + // Which FPRs this block reads from before writing to, if any. + BitSet32 m_fpr_inputs; + + // Which FPRs this block writes to, if any. + BitSet32 m_fpr_outputs; + + // Which CRs this block reads from before writing to, if any. + BitSet8 m_cr_inputs; + + // Which CRs this block writes to, if any. + BitSet8 m_cr_outputs; + // Which memory locations are occupied by this block. std::set m_physical_addresses; };