mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-12-16 04:09:39 +00:00
JitArm64: Use STP for paired m_ppc_state GPRs
Arm64GPRCache already uses STP when flushing two adjacent registers in the same FlushRegisters call, so we can accomplish this by tweaking PPCAnalyst so that registers we've paired get flushed at the same time. I had to rework the bitwise logic in Jit.cpp and the STP logic in the lmw implementation, since there is no longer any guarantee that every register that goes "out of use" at a given instruction is actually used by that instruction.
This commit is contained in:
parent
22da41ff13
commit
a18aa5c83a
@ -1180,6 +1180,13 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
|
||||
IntializeSpeculativeConstants();
|
||||
}
|
||||
|
||||
BitSet32 previous_op_gpr_will_be_written = code_block.m_gpr_outputs;
|
||||
BitSet32 previous_op_gpr_will_be_used = code_block.m_gpr_inputs | previous_op_gpr_will_be_written;
|
||||
BitSet32 previous_op_fpr_will_be_written = code_block.m_fpr_outputs;
|
||||
BitSet32 previous_op_fpr_will_be_used = code_block.m_fpr_inputs | previous_op_fpr_will_be_written;
|
||||
BitSet8 previous_op_cr_will_be_written = code_block.m_cr_outputs;
|
||||
BitSet8 previous_op_cr_will_be_used = code_block.m_cr_inputs | previous_op_cr_will_be_written;
|
||||
|
||||
// Translate instructions
|
||||
for (u32 i = 0; i < code_block.m_num_instructions; i++)
|
||||
{
|
||||
@ -1390,22 +1397,34 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
|
||||
//
|
||||
// To improve JIT-time performance, we use some extra bitwise math to skip trying to flush
|
||||
// registers that can't have changed state during the current PPC instruction.
|
||||
|
||||
if (!bJITRegisterCacheOff)
|
||||
{
|
||||
gpr.DiscardRegisters(op.gprDiscardable);
|
||||
fpr.DiscardRegisters(op.fprDiscardable);
|
||||
gpr.DiscardCRRegisters(op.crDiscardable);
|
||||
}
|
||||
gpr.FlushRegisters(~(op.gprWillBeRead | op.gprWillBeWritten) & (op.regsIn | op.regsOut),
|
||||
FlushMode::Full);
|
||||
fpr.FlushRegisters(~(op.fprWillBeRead | op.fprWillBeWritten) &
|
||||
(op.fregsIn | op.GetFregsOut()),
|
||||
FlushMode::Full);
|
||||
gpr.FlushCRRegisters(~(op.crWillBeRead | op.crWillBeWritten) & (op.crIn | op.crOut),
|
||||
FlushMode::Full);
|
||||
gpr.FlushRegisters(~op.gprWillBeWritten & op.regsOut, FlushMode::Undirty);
|
||||
fpr.FlushRegisters(~op.fprWillBeWritten & op.GetFregsOut(), FlushMode::Undirty);
|
||||
gpr.FlushCRRegisters(~op.crWillBeWritten & op.crOut, FlushMode::Undirty);
|
||||
|
||||
const BitSet32 gpr_will_be_used = op.gprWillBeRead | op.gprWillBeWritten;
|
||||
const BitSet32 fpr_will_be_used = op.fprWillBeRead | op.fprWillBeWritten;
|
||||
const BitSet8 cr_will_be_used = op.crWillBeRead | op.crWillBeWritten;
|
||||
|
||||
gpr.FlushRegisters(~op.gprWillBeWritten & previous_op_gpr_will_be_written,
|
||||
FlushMode::Undirty);
|
||||
fpr.FlushRegisters(~op.fprWillBeWritten & previous_op_fpr_will_be_written,
|
||||
FlushMode::Undirty);
|
||||
gpr.FlushCRRegisters(~op.crWillBeWritten & previous_op_cr_will_be_written,
|
||||
FlushMode::Undirty);
|
||||
gpr.FlushRegisters(~gpr_will_be_used & previous_op_gpr_will_be_used, FlushMode::Full);
|
||||
fpr.FlushRegisters(~fpr_will_be_used & previous_op_fpr_will_be_used, FlushMode::Full);
|
||||
gpr.FlushCRRegisters(~cr_will_be_used & previous_op_cr_will_be_used, FlushMode::Full);
|
||||
|
||||
previous_op_gpr_will_be_written = op.gprWillBeWritten;
|
||||
previous_op_gpr_will_be_used = gpr_will_be_used;
|
||||
previous_op_fpr_will_be_written = op.fprWillBeWritten;
|
||||
previous_op_fpr_will_be_used = fpr_will_be_used;
|
||||
previous_op_cr_will_be_written = op.crWillBeWritten;
|
||||
previous_op_cr_will_be_used = cr_will_be_used;
|
||||
|
||||
if (opinfo->flags & FL_LOADSTORE)
|
||||
++js.numLoadStoreInst;
|
||||
|
||||
@ -559,7 +559,7 @@ void JitArm64::lmw(UGeckoInstruction inst)
|
||||
}
|
||||
}
|
||||
|
||||
BitSet32 gprs_to_undirty = ~js.op->gprWillBeWritten & BitSet32(0xFFFFFFFFU << d);
|
||||
const BitSet32 gprs_to_undirty = ~js.op->gprWillBeWritten;
|
||||
|
||||
BitSet32 gprs_to_flush = ~(js.op->gprWillBeWritten | js.op->gprWillBeRead);
|
||||
if (a_is_addr_base_reg)
|
||||
@ -602,17 +602,26 @@ void JitArm64::lmw(UGeckoInstruction inst)
|
||||
else if (gprs_to_undirty[i])
|
||||
{
|
||||
BitSet32 gprs_to_undirty_this_time{};
|
||||
if (i != 0 && gprs_to_undirty[i - 1])
|
||||
if (i != 0 && js.gpa.store_pairs[i - 1])
|
||||
{
|
||||
// This is the second half of a pair. Store both registers using a single STP instruction.
|
||||
gprs_to_undirty_this_time = BitSet32{int(i - 1), int(i)};
|
||||
else if (i == 31 || !gprs_to_undirty[i + 1])
|
||||
}
|
||||
else if (!js.gpa.store_pairs[i])
|
||||
{
|
||||
// This isn't the first half of a pair, and we also know from earlier that it isn't the
|
||||
// second half of a pair. Just store the register on its own.
|
||||
gprs_to_undirty_this_time = BitSet32{int(i)};
|
||||
}
|
||||
else
|
||||
{
|
||||
// This must be the first half of a pair. It will be flushed the next loop iteration.
|
||||
continue;
|
||||
}
|
||||
|
||||
gpr.FlushRegisters(gprs_to_undirty_this_time, FlushMode::Undirty, ARM64Reg::INVALID_REG);
|
||||
gpr.FlushRegisters(gprs_to_undirty_this_time & gprs_to_flush, FlushMode::Full,
|
||||
ARM64Reg::INVALID_REG);
|
||||
gprs_to_undirty &= ~gprs_to_undirty_this_time;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1186,6 +1186,12 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
|
||||
}
|
||||
|
||||
#ifdef _M_ARM_64
|
||||
// Make JitArm64 wait with storing one half of a pair until the other half is ready to be stored
|
||||
op.gprWillBeWritten |= (op.gprWillBeWritten & block->m_gpa->store_pairs) << 1 |
|
||||
((op.gprWillBeWritten >> 1) & block->m_gpa->store_pairs);
|
||||
// Equivalent calculations for fprWillBeWritten and crWillBeWritten are left out because
|
||||
// JitArm64 isn't able to use STP when flushing those
|
||||
|
||||
// As a tie-break for odd-length runs of registers to assign load pairs for, if an instruction
|
||||
// that's early in a block has two adjacent registers as inputs, prefer putting those registers
|
||||
// in the same load pair. This is intended to let the host CPU start doing useful work as soon
|
||||
@ -1208,6 +1214,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
|
||||
block->m_gqr_used = gqrUsed;
|
||||
block->m_gqr_modified = gqrModified;
|
||||
block->m_gpr_inputs = gprWillBeRead;
|
||||
block->m_gpr_outputs = gprWillBeWritten;
|
||||
block->m_fpr_inputs = fprWillBeRead;
|
||||
block->m_fpr_outputs = fprWillBeWritten;
|
||||
block->m_cr_inputs = crWillBeRead;
|
||||
block->m_cr_outputs = crWillBeWritten;
|
||||
|
||||
#ifdef _M_ARM_64
|
||||
OddLengthRunsToEvenLengthRuns(&fpr_load_pair_candidates);
|
||||
|
||||
@ -139,6 +139,21 @@ struct CodeBlock
|
||||
// Which GPRs this block reads from before writing to, if any.
|
||||
BitSet32 m_gpr_inputs;
|
||||
|
||||
// Which GPRs this block writes to, if any.
|
||||
BitSet32 m_gpr_outputs;
|
||||
|
||||
// Which FPRs this block reads from before writing to, if any.
|
||||
BitSet32 m_fpr_inputs;
|
||||
|
||||
// Which FPRs this block writes to, if any.
|
||||
BitSet32 m_fpr_outputs;
|
||||
|
||||
// Which CRs this block reads from before writing to, if any.
|
||||
BitSet8 m_cr_inputs;
|
||||
|
||||
// Which CRs this block writes to, if any.
|
||||
BitSet8 m_cr_outputs;
|
||||
|
||||
// Which memory locations are occupied by this block.
|
||||
std::set<u32> m_physical_addresses;
|
||||
};
|
||||
|
||||
Loading…
Reference in New Issue
Block a user