JitArm64: Use STP for paired m_ppc_state GPRs

Arm64GPRCache already uses STP when flushing two adjacent registers in
the same FlushRegisters call, so we can accomplish this by tweaking
PPCAnalyst so that registers we've paired get flushed at the same time.

I had to rework the bitwise logic in Jit.cpp and the STP logic in the
lmw implementation, since there is no longer any guarantee that every
register that goes "out of use" at a given instruction is actually used
by that instruction.
This commit is contained in:
JosJuice 2023-12-03 14:51:50 +01:00
parent 22da41ff13
commit a18aa5c83a
4 changed files with 68 additions and 14 deletions

View File

@ -1180,6 +1180,13 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
IntializeSpeculativeConstants();
}
BitSet32 previous_op_gpr_will_be_written = code_block.m_gpr_outputs;
BitSet32 previous_op_gpr_will_be_used = code_block.m_gpr_inputs | previous_op_gpr_will_be_written;
BitSet32 previous_op_fpr_will_be_written = code_block.m_fpr_outputs;
BitSet32 previous_op_fpr_will_be_used = code_block.m_fpr_inputs | previous_op_fpr_will_be_written;
BitSet8 previous_op_cr_will_be_written = code_block.m_cr_outputs;
BitSet8 previous_op_cr_will_be_used = code_block.m_cr_inputs | previous_op_cr_will_be_written;
// Translate instructions
for (u32 i = 0; i < code_block.m_num_instructions; i++)
{
@ -1390,22 +1397,34 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
//
// To improve JIT-time performance, we use some extra bitwise math to skip trying to flush
// registers that can't have changed state during the current PPC instruction.
if (!bJITRegisterCacheOff)
{
gpr.DiscardRegisters(op.gprDiscardable);
fpr.DiscardRegisters(op.fprDiscardable);
gpr.DiscardCRRegisters(op.crDiscardable);
}
gpr.FlushRegisters(~(op.gprWillBeRead | op.gprWillBeWritten) & (op.regsIn | op.regsOut),
FlushMode::Full);
fpr.FlushRegisters(~(op.fprWillBeRead | op.fprWillBeWritten) &
(op.fregsIn | op.GetFregsOut()),
FlushMode::Full);
gpr.FlushCRRegisters(~(op.crWillBeRead | op.crWillBeWritten) & (op.crIn | op.crOut),
FlushMode::Full);
gpr.FlushRegisters(~op.gprWillBeWritten & op.regsOut, FlushMode::Undirty);
fpr.FlushRegisters(~op.fprWillBeWritten & op.GetFregsOut(), FlushMode::Undirty);
gpr.FlushCRRegisters(~op.crWillBeWritten & op.crOut, FlushMode::Undirty);
const BitSet32 gpr_will_be_used = op.gprWillBeRead | op.gprWillBeWritten;
const BitSet32 fpr_will_be_used = op.fprWillBeRead | op.fprWillBeWritten;
const BitSet8 cr_will_be_used = op.crWillBeRead | op.crWillBeWritten;
gpr.FlushRegisters(~op.gprWillBeWritten & previous_op_gpr_will_be_written,
FlushMode::Undirty);
fpr.FlushRegisters(~op.fprWillBeWritten & previous_op_fpr_will_be_written,
FlushMode::Undirty);
gpr.FlushCRRegisters(~op.crWillBeWritten & previous_op_cr_will_be_written,
FlushMode::Undirty);
gpr.FlushRegisters(~gpr_will_be_used & previous_op_gpr_will_be_used, FlushMode::Full);
fpr.FlushRegisters(~fpr_will_be_used & previous_op_fpr_will_be_used, FlushMode::Full);
gpr.FlushCRRegisters(~cr_will_be_used & previous_op_cr_will_be_used, FlushMode::Full);
previous_op_gpr_will_be_written = op.gprWillBeWritten;
previous_op_gpr_will_be_used = gpr_will_be_used;
previous_op_fpr_will_be_written = op.fprWillBeWritten;
previous_op_fpr_will_be_used = fpr_will_be_used;
previous_op_cr_will_be_written = op.crWillBeWritten;
previous_op_cr_will_be_used = cr_will_be_used;
if (opinfo->flags & FL_LOADSTORE)
++js.numLoadStoreInst;

View File

@ -559,7 +559,7 @@ void JitArm64::lmw(UGeckoInstruction inst)
}
}
BitSet32 gprs_to_undirty = ~js.op->gprWillBeWritten & BitSet32(0xFFFFFFFFU << d);
const BitSet32 gprs_to_undirty = ~js.op->gprWillBeWritten;
BitSet32 gprs_to_flush = ~(js.op->gprWillBeWritten | js.op->gprWillBeRead);
if (a_is_addr_base_reg)
@ -602,17 +602,26 @@ void JitArm64::lmw(UGeckoInstruction inst)
else if (gprs_to_undirty[i])
{
BitSet32 gprs_to_undirty_this_time{};
if (i != 0 && gprs_to_undirty[i - 1])
if (i != 0 && js.gpa.store_pairs[i - 1])
{
// This is the second half of a pair. Store both registers using a single STP instruction.
gprs_to_undirty_this_time = BitSet32{int(i - 1), int(i)};
else if (i == 31 || !gprs_to_undirty[i + 1])
}
else if (!js.gpa.store_pairs[i])
{
// This isn't the first half of a pair, and we also know from earlier that it isn't the
// second half of a pair. Just store the register on its own.
gprs_to_undirty_this_time = BitSet32{int(i)};
}
else
{
// This must be the first half of a pair. It will be flushed the next loop iteration.
continue;
}
gpr.FlushRegisters(gprs_to_undirty_this_time, FlushMode::Undirty, ARM64Reg::INVALID_REG);
gpr.FlushRegisters(gprs_to_undirty_this_time & gprs_to_flush, FlushMode::Full,
ARM64Reg::INVALID_REG);
gprs_to_undirty &= ~gprs_to_undirty_this_time;
}
}

View File

@ -1186,6 +1186,12 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
}
#ifdef _M_ARM_64
// Make JitArm64 wait with storing one half of a pair until the other half is ready to be stored
op.gprWillBeWritten |= (op.gprWillBeWritten & block->m_gpa->store_pairs) << 1 |
((op.gprWillBeWritten >> 1) & block->m_gpa->store_pairs);
// Equivalent calculations for fprWillBeWritten and crWillBeWritten are left out because
// JitArm64 isn't able to use STP when flushing those
// As a tie-break for odd-length runs of registers to assign load pairs for, if an instruction
// that's early in a block has two adjacent registers as inputs, prefer putting those registers
// in the same load pair. This is intended to let the host CPU start doing useful work as soon
@ -1208,6 +1214,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
block->m_gqr_used = gqrUsed;
block->m_gqr_modified = gqrModified;
block->m_gpr_inputs = gprWillBeRead;
block->m_gpr_outputs = gprWillBeWritten;
block->m_fpr_inputs = fprWillBeRead;
block->m_fpr_outputs = fprWillBeWritten;
block->m_cr_inputs = crWillBeRead;
block->m_cr_outputs = crWillBeWritten;
#ifdef _M_ARM_64
OddLengthRunsToEvenLengthRuns(&fpr_load_pair_candidates);

View File

@ -139,6 +139,21 @@ struct CodeBlock
// Which GPRs this block reads from before writing to, if any.
BitSet32 m_gpr_inputs;
// Which GPRs this block writes to, if any.
BitSet32 m_gpr_outputs;
// Which FPRs this block reads from before writing to, if any.
BitSet32 m_fpr_inputs;
// Which FPRs this block writes to, if any.
BitSet32 m_fpr_outputs;
// Which CRs this block reads from before writing to, if any.
BitSet8 m_cr_inputs;
// Which CRs this block writes to, if any.
BitSet8 m_cr_outputs;
// Which memory locations are occupied by this block.
std::set<u32> m_physical_addresses;
};