JitArm64: Add analysis for m_ppc_state LDP/STP

Using LDP/STP when accessing m_ppc_state lets us load/store two
registers at once. We previously opportunistically used STP, but this
new analysis lets us move loads earlier and move stores later to make
use of LDP/STP in more situations. This reduces code size and time spent
on m_ppc_state accesses, possibly with exceptions when under heavy
register pressure.

This commit adds the new bitsets load_pairs and store_pairs to
BlockRegStats, which indicate which registers should be treated as a
pair when loading and storing to m_ppc_state. The commits after this one
will add code that reads these bitsets to determine when to use LDP/STP.

x64 doesn't have instructions that can load/store a pair of registers,
so the new PPCAnalyst code is ifdef'd for AArch64.
This commit is contained in:
JosJuice 2023-07-31 14:14:33 +02:00
parent faaf25d6c9
commit db5aa34677
5 changed files with 173 additions and 1 deletions

View File

@ -4,6 +4,7 @@
#include "Core/PowerPC/PPCAnalyst.h"
#include <algorithm>
#include <bit>
#include <map>
#include <queue>
#include <string>
@ -809,6 +810,12 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
// Clear register stats
block->m_gpa->any = true;
block->m_fpa->any = false;
#ifdef _M_ARM_64
block->m_gpa->load_pairs = {};
block->m_gpa->store_pairs = {};
block->m_fpa->load_pairs = {};
block->m_fpa->store_pairs = {};
#endif
// Set the blocks start address
block->m_address = address;
@ -1068,6 +1075,24 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
}
}
#ifdef _M_ARM_64
// Load pairs: immediately pair up every even-length run of adjacent registers found in
// gprWillBeRead/fprWillBeRead. Registers in odd-length runs stay in the *_load_pair_candidates
// sets, which are used again further down in this function.
BitSet32 gpr_load_pair_candidates = gprWillBeRead;
FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs);
BitSet32 fpr_load_pair_candidates = fprWillBeRead;
FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs);
// Store pairs: same first step, seeded from gprWillBeWritten/fprWillBeWritten. Unlike for loads,
// the odd-length runs are resolved right here: one register is discarded from each remaining run
// so that the run becomes even-length, and then FindRegisterPairs is run a second time.
BitSet32 gpr_store_pair_candidates = gprWillBeWritten;
FindRegisterPairs(&gpr_store_pair_candidates, &block->m_gpa->store_pairs);
OddLengthRunsToEvenLengthRuns(&gpr_store_pair_candidates);
FindRegisterPairs(&gpr_store_pair_candidates, &block->m_gpa->store_pairs);
BitSet32 fpr_store_pair_candidates = fprWillBeWritten;
FindRegisterPairs(&fpr_store_pair_candidates, &block->m_fpa->store_pairs);
OddLengthRunsToEvenLengthRuns(&fpr_store_pair_candidates);
FindRegisterPairs(&fpr_store_pair_candidates, &block->m_fpa->store_pairs);
#endif
// Forward scan, for flags that need the other direction for calculation.
BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
BitSet8 gqrUsed, gqrModified;
@ -1159,11 +1184,73 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
if (gqr >= 0 && gqr <= 7)
gqrModified[gqr] = true;
}
#ifdef _M_ARM_64
// As a tie-break for odd-length runs of registers to assign load pairs for, if an instruction
// that's early in a block has two adjacent registers as inputs, prefer putting those registers
// in the same load pair. This is intended to let the host CPU start doing useful work as soon
// as possible.
if (FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs, op.regsIn) != 0)
{
// If the odd-length run was long, it will now have been split into two shorter runs, with a
// gap in between. One of the new runs is even-length, so let's run FindRegisterPairs again.
FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs);
}
if (FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs, op.fregsIn) != 0)
{
// If the odd-length run was long, it will now have been split into two shorter runs, with a
// gap in between. One of the new runs is even-length, so let's run FindRegisterPairs again.
FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs);
}
#endif
}
block->m_gqr_used = gqrUsed;
block->m_gqr_modified = gqrModified;
block->m_gpr_inputs = gprWillBeRead;
#ifdef _M_ARM_64
// The load pair candidates that are still left at this point were not paired up by the initial
// FindRegisterPairs calls or by the per-instruction tie-break, so they should only consist of
// odd-length runs. Arbitrarily discard one register from each run to make the runs even-length,
// then assign pairs to what remains. This has to be done for both the GPRs and the FPRs.
OddLengthRunsToEvenLengthRuns(&gpr_load_pair_candidates);
FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs);
OddLengthRunsToEvenLengthRuns(&fpr_load_pair_candidates);
FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs);
#endif
return address;
}
// Walks over the runs of consecutive set bits in (*candidates & mask), starting from the most
// significant bit. Each even-length run is cleared from *candidates, and for every pair of bits in
// such a run, the lower bit of the pair is set in *out. Odd-length runs are left untouched.
//
// Returns the number of bits that were removed from *candidates.
size_t FindRegisterPairs(BitSet32* candidates, BitSet32* out, BitSet32 mask)
{
  // Held in a 64-bit variable so that shifting left by a full 32 bits (which happens when a run
  // reaches all the way down to bit 0) is well defined. Only the low 32 bits are meaningful;
  // bits that have already been consumed end up in the high half.
  u64 candidates_to_check = candidates->m_val & mask.m_val;

  // Index, within the original 32-bit value, of the lowest bit that has been consumed so far.
  size_t shift = 32;
  size_t registers_handled = 0;

  // Only look at the low 32 bits here. Testing the whole u64 would keep the loop running on
  // already consumed bits, underflowing `shift` and making the shifts below undefined.
  while (static_cast<u32>(candidates_to_check) != 0)
  {
    // Skip the gap above the next run.
    const int zero_count = std::countl_zero(static_cast<u32>(candidates_to_check));
    shift -= zero_count;
    candidates_to_check <<= zero_count;

    // Measure and consume the run itself. After this, `shift` is the index of the run's lowest bit.
    const int one_count = std::countl_one(static_cast<u32>(candidates_to_check));
    shift -= one_count;
    candidates_to_check <<= one_count;

    if ((one_count & 1) == 0)
    {
      // A mask of one_count set bits, starting at bit 0. Computed in 64 bits so that
      // one_count == 32 works.
      const u32 ones = static_cast<u32>((1ULL << one_count) - 1);

      *candidates &= ~BitSet32(ones << shift);
      // 0x55555555 selects every second bit starting from bit 0, i.e. the lower bit of each pair.
      *out |= BitSet32((ones & 0x55555555) << shift);

      // Count the registers in the run, not the value of the mask covering them.
      registers_handled += static_cast<size_t>(one_count);
    }
  }

  return registers_handled;
}
// Shortens every run of consecutive set bits in *candidates by one, by clearing the highest bit
// of each run. Odd-length runs therefore become even-length (and even-length runs become
// odd-length, so those should already have been removed by the caller).
void OddLengthRunsToEvenLengthRuns(BitSet32* candidates)
{
  // Bit i of this value is set if bit i + 1 of *candidates is set.
  const auto upper_neighbors = *candidates >> 1;

  // A bit survives only if the bit directly above it is set too.
  *candidates &= upper_neighbors;
}
} // namespace PPCAnalyst

View File

@ -96,6 +96,14 @@ struct BlockStats
// Per-block statistics for one set of registers. PPCAnalyzer::Analyze fills in one instance for
// the GPRs (CodeBlock::m_gpa) and one for the FPRs (CodeBlock::m_fpa).
struct BlockRegStats
{
// NOTE(review): the exact meaning of this flag isn't visible in this part of the code. Analyze
// resets it to true for the GPR stats and to false for the FPR stats - confirm against the users.
bool any;
#ifdef _M_ARM_64
// For JITs that can load and store pairs of adjacent registers in one operation, each set bit in
// these bitsets provides a hint that the corresponding register and the register immediately
// after it are a good candidate for being loaded/stored as a pair.
//
// These members only exist in AArch64 builds, so all code touching them must be guarded by
// _M_ARM_64 as well.
BitSet32 load_pairs;
BitSet32 store_pairs;
#endif
};
using CodeBuffer = std::vector<CodeOp>;
@ -128,7 +136,7 @@ struct CodeBlock
// Which GQRs this block modifies, if any.
BitSet8 m_gqr_modified;
// Which GPRs this block reads from before defining, if any.
// Which GPRs this block reads from before writing to, if any.
BitSet32 m_gpr_inputs;
// Which memory locations are occupied by this block.
@ -217,4 +225,20 @@ bool AnalyzeFunction(const Core::CPUThreadGuard& guard, u32 startAddr, Common::S
bool ReanalyzeFunction(const Core::CPUThreadGuard& guard, u32 start_addr, Common::Symbol& func,
u32 max_size = 0);
// The below functions are for internal use, but are exposed for the sake of unit tests.
// For each even-length run of bits in candidates & mask, unsets those bits in candidates, and
// sets every second corresponding bit in out (corresponding to the first half of each pair).
// Returns the number of bits that were removed from candidates.
//
// Odd-length runs are left for later, because how to assign pairs is only unambiguous for
// even-length runs. To arbitrarily choose an assignment of pairs for odd-length runs,
// call OddLengthRunsToEvenLengthRuns after calling this, then call this again.
size_t FindRegisterPairs(BitSet32* candidates, BitSet32* out, BitSet32 mask = BitSet32(0xFFFFFFFF));
// Discards one bit from each run of bits, turning odd-length runs into even-length runs.
// (Also turns even-length runs into odd-length runs, which is probably not something you want.
// You should remove all even-length runs before calling this.)
void OddLengthRunsToEvenLengthRuns(BitSet32* candidates);
} // namespace PPCAnalyst

View File

@ -23,6 +23,7 @@ if(_M_X86_64)
PowerPC/DivUtilsTest.cpp
PowerPC/Jit64Common/ConvertDoubleToSingle.cpp
PowerPC/Jit64Common/Frsqrte.cpp
PowerPC/PPCAnalystTest.cpp
)
elseif(_M_ARM_64)
add_dolphin_test(PowerPCTest
@ -32,10 +33,12 @@ elseif(_M_ARM_64)
PowerPC/JitArm64/Fres.cpp
PowerPC/JitArm64/Frsqrte.cpp
PowerPC/JitArm64/MovI2R.cpp
PowerPC/PPCAnalystTest.cpp
)
else()
add_dolphin_test(PowerPCTest
PowerPC/DivUtilsTest.cpp
PowerPC/PPCAnalystTest.cpp
)
endif()

View File

@ -0,0 +1,57 @@
// Copyright 2021 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <gtest/gtest.h>
#include "Common/BitSet.h"
#include "Core/PowerPC/PPCAnalyst.h"
// Without a mask: the even-length runs {3, 4} and {11..14} are removed from the candidates and
// show up in the output as the lower register of each pair (3, 11 and 13). The odd-length runs
// {1}, {6..8} and {19..23} stay put, and the bits that were already set in the output are kept.
TEST(PPCAnalyst, FindRegisterPairs)
{
  BitSet32 candidates{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 21, 22, 23};
  BitSet32 pairs{15, 18, 30};

  PPCAnalyst::FindRegisterPairs(&candidates, &pairs);

  const BitSet32 remaining_candidates{1, 6, 7, 8, 19, 20, 21, 22, 23};
  const BitSet32 resulting_pairs{3, 11, 13, 15, 18, 30};
  ASSERT_EQ(candidates, remaining_candidates);
  ASSERT_EQ(pairs, resulting_pairs);
}
// Edge case: all 32 registers form one single even-length run, so every candidate is consumed
// and every second bit (starting from bit 0) is set in the output.
TEST(PPCAnalyst, FindRegisterPairs_AllOnes)
{
  BitSet32 candidates(0xFFFFFFFF);
  BitSet32 pairs(0x0);

  PPCAnalyst::FindRegisterPairs(&candidates, &pairs);

  const BitSet32 remaining_candidates(0x0);
  const BitSet32 resulting_pairs(0x55555555);
  ASSERT_EQ(candidates, remaining_candidates);
  ASSERT_EQ(pairs, resulting_pairs);
}
// With a mask: only the candidates that are also in the mask ({1, 8, 21, 22}) are considered.
// Of those, {21, 22} is the only even-length run, so 21 and 22 are the only bits removed from
// the candidates, and 21 is the only bit added to the output.
TEST(PPCAnalyst, FindRegisterPairs_Masked)
{
  BitSet32 candidates{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 21, 22, 23};
  BitSet32 pairs{15, 18, 30};
  BitSet32 considered_registers{1, 8, 9, 21, 22};

  PPCAnalyst::FindRegisterPairs(&candidates, &pairs, considered_registers);

  const BitSet32 remaining_candidates{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 23};
  const BitSet32 resulting_pairs{15, 18, 21, 30};
  ASSERT_EQ(candidates, remaining_candidates);
  ASSERT_EQ(pairs, resulting_pairs);
}
// Every run loses its highest bit: {1} disappears entirely, {3, 4} becomes {3}, {6..8} becomes
// {6, 7}, {11..14} becomes {11..13} and {19..23} becomes {19..22}.
TEST(PPCAnalyst, OddLengthRunsToEvenLengthRuns)
{
  BitSet32 candidates{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 21, 22, 23};

  PPCAnalyst::OddLengthRunsToEvenLengthRuns(&candidates);

  const BitSet32 shortened_runs{3, 6, 7, 11, 12, 13, 19, 20, 21, 22};
  ASSERT_EQ(candidates, shortened_runs);
}

View File

@ -74,6 +74,7 @@
<ClCompile Include="Core\PageFaultTest.cpp" />
<ClCompile Include="Core\PatchAllowlistTest.cpp" />
<ClCompile Include="Core\PowerPC\DivUtilsTest.cpp" />
<ClCompile Include="Core\PowerPC\PPCAnalystTest.cpp" />
<ClCompile Include="VideoCommon\VertexLoaderTest.cpp" />
<ClCompile Include="StubHost.cpp" />
</ItemGroup>