mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-12-23 06:57:08 +00:00
JitArm64: Add analysis for m_ppc_state LDP/STP
Using LDP/STP when accessing m_ppc_state lets us load/store two registers at once. We previously opportunistically used STP, but this new analysis lets us move loads earlier and move stores later to make use of LDP/STP in more situations. This reduces code size and time spent on m_ppc_state accesses, possibly with exceptions when under heavy register pressure. This commit adds the new bitsets load_pairs and store_pairs to BlockRegStats, which indicate which registers should be treated as a pair when loading and storing to m_ppc_state. The commits after this one will add code that reads these bitsets to determine when to use LDP/STP. x64 doesn't have instructions that can load/store a pair of registers, so the new PPCAnalyst code is ifdef'd for AArch64.
This commit is contained in:
parent
faaf25d6c9
commit
db5aa34677
@ -4,6 +4,7 @@
|
||||
#include "Core/PowerPC/PPCAnalyst.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <bit>
|
||||
#include <map>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
@ -809,6 +810,12 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
|
||||
// Clear register stats
|
||||
block->m_gpa->any = true;
|
||||
block->m_fpa->any = false;
|
||||
#ifdef _M_ARM_64
|
||||
block->m_gpa->load_pairs = {};
|
||||
block->m_gpa->store_pairs = {};
|
||||
block->m_fpa->load_pairs = {};
|
||||
block->m_fpa->store_pairs = {};
|
||||
#endif
|
||||
|
||||
// Set the blocks start address
|
||||
block->m_address = address;
|
||||
@ -1068,6 +1075,24 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _M_ARM_64
|
||||
BitSet32 gpr_load_pair_candidates = gprWillBeRead;
|
||||
FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs);
|
||||
|
||||
BitSet32 fpr_load_pair_candidates = fprWillBeRead;
|
||||
FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs);
|
||||
|
||||
BitSet32 gpr_store_pair_candidates = gprWillBeWritten;
|
||||
FindRegisterPairs(&gpr_store_pair_candidates, &block->m_gpa->store_pairs);
|
||||
OddLengthRunsToEvenLengthRuns(&gpr_store_pair_candidates);
|
||||
FindRegisterPairs(&gpr_store_pair_candidates, &block->m_gpa->store_pairs);
|
||||
|
||||
BitSet32 fpr_store_pair_candidates = fprWillBeWritten;
|
||||
FindRegisterPairs(&fpr_store_pair_candidates, &block->m_fpa->store_pairs);
|
||||
OddLengthRunsToEvenLengthRuns(&fpr_store_pair_candidates);
|
||||
FindRegisterPairs(&fpr_store_pair_candidates, &block->m_fpa->store_pairs);
|
||||
#endif
|
||||
|
||||
// Forward scan, for flags that need the other direction for calculation.
|
||||
BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
|
||||
BitSet8 gqrUsed, gqrModified;
|
||||
@ -1159,11 +1184,73 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
|
||||
if (gqr >= 0 && gqr <= 7)
|
||||
gqrModified[gqr] = true;
|
||||
}
|
||||
|
||||
#ifdef _M_ARM_64
|
||||
// As a tie-break for odd-length runs of registers to assign load pairs for, if an instruction
|
||||
// that's early in a block has two adjacent registers as inputs, prefer putting those registers
|
||||
// in the same load pair. This is intended to let the host CPU start doing useful work as soon
|
||||
// as possible.
|
||||
if (FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs, op.regsIn) != 0)
|
||||
{
|
||||
// If the odd-length run was long, it will now have been split into two shorter runs, with a
|
||||
// gap in between. One of the new runs is even-length, so let's run FindRegisterPairs again.
|
||||
FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs);
|
||||
}
|
||||
if (FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs, op.fregsIn) != 0)
|
||||
{
|
||||
// If the odd-length run was long, it will now have been split into two shorter runs, with a
|
||||
// gap in between. One of the new runs is even-length, so let's run FindRegisterPairs again.
|
||||
FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
block->m_gqr_used = gqrUsed;
|
||||
block->m_gqr_modified = gqrModified;
|
||||
block->m_gpr_inputs = gprWillBeRead;
|
||||
|
||||
#ifdef _M_ARM_64
|
||||
// Resolve the GPR runs that are still odd-length after the forward scan: drop one register from
// each remaining run so that it becomes even-length, then pair up what is left. (The FPR
// candidates get the same treatment right below.)
OddLengthRunsToEvenLengthRuns(&gpr_load_pair_candidates);
FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs);
|
||||
|
||||
OddLengthRunsToEvenLengthRuns(&fpr_load_pair_candidates);
|
||||
FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs);
|
||||
#endif
|
||||
|
||||
return address;
|
||||
}
|
||||
|
||||
// For each even-length run of set bits in (*candidates & mask), clears that run from *candidates
// and sets every second bit of the run in *out, starting with the run's lowest bit (i.e. the
// first register of each pair). Odd-length runs are left in *candidates untouched.
//
// Returns the number of bits that were cleared from *candidates.
size_t FindRegisterPairs(BitSet32* candidates, BitSet32* out, BitSet32 mask)
{
  // Kept in 64 bits so that shifting out a run spanning all 32 bits stays well defined. Only the
  // low 32 bits are ever inspected; anything above them has already been processed.
  u64 candidates_to_check = candidates->m_val & mask.m_val;

  // Bit index (within the original 32-bit value) of the lowest bit consumed so far. Runs are
  // consumed from the most significant bit downwards.
  size_t shift = 32;
  size_t registers_handled = 0;

  // Test only the low 32 bits. Consumed bits are shifted into the upper half of the u64, so
  // comparing the whole u64 against zero would cause a spurious extra iteration in which shift
  // wraps around and is then used as a shift count.
  while (static_cast<u32>(candidates_to_check) != 0)
  {
    // Skip the gap above the next run.
    const int zero_count = std::countl_zero(static_cast<u32>(candidates_to_check));
    shift -= zero_count;
    candidates_to_check <<= zero_count;

    // Measure and consume the run itself. It is at least one bit long at this point.
    const int one_count = std::countl_one(static_cast<u32>(candidates_to_check));
    shift -= one_count;
    candidates_to_check <<= one_count;

    if ((one_count & 1) == 0)
    {
      // Mask of one_count set bits, built in 64 bits because one_count can be 32.
      const u32 ones = static_cast<u32>((1ULL << one_count) - 1);
      *candidates &= ~BitSet32(ones << shift);
      *out |= BitSet32((ones & 0x55555555) << shift);

      // Count the registers in the run, not the numeric value of the mask.
      registers_handled += one_count;
    }
  }

  return registers_handled;
}
|
||||
|
||||
// Shortens every run of set bits in *candidates by one bit: a bit stays set only if the bit
// directly above it is set as well, so the topmost bit of each run is dropped.
void OddLengthRunsToEvenLengthRuns(BitSet32* candidates)
{
  const BitSet32 shifted_down = *candidates >> 1;
  *candidates &= shifted_down;
}
|
||||
|
||||
} // namespace PPCAnalyst
|
||||
|
||||
@ -96,6 +96,14 @@ struct BlockStats
|
||||
// Register statistics gathered for a code block. The analyzer fills one instance through
// CodeBlock::m_gpa (general-purpose registers) and one through CodeBlock::m_fpa (floating-point
// registers).
struct BlockRegStats
|
||||
{
|
||||
// NOTE(review): presumably tells whether any register of this class is used by the block;
// the analyzer resets it at the start of Analyze - confirm the exact meaning.
bool any;
|
||||
|
||||
#ifdef _M_ARM_64
|
||||
// For JITs that can load and store pairs of adjacent registers in one operation, each set bit in
|
||||
// these bitsets provides a hint that that register and the register immediately afterwards are a
|
||||
// good candidate for being loaded/stored as a pair.
|
||||
// load_pairs is derived from the analyzer's "will be read" register sets, store_pairs from its
// "will be written" register sets. Both are only present in AArch64 builds.
BitSet32 load_pairs;
|
||||
BitSet32 store_pairs;
|
||||
#endif
|
||||
};
|
||||
|
||||
using CodeBuffer = std::vector<CodeOp>;
|
||||
@ -128,7 +136,7 @@ struct CodeBlock
|
||||
// Which GQRs this block modifies, if any.
|
||||
BitSet8 m_gqr_modified;
|
||||
|
||||
// Which GPRs this block reads from before defining, if any.
|
||||
// Which GPRs this block reads from before writing to, if any.
|
||||
BitSet32 m_gpr_inputs;
|
||||
|
||||
// Which memory locations are occupied by this block.
|
||||
@ -217,4 +225,20 @@ bool AnalyzeFunction(const Core::CPUThreadGuard& guard, u32 startAddr, Common::S
|
||||
bool ReanalyzeFunction(const Core::CPUThreadGuard& guard, u32 start_addr, Common::Symbol& func,
|
||||
u32 max_size = 0);
|
||||
|
||||
// The below functions are for internal use, but are exposed for the sake of unit tests.
|
||||
|
||||
// For each even-length run of bits in candidates & mask, unsets those bits in candidates, and
|
||||
// sets every second corresponding bit in out (corresponding to the first half of each pair).
|
||||
// Returns the number of bits that were removed from candidates.
|
||||
//
|
||||
// Odd-length runs are left for later, because how to assign pairs is only unambiguous for
|
||||
// even-length runs. To arbitrarily choose an assignment of pairs for odd-length runs,
|
||||
// call OddLengthRunsToEvenLengthRuns after calling this, then call this again.
|
||||
size_t FindRegisterPairs(BitSet32* candidates, BitSet32* out, BitSet32 mask = BitSet32(0xFFFFFFFF));
|
||||
|
||||
// Discards one bit from each run of bits, turning odd-length runs into even-length runs.
|
||||
// (Also turns even-length runs into odd-length runs, which is probably not something you want.
|
||||
// You should remove all even-length runs before calling this.)
|
||||
void OddLengthRunsToEvenLengthRuns(BitSet32* candidates);
|
||||
|
||||
} // namespace PPCAnalyst
|
||||
|
||||
@ -23,6 +23,7 @@ if(_M_X86_64)
|
||||
PowerPC/DivUtilsTest.cpp
|
||||
PowerPC/Jit64Common/ConvertDoubleToSingle.cpp
|
||||
PowerPC/Jit64Common/Frsqrte.cpp
|
||||
PowerPC/PPCAnalystTest.cpp
|
||||
)
|
||||
elseif(_M_ARM_64)
|
||||
add_dolphin_test(PowerPCTest
|
||||
@ -32,10 +33,12 @@ elseif(_M_ARM_64)
|
||||
PowerPC/JitArm64/Fres.cpp
|
||||
PowerPC/JitArm64/Frsqrte.cpp
|
||||
PowerPC/JitArm64/MovI2R.cpp
|
||||
PowerPC/PPCAnalystTest.cpp
|
||||
)
|
||||
else()
|
||||
add_dolphin_test(PowerPCTest
|
||||
PowerPC/DivUtilsTest.cpp
|
||||
PowerPC/PPCAnalystTest.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
57
Source/UnitTests/Core/PowerPC/PPCAnalystTest.cpp
Normal file
57
Source/UnitTests/Core/PowerPC/PPCAnalystTest.cpp
Normal file
@ -0,0 +1,57 @@
|
||||
// Copyright 2021 Dolphin Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "Common/BitSet.h"
|
||||
#include "Core/PowerPC/PPCAnalyst.h"
|
||||
|
||||
// FindRegisterPairs with the default mask: the even-length runs {3, 4} and {11..14} must be
// removed from the candidates and recorded as pairs starting at 3, 11 and 13. Odd-length runs
// and the bits that were already set in the output must be left alone.
TEST(PPCAnalyst, FindRegisterPairs)
{
  // Expected state after the call.
  const BitSet32 input_expected{1, 6, 7, 8, 19, 20, 21, 22, 23};
  const BitSet32 output_expected{3, 11, 13, 15, 18, 30};

  // The candidates contain runs of length 1, 2, 3, 4 and 5; the output starts out non-empty.
  BitSet32 input{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 21, 22, 23};
  BitSet32 output{15, 18, 30};

  PPCAnalyst::FindRegisterPairs(&input, &output);

  ASSERT_EQ(input, input_expected);
  ASSERT_EQ(output, output_expected);
}
|
||||
|
||||
// Boundary case: a single run covering all 32 bits. Every candidate must be consumed and every
// even-numbered bit must be reported as the start of a pair.
TEST(PPCAnalyst, FindRegisterPairs_AllOnes)
{
  // Expected state after the call.
  const BitSet32 input_expected(0x0);
  const BitSet32 output_expected(0x55555555);

  BitSet32 input(0xFFFFFFFF);
  BitSet32 output(0x0);

  PPCAnalyst::FindRegisterPairs(&input, &output);

  ASSERT_EQ(input, input_expected);
  ASSERT_EQ(output, output_expected);
}
|
||||
|
||||
// FindRegisterPairs with an explicit mask: runs are looked for in (input & mask) only, so the
// sole even-length run is {21, 22}. Candidates outside that run must stay untouched, even where
// the unmasked input would contain even-length runs.
TEST(PPCAnalyst, FindRegisterPairs_Masked)
{
  // Expected state after the call.
  const BitSet32 input_expected{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 23};
  const BitSet32 output_expected{15, 18, 21, 30};

  const BitSet32 mask{1, 8, 9, 21, 22};
  BitSet32 input{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 21, 22, 23};
  BitSet32 output{15, 18, 30};

  PPCAnalyst::FindRegisterPairs(&input, &output, mask);

  ASSERT_EQ(input, input_expected);
  ASSERT_EQ(output, output_expected);
}
|
||||
|
||||
// Every run in the input must lose exactly its topmost bit: the single-bit run {1} disappears,
// {3, 4} becomes {3}, {6, 7, 8} becomes {6, 7}, and so on.
TEST(PPCAnalyst, OddLengthRunsToEvenLengthRuns)
{
  // Expected state after the call.
  const BitSet32 expected{3, 6, 7, 11, 12, 13, 19, 20, 21, 22};

  BitSet32 input{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 21, 22, 23};

  PPCAnalyst::OddLengthRunsToEvenLengthRuns(&input);

  ASSERT_EQ(input, expected);
}
|
||||
@ -74,6 +74,7 @@
|
||||
<ClCompile Include="Core\PageFaultTest.cpp" />
|
||||
<ClCompile Include="Core\PatchAllowlistTest.cpp" />
|
||||
<ClCompile Include="Core\PowerPC\DivUtilsTest.cpp" />
|
||||
<ClCompile Include="Core\PowerPC\PPCAnalystTest.cpp" />
|
||||
<ClCompile Include="VideoCommon\VertexLoaderTest.cpp" />
|
||||
<ClCompile Include="StubHost.cpp" />
|
||||
</ItemGroup>
|
||||
|
||||
Loading…
Reference in New Issue
Block a user