From db5aa34677b0b60d117005b40d8df834e6d29054 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Mon, 31 Jul 2023 14:14:33 +0200 Subject: [PATCH] JitArm64: Add analysis for m_ppc_state LDP/STP Using LDP/STP when accessing m_ppc_state lets us load/store two registers at once. We previously opportunistically used STP, but this new analysis lets us move loads earlier and move stores later to make use of LDP/STP in more situations. This reduces code size and time spent on m_ppc_state accesses, possibly with exceptions when under heavy register pressure. This commit adds the new bitsets load_pairs and store_pairs to BlockRegStats, which indicate which registers should be treated as a pair when loading and storing to m_ppc_state. The commits after this one will add code that reads these bitsets to determine when to use LDP/STP. x64 doesn't have instructions that can load/store a pair of registers, so the new PPCAnalyst code is ifdef'd for AArch64. --- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 87 +++++++++++++++++++ Source/Core/Core/PowerPC/PPCAnalyst.h | 26 +++++- Source/UnitTests/Core/CMakeLists.txt | 3 + .../UnitTests/Core/PowerPC/PPCAnalystTest.cpp | 57 ++++++++++++ Source/UnitTests/UnitTests.vcxproj | 1 + 5 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 Source/UnitTests/Core/PowerPC/PPCAnalystTest.cpp diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index b2d4b2048a7..22bacc92718 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -4,6 +4,7 @@ #include "Core/PowerPC/PPCAnalyst.h" #include +#include #include #include #include @@ -809,6 +810,12 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, // Clear register stats block->m_gpa->any = true; block->m_fpa->any = false; +#ifdef _M_ARM_64 + block->m_gpa->load_pairs = {}; + block->m_gpa->store_pairs = {}; + block->m_fpa->load_pairs = {}; + block->m_fpa->store_pairs = {}; +#endif // 
Set the blocks start address block->m_address = address; @@ -1068,6 +1075,24 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, } } +#ifdef _M_ARM_64 + BitSet32 gpr_load_pair_candidates = gprWillBeRead; + FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs); + + BitSet32 fpr_load_pair_candidates = fprWillBeRead; + FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs); + + BitSet32 gpr_store_pair_candidates = gprWillBeWritten; + FindRegisterPairs(&gpr_store_pair_candidates, &block->m_gpa->store_pairs); + OddLengthRunsToEvenLengthRuns(&gpr_store_pair_candidates); + FindRegisterPairs(&gpr_store_pair_candidates, &block->m_gpa->store_pairs); + + BitSet32 fpr_store_pair_candidates = fprWillBeWritten; + FindRegisterPairs(&fpr_store_pair_candidates, &block->m_fpa->store_pairs); + OddLengthRunsToEvenLengthRuns(&fpr_store_pair_candidates); + FindRegisterPairs(&fpr_store_pair_candidates, &block->m_fpa->store_pairs); +#endif + // Forward scan, for flags that need the other direction for calculation. BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe; BitSet8 gqrUsed, gqrModified; @@ -1159,11 +1184,73 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, if (gqr >= 0 && gqr <= 7) gqrModified[gqr] = true; } + +#ifdef _M_ARM_64 + // As a tie-break for odd-length runs of registers to assign load pairs for, if an instruction + // that's early in a block has two adjacent registers as inputs, prefer putting those registers + // in the same load pair. This is intended to let the host CPU start doing useful work as soon + // as possible. + if (FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs, op.regsIn) != 0) + { + // If the odd-length run was long, it will now have been split into two shorter runs, with a + // gap in between. One of the new runs is even-length, so let's run FindRegisterPairs again. 
+      FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs);
+    }
+    if (FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs, op.fregsIn) != 0)
+    {
+      // If the odd-length run was long, it will now have been split into two shorter runs, with a
+      // gap in between. One of the new runs is even-length, so let's run FindRegisterPairs again.
+      FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs);
+    }
+#endif
   }
+
   block->m_gqr_used = gqrUsed;
   block->m_gqr_modified = gqrModified;
   block->m_gpr_inputs = gprWillBeRead;
+
+#ifdef _M_ARM_64
+  OddLengthRunsToEvenLengthRuns(&gpr_load_pair_candidates);  // leftover odd GPR runs
+  FindRegisterPairs(&gpr_load_pair_candidates, &block->m_gpa->load_pairs);
+
+  OddLengthRunsToEvenLengthRuns(&fpr_load_pair_candidates);  // leftover odd FPR runs
+  FindRegisterPairs(&fpr_load_pair_candidates, &block->m_fpa->load_pairs);
+#endif
+
   return address;
 }
 
+size_t FindRegisterPairs(BitSet32* candidates, BitSet32* out, BitSet32 mask)
+{
+  u64 candidates_to_check = candidates->m_val & mask.m_val;  // u64 so a shift by 32 is defined
+  size_t shift = 32;  // index in *candidates of the lowest bit scanned so far
+  size_t registers_handled = 0;
+
+  while (u32(candidates_to_check) != 0)  // bits shifted into the upper half are already handled
+  {
+    const int zero_count = std::countl_zero(u32(candidates_to_check));
+    shift -= zero_count;
+    candidates_to_check <<= zero_count;
+
+    const int one_count = std::countl_one(u32(candidates_to_check));
+    shift -= one_count;
+    candidates_to_check <<= one_count;
+
+    if ((one_count & 1) == 0)  // only even-length runs have an unambiguous pairing
+    {
+      const u32 ones = static_cast<u32>(((1ULL << one_count) - 1));
+      *candidates &= ~BitSet32(ones << shift);
+      *out |= BitSet32((ones & 0x55555555) << shift);  // mark the lower register of each pair
+      registers_handled += one_count;  // count the removed bits, not the mask value
+    }
+  }
+
+  return registers_handled;
+}
+
+void OddLengthRunsToEvenLengthRuns(BitSet32* candidates)
+{
+  *candidates &= *candidates >> 1;  // keeps bit i only if bit i+1 is set: drops each run's top bit
+}
+
 }  // namespace PPCAnalyst
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 1be4a1e9d83..71cc2d316c5 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -96,6 +96,14 @@
struct BlockStats struct BlockRegStats { bool any; + +#ifdef _M_ARM_64 + // For JITs that can load and store pairs of adjacent registers in one operation, each set bit in + // these bitsets provides a hint that that register and the register immediately afterwards are a + // good candidate for being loaded/stored as a pair. + BitSet32 load_pairs; + BitSet32 store_pairs; +#endif }; using CodeBuffer = std::vector; @@ -128,7 +136,7 @@ struct CodeBlock // Which GQRs this block modifies, if any. BitSet8 m_gqr_modified; - // Which GPRs this block reads from before defining, if any. + // Which GPRs this block reads from before writing to, if any. BitSet32 m_gpr_inputs; // Which memory locations are occupied by this block. @@ -217,4 +225,20 @@ bool AnalyzeFunction(const Core::CPUThreadGuard& guard, u32 startAddr, Common::S bool ReanalyzeFunction(const Core::CPUThreadGuard& guard, u32 start_addr, Common::Symbol& func, u32 max_size = 0); +// The below functions are for internal use, but are exposed for the sake of unit tests. + +// For each even-length run of bits in candidates & mask, unsets those bits in candidates, and +// sets every second corresponding bit in out (corresponding to the first half of each pair). +// Returns the number of bits that were removed from candidates. +// +// Odd-length runs are left for later, because how to assign pairs is only unambiguous for +// even-length runs. To arbitrarily choose an assignment of pairs for odd-length runs, +// call OddLengthRunsToEvenLengthRuns after calling this, then call this again. +size_t FindRegisterPairs(BitSet32* candidates, BitSet32* out, BitSet32 mask = BitSet32(0xFFFFFFFF)); + +// Discards one bit from each run of bits, turning odd-length runs into even-length runs. +// (Also turns even-length runs into odd-length runs, which is probably not something you want. +// You should remove all even-length runs before calling this.) 
+void OddLengthRunsToEvenLengthRuns(BitSet32* candidates);
+
 }  // namespace PPCAnalyst
diff --git a/Source/UnitTests/Core/CMakeLists.txt b/Source/UnitTests/Core/CMakeLists.txt
index 865064ed68d..d56f99702ea 100644
--- a/Source/UnitTests/Core/CMakeLists.txt
+++ b/Source/UnitTests/Core/CMakeLists.txt
@@ -23,6 +23,7 @@ if(_M_X86_64)
     PowerPC/DivUtilsTest.cpp
     PowerPC/Jit64Common/ConvertDoubleToSingle.cpp
     PowerPC/Jit64Common/Frsqrte.cpp
+    PowerPC/PPCAnalystTest.cpp
   )
 elseif(_M_ARM_64)
   add_dolphin_test(PowerPCTest
@@ -32,10 +33,12 @@ elseif(_M_ARM_64)
     PowerPC/JitArm64/Fres.cpp
     PowerPC/JitArm64/Frsqrte.cpp
     PowerPC/JitArm64/MovI2R.cpp
+    PowerPC/PPCAnalystTest.cpp
   )
 else()
   add_dolphin_test(PowerPCTest
     PowerPC/DivUtilsTest.cpp
+    PowerPC/PPCAnalystTest.cpp
   )
 endif()
 
diff --git a/Source/UnitTests/Core/PowerPC/PPCAnalystTest.cpp b/Source/UnitTests/Core/PowerPC/PPCAnalystTest.cpp
new file mode 100644
index 00000000000..432cea1cd22
--- /dev/null
+++ b/Source/UnitTests/Core/PowerPC/PPCAnalystTest.cpp
@@ -0,0 +1,57 @@
+// Copyright 2021 Dolphin Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <gtest/gtest.h>
+
+#include "Common/BitSet.h"
+#include "Core/PowerPC/PPCAnalyst.h"
+
+TEST(PPCAnalyst, FindRegisterPairs)  // even-length runs are paired, odd-length runs are left
+{
+  BitSet32 input{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 21, 22, 23};  // even runs: 3-4, 11-14
+  BitSet32 output{15, 18, 30};  // bits already in the output must be preserved
+  BitSet32 input_expected{1, 6, 7, 8, 19, 20, 21, 22, 23};
+  BitSet32 output_expected{3, 11, 13, 15, 18, 30};  // lower register of each new pair added
+
+  PPCAnalyst::FindRegisterPairs(&input, &output);
+
+  ASSERT_EQ(input, input_expected);
+  ASSERT_EQ(output, output_expected);
+}
+
+TEST(PPCAnalyst, FindRegisterPairs_AllOnes)  // one run covering all 32 bits
+{
+  BitSet32 input(0xFFFFFFFF);
+  BitSet32 output(0x0);
+  BitSet32 input_expected(0x0);
+  BitSet32 output_expected(0x55555555);  // every even-numbered register starts a pair
+
+  PPCAnalyst::FindRegisterPairs(&input, &output);
+
+  ASSERT_EQ(input, input_expected);
+  ASSERT_EQ(output, output_expected);
+}
+
+TEST(PPCAnalyst, FindRegisterPairs_Masked)  // runs are searched for in candidates & mask only
+{
+  BitSet32 input{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 21, 22, 23};
+  BitSet32 output{15, 18, 30};
+  BitSet32 mask{1, 8, 9, 21, 22};  // input & mask = {1, 8, 21, 22}; only 21-22 is an even run
+  BitSet32 input_expected{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 23};
+  BitSet32 output_expected{15, 18, 21, 30};
+
+  PPCAnalyst::FindRegisterPairs(&input, &output, mask);
+
+  ASSERT_EQ(input, input_expected);
+  ASSERT_EQ(output, output_expected);
+}
+
+TEST(PPCAnalyst, OddLengthRunsToEvenLengthRuns)  // each run loses its highest bit
+{
+  BitSet32 input{1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 19, 20, 21, 22, 23};
+  BitSet32 expected{3, 6, 7, 11, 12, 13, 19, 20, 21, 22};  // the single-bit run {1} disappears
+
+  PPCAnalyst::OddLengthRunsToEvenLengthRuns(&input);
+
+  ASSERT_EQ(input, expected);
+}
diff --git a/Source/UnitTests/UnitTests.vcxproj b/Source/UnitTests/UnitTests.vcxproj
index 698e33bc37a..d65f8a3e74c 100644
--- a/Source/UnitTests/UnitTests.vcxproj
+++ b/Source/UnitTests/UnitTests.vcxproj
@@ -74,6 +74,7 @@
+