From 91128d662504cc3028fc39ef68e4538fb5a6f65e Mon Sep 17 00:00:00 2001 From: Wunk Date: Fri, 24 Apr 2026 11:34:46 -0700 Subject: [PATCH] shader_jit: Emit `LG2`/`EX2` subroutines on-demand (#2046) Rather than emitting these subroutine functions for _every_ shader, only emit the subroutines when the `LG2` and `EX2` instructions are actually used. This saves a good chunk of memory across all shaders. Inspired by Tanuki3DS. --- .../shader/shader_jit_a64_compiler.cpp | 31 ++++++++----------- .../shader/shader_jit_a64_compiler.h | 11 ++++--- .../shader/shader_jit_x64_compiler.cpp | 30 ++++++++---------- .../shader/shader_jit_x64_compiler.h | 11 ++++--- 4 files changed, 40 insertions(+), 43 deletions(-) diff --git a/src/video_core/shader/shader_jit_a64_compiler.cpp b/src/video_core/shader/shader_jit_a64_compiler.cpp index 90c8425bc..e138857c0 100644 --- a/src/video_core/shader/shader_jit_a64_compiler.cpp +++ b/src/video_core/shader/shader_jit_a64_compiler.cpp @@ -508,6 +508,7 @@ void JitShader::Compile_DPH(Instruction instr) { void JitShader::Compile_EX2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); STR(X30, SP, POST_INDEXED, -16); + exp2_used = true; BL(exp2_subroutine); LDR(X30, SP, PRE_INDEXED, 16); Compile_DestEnable(instr, SRC1); @@ -516,6 +517,7 @@ void JitShader::Compile_EX2(Instruction instr) { void JitShader::Compile_LG2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); STR(X30, SP, POST_INDEXED, -16); + log2_used = true; BL(log2_subroutine); LDR(X30, SP, PRE_INDEXED, 16); Compile_DestEnable(instr, SRC1); @@ -994,6 +996,14 @@ void JitShader::Compile(const std::array* program_ // Compile entire program Compile_Block(static_cast(program_code->size())); + // Compile utility functions + if (log2_used) { + Compile_Log2(log2_subroutine); + } + if (exp2_used) { + Compile_Exp2(exp2_subroutine); + } + // Free memory that's no longer needed program_code = nullptr; swizzle_data = nullptr; @@ -1021,18 +1031,9 @@ void JitShader::Compile(const std::array* program_ code_vec.shrink_to_fit(); } -JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) { - CompilePrelude(); -} - -void JitShader::CompilePrelude() { - log2_subroutine = CompilePrelude_Log2(); - exp2_subroutine = CompilePrelude_Exp2(); -} - -Label JitShader::CompilePrelude_Log2() { - Label subroutine; +JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) {} +void JitShader::Compile_Log2(Label subroutine) { // We perform this approximation by first performing a range reduction into the range // [1.0, 2.0). A minimax polynomial which was fit for the function log2(x) / (x - 1) is then // evaluated. We multiply the result by (x - 1) then restore the result into the appropriate @@ -1136,13 +1137,9 @@ Label JitShader::CompilePrelude_Log2() { DUP(SRC1.S4(), SRC1.Selem()[0]); RET(); - - return subroutine; } -Label JitShader::CompilePrelude_Exp2() { - Label subroutine; - +void JitShader::Compile_Exp2(Label subroutine) { // This approximation first performs a range reduction into the range [-0.5, 0.5). A minmax // polynomial which was fit for the function exp2(x) is then evaluated. We then restore the // result into the appropriate range. @@ -1241,8 +1238,6 @@ Label JitShader::CompilePrelude_Exp2() { DUP(SRC1.S4(), SRC1.Selem()[0]); RET(); - - return subroutine; } } // namespace Pica::Shader diff --git a/src/video_core/shader/shader_jit_a64_compiler.h b/src/video_core/shader/shader_jit_a64_compiler.h index 7accf66ac..becede149 100644 --- a/src/video_core/shader/shader_jit_a64_compiler.h +++ b/src/video_core/shader/shader_jit_a64_compiler.h @@ -1,4 +1,4 @@ -// Copyright 2023 Citra Emulator Project +// Copyright Citra Emulator Project / Azahar Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. @@ -123,9 +123,8 @@ private: /** * Emits data and code for utility functions. */ - void CompilePrelude(); - oaknut::Label CompilePrelude_Log2(); - oaknut::Label CompilePrelude_Exp2(); + void Compile_Log2(oaknut::Label subroutine); + void Compile_Exp2(oaknut::Label subroutine); const std::array* program_code = nullptr; const std::array* swizzle_data = nullptr; @@ -146,6 +145,10 @@ private: using CompiledShader = void(const void* setup, void* state, const std::byte* start_addr); CompiledShader* program = nullptr; + /// Library functions, emitted as used + bool log2_used : 1 = false; + bool exp2_used : 1 = false; + oaknut::Label log2_subroutine; oaknut::Label exp2_subroutine; }; diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp index 1c1ff92b2..476a024e9 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -511,12 +511,14 @@ void JitShader::Compile_DPH(Instruction instr) { void JitShader::Compile_EX2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + exp2_used = true; call(exp2_subroutine); Compile_DestEnable(instr, SRC1); } void JitShader::Compile_LG2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + log2_used = true; call(log2_subroutine); Compile_DestEnable(instr, SRC1); } @@ -1038,6 +1040,14 @@ void JitShader::Compile(const std::array* program_ // Compile entire program Compile_Block(static_cast(program_code->size())); + // Compile utility functions + if (log2_used) { + Compile_Log2(log2_subroutine); + } + if (exp2_used) { + Compile_Exp2(exp2_subroutine); + } + // Free memory that's no longer needed program_code = nullptr; swizzle_data = nullptr; @@ -1050,18 +1060,9 @@ void JitShader::Compile(const std::array* program_ LOG_DEBUG(HW_GPU, "Compiled shader size={}", getSize()); } -JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) { - CompilePrelude(); -} - -void JitShader::CompilePrelude() { - log2_subroutine = CompilePrelude_Log2(); - exp2_subroutine = CompilePrelude_Exp2(); -} - -Xbyak::Label JitShader::CompilePrelude_Log2() { - Xbyak::Label subroutine; +JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} +void JitShader::Compile_Log2(Xbyak::Label subroutine) { // SSE does not have a log instruction, thus we must approximate. // We perform this approximation first performaing a range reduction into the range [1.0, 2.0). // A minimax polynomial which was fit for the function log2(x) / (x - 1) is then evaluated. @@ -1163,12 +1164,9 @@ Xbyak::Label JitShader::CompilePrelude_Log2() { shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); ret(); - - return subroutine; } -Xbyak::Label JitShader::CompilePrelude_Exp2() { - Xbyak::Label subroutine; +void JitShader::Compile_Exp2(Xbyak::Label subroutine) { // SSE does not have a exp instruction, thus we must approximate. // We perform this approximation first performaing a range reduction into the range [-0.5, 0.5). @@ -1271,8 +1269,6 @@ Xbyak::Label JitShader::CompilePrelude_Exp2() { shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); ret(); - - return subroutine; } } // namespace Pica::Shader diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h index 911183296..aff9cae7b 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.h +++ b/src/video_core/shader/shader_jit_x64_compiler.h @@ -1,4 +1,4 @@ -// Copyright 2015 Citra Emulator Project +// Copyright Citra Emulator Project / Azahar Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. @@ -115,9 +115,8 @@ private: /** * Emits data and code for utility functions. */ - void CompilePrelude(); - Xbyak::Label CompilePrelude_Log2(); - Xbyak::Label CompilePrelude_Exp2(); + void Compile_Log2(Xbyak::Label subroutine); + void Compile_Exp2(Xbyak::Label subroutine); const std::array* program_code = nullptr; const std::array* swizzle_data = nullptr; @@ -138,6 +137,10 @@ private: using CompiledShader = void(const void* setup, void* state, const u8* start_addr); CompiledShader* program = nullptr; + /// Library functions, emitted as used + bool log2_used : 1 = false; + bool exp2_used : 1 = false; + Xbyak::Label log2_subroutine; Xbyak::Label exp2_subroutine; };