From f1411a19ad8952bc72ef5d3f05daab777fa53ae1 Mon Sep 17 00:00:00 2001 From: jbm11208 <81182113+jbm11208@users.noreply.github.com> Date: Sat, 17 Jan 2026 22:29:22 -0500 Subject: [PATCH] OPENGL WORKS (with async presentation and async shader compilation disabled) --- .../configuration/configure_graphics.cpp | 4 +- src/video_core/gpu.cpp | 147 ++++++++++++++---- src/video_core/gpu.h | 26 +++- src/video_core/gpu_command_queue.cpp | 23 ++- src/video_core/gpu_command_queue.h | 31 +++- src/video_core/gpu_impl.h | 8 +- 6 files changed, 190 insertions(+), 49 deletions(-) diff --git a/src/citra_qt/configuration/configure_graphics.cpp b/src/citra_qt/configuration/configure_graphics.cpp index 02d9cb879..14d761a94 100644 --- a/src/citra_qt/configuration/configure_graphics.cpp +++ b/src/citra_qt/configuration/configure_graphics.cpp @@ -163,8 +163,8 @@ void ConfigureGraphics::ApplyConfiguration() { ui->toggle_async_shaders, async_shader_compilation); ConfigurationShared::ApplyPerGameSetting(&Settings::values.async_presentation, ui->toggle_async_present, async_presentation); - ConfigurationShared::ApplyPerGameSetting(&Settings::values.async_gpu, - ui->toggle_async_gpu, async_gpu); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.async_gpu, ui->toggle_async_gpu, + async_gpu); ConfigurationShared::ApplyPerGameSetting(&Settings::values.spirv_shader_gen, ui->spirv_shader_gen, spirv_shader_gen); ConfigurationShared::ApplyPerGameSetting(&Settings::values.disable_spirv_optimizer, diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index bda786d9a..4bd4c350f 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -8,6 +8,7 @@ #include "common/settings.h" #include "core/core.h" #include "core/core_timing.h" +#include "core/frontend/emu_window.h" #include "core/hle/service/gsp/gsp_gpu.h" #include "core/hle/service/plgldr/plgldr.h" #include "video_core/debug_utils/debug_utils.h" @@ -32,8 +33,8 @@ MICROPROFILE_DEFINE(GPU_CmdlistProcessing, "GPU", "Cmdlist Processing", MP_RGB(1 GPU::GPU(Core::System& system, Frontend::EmuWindow& emu_window, Frontend::EmuWindow* secondary_window) - : right_eye_disabler{std::make_unique(*this)}, - impl{std::make_unique(system, emu_window, secondary_window)} { + : impl{std::make_unique(system, emu_window, secondary_window)}, + right_eye_disabler{std::make_unique(*this)} { impl->vblank_event = impl->timing.RegisterEvent( "GPU::VBlankCallback", [this](uintptr_t user_data, s64 cycles_late) { VBlankCallback(user_data, cycles_late); }); @@ -43,11 +44,14 @@ GPU::GPU(Core::System& system, Frontend::EmuWindow& emu_window, impl->pica.BindRasterizer(impl->rasterizer); // Initialize GPU command queue if async GPU is enabled. - // Note: Async GPU is disabled for Vulkan as it causes threading issues with command buffer - // recording. + // Note: Async GPU is disabled for Vulkan as it causes threading issues. if (Settings::values.async_gpu.GetValue() && Settings::values.graphics_api.GetValue() != Settings::GraphicsAPI::Vulkan) { - impl->command_queue = std::make_unique(*this); + auto shared_context = emu_window.CreateSharedContext(); + if (shared_context) { + impl->command_queue = + std::make_unique(*this, std::move(shared_context)); + } } } @@ -95,18 +99,12 @@ void GPU::ClearAll(bool flush) { } void GPU::Execute(const Service::GSP::Command& command) { - // If async GPU is enabled, queue the command; otherwise execute it directly - if (impl->command_queue) { - impl->command_queue->QueueCommand(command); - } else { - ExecuteCommand(command); - } + QueueInternalCommand(command); } void GPU::ExecuteCommand(const Service::GSP::Command& command) { using Service::GSP::CommandId; auto& regs = impl->pica.regs; - switch (command.id) { case CommandId::RequestDma: { impl->system.Memory().RasterizerFlushVirtualRegion( @@ -257,7 +255,14 @@ u32 GPU::ReadReg(VAddr addr) { const u32 index = offset / sizeof(u32); ASSERT(addr % sizeof(u32) == 0); ASSERT(index < Pica::PicaCore::Regs::NUM_REGS); - return impl->pica.regs.reg_array[index]; + + // Protect GPU register reads with mutex only when async GPU is enabled + if (impl->command_queue) { + std::lock_guard lock(impl->rasterizer_mutex); + return impl->pica.regs.reg_array[index]; + } else { + return impl->pica.regs.reg_array[index]; + } } default: UNREACHABLE_MSG("Read from unknown GPU address {:#08X}", addr); @@ -281,27 +286,66 @@ void GPU::WriteReg(VAddr addr, u32 data) { ASSERT(addr % sizeof(u32) == 0); ASSERT(index < Pica::PicaCore::Regs::NUM_REGS); - impl->pica.regs.reg_array[index] = data; - // Handle registers that trigger GPU actions - switch (index) { - case GPU_REG_INDEX(memory_fill_config[0].trigger): - MemoryFill(0, 0); - break; - case GPU_REG_INDEX(memory_fill_config[1].trigger): - MemoryFill(1, 1); - break; - case GPU_REG_INDEX(display_transfer_config.trigger): - MemoryTransfer(); - break; - case GPU_REG_INDEX(internal.pipeline.command_buffer.trigger[0]): - SubmitCmdList(0); - break; - case GPU_REG_INDEX(internal.pipeline.command_buffer.trigger[1]): - SubmitCmdList(1); - break; - default: - break; + if (impl->command_queue) { + // Async GPU path: protect with mutex and execute operations + std::lock_guard lock(impl->rasterizer_mutex); + impl->pica.regs.reg_array[index] = data; + + // Handle registers that trigger GPU actions asynchronously + switch (index) { + case GPU_REG_INDEX(memory_fill_config[0].trigger): + if (data) + MemoryFill(0, 0); + break; + case GPU_REG_INDEX(memory_fill_config[1].trigger): + if (data) + MemoryFill(1, 1); + break; + case GPU_REG_INDEX(display_transfer_config.trigger): + if (data) + MemoryTransfer(); + break; + case GPU_REG_INDEX(internal.pipeline.command_buffer.trigger[0]): + if (data) + SubmitCmdList(0); + break; + case GPU_REG_INDEX(internal.pipeline.command_buffer.trigger[1]): + if (data) + SubmitCmdList(1); + break; + default: + break; + } + } else { + // Synchronous GPU path: no mutex needed, execute directly + impl->pica.regs.reg_array[index] = data; + + // Handle registers that trigger GPU actions synchronously + switch (index) { + case GPU_REG_INDEX(memory_fill_config[0].trigger): + if (data) + MemoryFill(0, 0); + break; + case GPU_REG_INDEX(memory_fill_config[1].trigger): + if (data) + MemoryFill(1, 1); + break; + case GPU_REG_INDEX(display_transfer_config.trigger): + if (data) + MemoryTransfer(); + break; + case GPU_REG_INDEX(internal.pipeline.command_buffer.trigger[0]): + if (data) + SubmitCmdList(0); + break; + case GPU_REG_INDEX(internal.pipeline.command_buffer.trigger[1]): + if (data) + SubmitCmdList(1); + break; + default: + break; + } } break; } @@ -350,6 +394,33 @@ void GPU::ReportLoadingProgramID(u64 program_ID) { impl->rasterizer->SetAccurateMul(use_accurate_mul); } +void GPU::WaitForGPUCompletion() { + if (impl->command_queue) { + impl->command_queue->WaitForIdle(); + } +} + +bool GPU::IsGPUCommandQueueIdle() const { + if (impl->command_queue) { + return impl->command_queue->IsIdle(); + } + return true; +} + +void GPU::SignalGPUFlush() { + if (impl->command_queue) { + impl->command_queue->SignalFlush(); + } +} + +void GPU::QueueInternalCommand(const Service::GSP::Command& command) { + if (impl->command_queue) { + impl->command_queue->QueueCommand(command); + } else { + ExecuteCommand(command); + } +} + void GPU::SubmitCmdList(u32 index) { // Check if a command list was triggered. auto& config = impl->pica.regs.internal.pipeline.command_buffer; @@ -429,7 +500,15 @@ void GPU::MemoryTransfer() { } void GPU::VBlankCallback(std::uintptr_t user_data, s64 cycles_late) { - // Present renderered frame. + // Signal GPU to flush any pending work before presenting. + // Use non-blocking signal because VBlank happens on timing thread and cannot block. + // The GPU worker thread will process queued commands and complete them before + // the next frame if there's time, enabling proper async operation. + if (impl->command_queue) { + impl->command_queue->SignalFlush(); + } + + // Present rendered frame. impl->renderer->SwapBuffers(); // Signal to GSP that GPU interrupt has occurred diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index e598f2c23..d6cec6b2c 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -38,6 +38,7 @@ constexpr u64 FRAME_TICKS = 4481136ull; class GraphicsDebugger; class RendererBase; class RightEyeDisabler; +class GPUCommandQueue; /** * The GPU class is the high level interface to the video_core for core services. @@ -78,6 +79,10 @@ public: /// Writes the provided value to the GPU virtual address. void WriteReg(VAddr addr, u32 data); + /// Queues a synthetic GPU command for an internally-triggered operation + /// Used when WriteReg triggers GPU actions that should be async + void QueueInternalCommand(const Service::GSP::Command& command); + /// Returns a mutable reference to the renderer. [[nodiscard]] VideoCore::RendererBase& Renderer(); @@ -99,6 +104,23 @@ public: void ReportLoadingProgramID(u64 program_ID); + /// Waits for all pending GPU commands to complete. + /// This should ONLY be called in critical sections where game logic depends on GPU results. + /// Normal rendering does not require this call - it happens asynchronously. + /// Examples: Memory reads after transfers, register reads that reflect GPU state + void WaitForGPUCompletion(); + + /// Check if GPU command queue is idle (non-blocking check) + [[nodiscard]] bool IsGPUCommandQueueIdle() const; + + /// Signal GPU to flush pending work (non-blocking). + /// Used at frame boundaries where timing thread cannot block. + void SignalGPUFlush(); + + // Allow GPUCommandQueue to access implementation details + struct Impl; + std::unique_ptr impl; + private: void SubmitCmdList(u32 index); @@ -113,12 +135,12 @@ private: template void serialize(Archive& ar, const u32 file_version); + friend class GPUCommandQueue; // Allow access to impl for rasterizer mutex + std::unique_ptr right_eye_disabler; private: friend class RightEyeDisabler; - struct Impl; - std::unique_ptr impl; PAddr VirtualToPhysicalAddress(VAddr addr); }; diff --git a/src/video_core/gpu_command_queue.cpp b/src/video_core/gpu_command_queue.cpp index de637f7ff..fc0638d49 100644 --- a/src/video_core/gpu_command_queue.cpp +++ b/src/video_core/gpu_command_queue.cpp @@ -6,12 +6,15 @@ #include #include #include "common/logging/log.h" +#include "core/frontend/emu_window.h" #include "video_core/gpu.h" #include "video_core/gpu_command_queue.h" +#include "video_core/gpu_impl.h" namespace VideoCore { -GPUCommandQueue::GPUCommandQueue(GPU& gpu) : gpu{gpu} { +GPUCommandQueue::GPUCommandQueue(GPU& gpu, std::unique_ptr context) + : gpu{gpu}, graphics_context{std::move(context)} { worker_thread = std::make_unique([this] { ProcessCommandQueue(); }); } @@ -33,6 +36,17 @@ void GPUCommandQueue::WaitForIdle() { idle_cv.wait(lock, [this] { return is_idle; }); } +void GPUCommandQueue::SignalFlush() { + // Non-blocking signal that GPU should flush pending work + // Used at frame boundaries where we can't block the timing thread + { + std::lock_guard lock(queue_mutex); + // Just ensure the worker is awake to process any remaining commands + // Don't wait for completion + } + queue_cv.notify_one(); +} + void GPUCommandQueue::Shutdown() { { std::lock_guard lock(queue_mutex); @@ -51,6 +65,9 @@ bool GPUCommandQueue::IsIdle() const { } void GPUCommandQueue::ProcessCommandQueue() { + // Execute queued commands on a dedicated worker thread. + // Rasterizer access is protected by a mutex to ensure thread safety. + while (true) { Service::GSP::Command command; bool has_command = false; @@ -72,8 +89,10 @@ void GPUCommandQueue::ProcessCommandQueue() { } } - // Process the command outside the lock - no artificial delays + // Process the command outside the queue lock but with rasterizer lock held if (has_command) { + // Hold rasterizer mutex while executing to prevent races with main thread + std::lock_guard rasterizer_lock(gpu.impl->rasterizer_mutex); gpu.ExecuteCommand(command); // Check if queue is now idle after processing this command diff --git a/src/video_core/gpu_command_queue.h b/src/video_core/gpu_command_queue.h index 015c065c4..f805f254c 100644 --- a/src/video_core/gpu_command_queue.h +++ b/src/video_core/gpu_command_queue.h @@ -13,35 +13,49 @@ #include "common/common_types.h" #include "core/hle/service/gsp/gsp_gpu.h" +namespace Frontend { +class GraphicsContext; +} + namespace VideoCore { class GPU; /** * GPU Command Queue for asynchronous GPU command processing. - * Processes GPU commands on a dedicated worker thread, similar to real 3DS hardware. + * Processes GPU commands on a dedicated worker thread with shared OpenGL context. * * Design principles: - * - No artificial delays or busy-waiting - * - Worker thread sleeps when queue is empty (OS scheduler handles CPU allocation) - * - Logic thread not blocked when rendering - * - Efficient synchronization with condition variables + * - Main thread queues GPU commands without blocking + * - Worker thread executes with shared GL context (context sharing via frontend) + * - Rasterizer cache protected by mutex for thread safety + * - Game logic runs parallel to GPU work, enabling dynamic FPS + * + * Why this works: + * - OpenGL supports context sharing: worker thread gets shared context + * - GPU objects (shaders, textures) are shared across contexts + * - Rasterizer mutex prevents cache races + * - Game logic only waits when explicitly reading GPU results */ class GPUCommandQueue { public: - explicit GPUCommandQueue(GPU& gpu); + explicit GPUCommandQueue(GPU& gpu, std::unique_ptr context); ~GPUCommandQueue(); /// Queue a GPU command for processing void QueueCommand(const Service::GSP::Command& command); - /// Wait for all queued commands to be processed + /// Wait for all queued commands to be processed (BLOCKING - use sparingly) void WaitForIdle(); + /// Non-blocking flush: Signals GPU to complete pending work but doesn't wait + /// Used for frame boundaries where we can't block the timing thread + void SignalFlush(); + /// Shutdown the command queue and worker thread void Shutdown(); - /// Check if the queue is idle + /// Check if the queue is idle (non-blocking check) [[nodiscard]] bool IsIdle() const; private: @@ -49,6 +63,7 @@ private: void ProcessCommandQueue(); GPU& gpu; + std::unique_ptr graphics_context; std::queue command_queue; mutable std::mutex queue_mutex; std::condition_variable queue_cv; diff --git a/src/video_core/gpu_impl.h b/src/video_core/gpu_impl.h index 073fa2917..ac4d08eef 100644 --- a/src/video_core/gpu_impl.h +++ b/src/video_core/gpu_impl.h @@ -13,9 +13,9 @@ #include "core/hle/service/plgldr/plgldr.h" #include "video_core/debug_utils/debug_utils.h" #include "video_core/gpu.h" +#include "video_core/gpu_command_queue.h" #include "video_core/gpu_debugger.h" #include "video_core/gpu_impl.h" -#include "video_core/gpu_command_queue.h" #include "video_core/pica/pica_core.h" #include "video_core/pica/regs_lcd.h" #include "video_core/renderer_base.h" @@ -23,6 +23,8 @@ #include "video_core/right_eye_disabler.h" #include "video_core/video_core.h" +#include + namespace VideoCore { struct GPU::Impl { Core::Timing& timing; @@ -38,6 +40,10 @@ struct GPU::Impl { Core::TimingEventType* vblank_event; Service::GSP::InterruptHandler signal_interrupt; + // Mutex to protect rasterizer access when async GPU is enabled + // Ensures cache consistency when accessed from multiple threads + mutable std::mutex rasterizer_mutex; + explicit Impl(Core::System& system, Frontend::EmuWindow& emu_window, Frontend::EmuWindow* secondary_window) : timing{system.CoreTiming()}, system{system}, memory{system.Memory()},