Vulkan: Properly shutdown pipeline compile threads + code cleanup

This fixes an issue where the Vulkan renderer would hang on shutdown due to resources still being in use
This commit is contained in:
Exzap 2026-02-09 04:11:10 +01:00
parent 2c03ac3217
commit 8cd5ce102f
6 changed files with 90 additions and 77 deletions

View File

@ -290,7 +290,6 @@ public:
{
if (m_hasCacheAlloc)
{
cemu_assert_debug(isInUse() == false);
g_gpuBufferHeap->freeOffset(m_cacheOffset);
m_hasCacheAlloc = false;
}
@ -836,6 +835,8 @@ public:
continue;
}
// delete range
if (node->m_hasCacheAlloc)
cemu_assert_debug(!node->isInUse());
node->ReleaseCacheMemoryImmediately();
LatteBufferCache_removeSingleNodeFromTree(node);
delete node;

View File

@ -115,11 +115,7 @@ struct LatteDecompilerCFInstruction
cemu_assert_debug(!(instructionsALU.size() != 0 && instructionsTEX.size() != 0)); // make sure we haven't accidentally added the wrong instruction type
}
#if BOOST_OS_WINDOWS
LatteDecompilerCFInstruction(LatteDecompilerCFInstruction& mE) = default;
#else
LatteDecompilerCFInstruction(const LatteDecompilerCFInstruction& mE) = default;
#endif
LatteDecompilerCFInstruction(LatteDecompilerCFInstruction&& mE) = default;
LatteDecompilerCFInstruction& operator=(LatteDecompilerCFInstruction&& mE) = default;

View File

@ -6,11 +6,10 @@
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Cafe/OS/libs/gx2/GX2.h"
#include "config/ActiveSettings.h"
#include "util/helpers/helpers.h"
#include "util/helpers/Serializer.h"
#include "Cafe/HW/Latte/Common/RegisterSerializer.h"
std::mutex s_nvidiaWorkaround;
/* rects emulation */
void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister)
@ -923,7 +922,6 @@ bool PipelineCompiler::InitFromCurrentGPUState(PipelineInfo* pipelineInfo, const
if (result != VK_SUCCESS)
{
cemuLog_log(LogType::Force, "Failed to create pipeline layout: {}", result);
s_nvidiaWorkaround.unlock();
return false;
}
@ -941,7 +939,7 @@ bool PipelineCompiler::InitFromCurrentGPUState(PipelineInfo* pipelineInfo, const
// increment ref counter for vkrObjPipeline and renderpass object to make sure they dont get released while we are using them
m_vkrObjPipeline->incRef();
renderPassObj->incRef();
m_renderPassObj->incRef();
return true;
}
@ -1121,3 +1119,73 @@ bool PipelineCompiler::CalcRobustBufferAccessRequirement(LatteDecompilerShader*
}
return requiresRobustBufferAcces;
}
// worker threads for asynchronous pipeline compilation
static std::vector<std::thread> s_compileThreads;
// set to true to request all compile threads to exit their work loop
static std::atomic_bool s_compileThreadsShutdownSignal{};
// pending compile jobs; a nullptr entry acts as a wake-up sentinel during shutdown
static ConcurrentQueue<PipelineCompiler*> s_pipelineCompileRequests;
// worker loop for the pipeline compile thread pool
// pops queued PipelineCompiler jobs, compiles them and frees them; exits when the shutdown signal is set
static void compilePipeline_thread(sint32 threadIndex)
{
	SetThreadName("compilePl");
#ifdef _WIN32
	// all workers except the first run below normal priority so the main cpu and render
	// threads are not starved, while one normal-priority worker guarantees that pipeline
	// compilation itself can never be fully starved either
	if (threadIndex != 0)
		SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL);
#endif
	for (;;)
	{
		if (s_compileThreadsShutdownSignal)
			break;
		PipelineCompiler* job = s_pipelineCompileRequests.pop();
		if (job)
		{
			job->Compile(true, false, true);
			delete job; // the pool owns queued compilers and frees them after compiling
		}
		// a nullptr job is only a wake-up sentinel; loop around to re-check the shutdown signal
	}
}
void PipelineCompiler::CompileThreadPool_Start()
{
cemu_assert_debug(s_compileThreads.empty());
s_compileThreadsShutdownSignal = false;
uint32 numCompileThreads;
uint32 cpuCoreCount = GetPhysicalCoreCount();
if (cpuCoreCount <= 2)
numCompileThreads = 1;
else
numCompileThreads = 2 + (cpuCoreCount - 3); // 2 plus one additionally for every extra core above 3
numCompileThreads = std::min(numCompileThreads, 8u); // cap at 8
for (uint32_t i = 0; i < numCompileThreads; i++)
{
s_compileThreads.emplace_back(compilePipeline_thread, i);
}
}
// shut down the pipeline compile worker threads and release any pending compile requests
void PipelineCompiler::CompileThreadPool_Stop()
{
	s_compileThreadsShutdownSignal = true;
	// push one null sentinel per thread so every worker blocked in pop()
	// is woken up and gets to observe the shutdown signal
	for (size_t i = 0; i < s_compileThreads.size(); i++)
		s_pipelineCompileRequests.push(nullptr);
	for (auto& thread : s_compileThreads)
		thread.join();
	// drain the queue. Sentinels and real requests may be interleaved, so we must not
	// stop at the first nullptr — doing so would leak any request queued after a sentinel
	while (!s_pipelineCompileRequests.empty())
	{
		PipelineCompiler* pipelineCompiler = s_pipelineCompileRequests.pop();
		delete pipelineCompiler; // deleting nullptr is a harmless no-op
	}
	s_compileThreads.clear();
}
// hand a pipeline compiler to the worker pool for asynchronous compilation
// ownership of v transfers to the pool; the worker thread deletes it after compiling
void PipelineCompiler::CompileThreadPool_QueueCompilation(PipelineCompiler* v)
{
s_pipelineCompileRequests.push(v);
}

View File

@ -1,4 +1,6 @@
#pragma once
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
#include "VKRBase.h"
class PipelineCompiler : public VKRMoveableRefCounter
{
@ -43,6 +45,11 @@ public:
static bool CalcRobustBufferAccessRequirement(LatteDecompilerShader* vertexShader, LatteDecompilerShader* pixelShader, LatteDecompilerShader* geometryShader);
// API for thread pool
static void CompileThreadPool_Start();
static void CompileThreadPool_Stop();
static void CompileThreadPool_QueueCompilation(PipelineCompiler* v);
VkPipelineLayout m_pipelineLayout;
VKRObjectRenderPass* m_renderPassObj{};
bool m_requestRobustBufferAccess{false};

View File

@ -4,6 +4,7 @@
#include "Cafe/HW/Latte/Renderer/Vulkan/RendererShaderVk.h"
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanTextureReadback.h"
#include "Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.h"
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanPipelineCompiler.h"
#include "Cafe/HW/Latte/Core/LatteBufferCache.h"
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
@ -653,7 +654,8 @@ VulkanRenderer::VulkanRenderer()
m_occlusionQueries.list_availableQueryIndices.emplace_back(i);
// start compilation threads
RendererShaderVk::Init();
RendererShaderVk::Init(); // shaders
PipelineCompiler::CompileThreadPool_Start(); // pipelines
}
VulkanRenderer::~VulkanRenderer()
@ -661,8 +663,6 @@ VulkanRenderer::~VulkanRenderer()
SubmitCommandBuffer();
WaitDeviceIdle();
WaitCommandBufferFinished(GetCurrentCommandBufferId());
// make sure compilation threads have been shut down
RendererShaderVk::Shutdown();
// shut down pipeline save thread
m_destructionRequested = true;
m_pipeline_cache_semaphore.notify();
@ -1666,6 +1666,10 @@ void VulkanRenderer::Shutdown()
{
SubmitCommandBuffer();
WaitDeviceIdle();
// stop compilation threads
RendererShaderVk::Shutdown();
PipelineCompiler::CompileThreadPool_Stop();
DeleteFontTextures();
Renderer::Shutdown();
if (m_imguiRenderPass != VK_NULL_HANDLE)

View File

@ -183,63 +183,6 @@ void VulkanRenderer::unregisterGraphicsPipeline(PipelineInfo* pipelineInfo)
}
}
// state for the pipeline compile worker threads
bool g_compilePipelineThreadInit{false}; // set once the worker threads have been spawned
std::mutex g_compilePipelineMutex; // guards g_compilePipelineRequests
std::condition_variable g_compilePipelineCondVar; // signaled when a new request is queued
std::queue<PipelineCompiler*> g_compilePipelineRequests; // pending compile requests
// worker loop for asynchronous pipeline compilation
// note: loops forever — there is no shutdown/exit path out of this function
void compilePipeline_thread(sint32 threadIndex)
{
SetThreadName("compilePl");
#ifdef _WIN32
// one thread runs at normal priority while the others run at lower priority
if(threadIndex != 0)
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL);
#endif
while (true)
{
// block until at least one compile request has been queued
std::unique_lock lock(g_compilePipelineMutex);
while (g_compilePipelineRequests.empty())
g_compilePipelineCondVar.wait(lock);
// dequeue the request, then drop the lock before the (slow) compile call
PipelineCompiler* request = g_compilePipelineRequests.front();
g_compilePipelineRequests.pop();
lock.unlock();
request->Compile(true, false, true);
// queued compilers are owned by this worker once dequeued; free after compiling
delete request;
}
}
void compilePipelineThread_init()
{
uint32 numCompileThreads;
uint32 cpuCoreCount = GetPhysicalCoreCount();
if (cpuCoreCount <= 2)
numCompileThreads = 1;
else
numCompileThreads = 2 + (cpuCoreCount - 3); // 2 plus one additionally for every extra core above 3
numCompileThreads = std::min(numCompileThreads, 8u); // cap at 8
for (uint32_t i = 0; i < numCompileThreads; i++)
{
std::thread compileThread(compilePipeline_thread, i);
compileThread.detach();
}
}
// enqueue a compile request and wake one worker thread
void compilePipelineThread_queue(PipelineCompiler* v)
{
	{
		std::scoped_lock lock(g_compilePipelineMutex);
		g_compilePipelineRequests.push(v);
	}
	// notify outside the critical section to avoid waking a thread that immediately blocks on the mutex
	g_compilePipelineCondVar.notify_one();
}
// make a guess if a pipeline is not essential
// non-essential means that skipping these drawcalls shouldn't lead to permanently corrupted graphics
bool VulkanRenderer::IsAsyncPipelineAllowed(uint32 numIndices)
@ -270,12 +213,6 @@ bool VulkanRenderer::IsAsyncPipelineAllowed(uint32 numIndices)
// create graphics pipeline for current state
PipelineInfo* VulkanRenderer::draw_createGraphicsPipeline(uint32 indexCount)
{
if (!g_compilePipelineThreadInit)
{
compilePipelineThread_init();
g_compilePipelineThreadInit = true;
}
const auto fetchShader = LatteSHRC_GetActiveFetchShader();
const auto vertexShader = LatteSHRC_GetActiveVertexShader();
const auto geometryShader = LatteSHRC_GetActiveGeometryShader();
@ -313,7 +250,7 @@ PipelineInfo* VulkanRenderer::draw_createGraphicsPipeline(uint32 indexCount)
if (pipelineCompiler->Compile(false, true, true) == false)
{
// shaders or pipeline not cached -> asynchronous compilation
compilePipelineThread_queue(pipelineCompiler);
PipelineCompiler::CompileThreadPool_QueueCompilation(pipelineCompiler);
}
else
{
@ -379,7 +316,7 @@ float s_vkUniformData[512 * 4];
uint32 VulkanRenderer::uniformData_uploadUniformDataBufferGetOffset(std::span<uint8> data)
{
const uint32 bufferAlignmentM1 = std::max(m_featureControl.limits.minUniformBufferOffsetAlignment, m_featureControl.limits.nonCoherentAtomSize) - 1;
const uint32 uniformSize = (data.size() + bufferAlignmentM1) & ~bufferAlignmentM1;
const uint32 uniformSize = ((uint32)data.size() + bufferAlignmentM1) & ~bufferAlignmentM1;
auto waitWhileCondition = [&](std::function<bool()> condition) {
while (condition())