From c279583f1f360ffb0641afeb6f3b468573643a1d Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 5 Apr 2026 23:23:02 +0300 Subject: [PATCH 01/11] vk: Allow cubemap unwrap to generate more than 1 mipmap level --- rpcs3/Emu/RSX/VK/VKDraw.cpp | 4 ++++ rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/VKDraw.cpp b/rpcs3/Emu/RSX/VK/VKDraw.cpp index 5c3737fdf2..f7dffc3029 100644 --- a/rpcs3/Emu/RSX/VK/VKDraw.cpp +++ b/rpcs3/Emu/RSX/VK/VKDraw.cpp @@ -471,6 +471,10 @@ void VKGSRender::load_texture_env() // Clamp min and max lod actual_mipmaps = static_cast(sampler_state->external_subresource_desc.sections_to_copy.size()); } + else if (sampler_state->external_subresource_desc.op == rsx::deferred_request_command::cubemap_unwrap) + { + actual_mipmaps = static_cast(sampler_state->external_subresource_desc.mipmaps); + } else { actual_mipmaps = 1.f; diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index c9d9599b9e..206903bf64 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -761,8 +761,9 @@ namespace vk const rsx::simple_array& sections_to_copy, const rsx::texture_channel_remap_t& remap_vector) { auto _template = get_template_from_collection_impl(sections_to_copy); + const u8 mip_count = 1 + sections_to_copy.reduce(0, FN(std::max(x, y.level))); auto result = create_temporary_subresource_view_impl(cmd, _template, VK_IMAGE_TYPE_2D, - VK_IMAGE_VIEW_TYPE_CUBE, gcm_format, 0, 0, size, size, 1, 1, remap_vector, false); + VK_IMAGE_VIEW_TYPE_CUBE, gcm_format, 0, 0, size, size, 1, mip_count, remap_vector, false); if (!result) { @@ -772,7 +773,7 @@ namespace vk const auto image = result->image(); VkImageAspectFlags dst_aspect = vk::get_aspect_flags(result->info.format); - VkImageSubresourceRange dst_range = { dst_aspect, 0, 1, 0, 6 }; + VkImageSubresourceRange dst_range = { dst_aspect, 0, mip_count, 0, 6 }; vk::change_image_layout(cmd, 
image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, dst_range); if (!(dst_aspect & VK_IMAGE_ASPECT_DEPTH_BIT)) From b700a7abd9201ad6227903f6fcc7a31f545470a3 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 5 Apr 2026 23:35:29 +0300 Subject: [PATCH 02/11] gl: Extend mipcount support for reconstructed images --- rpcs3/Emu/RSX/GL/GLDraw.cpp | 16 +++++++++++++++- rpcs3/Emu/RSX/GL/GLTextureCache.h | 3 ++- rpcs3/Emu/RSX/GL/glutils/sampler.cpp | 5 ++--- rpcs3/Emu/RSX/GL/glutils/sampler.h | 2 +- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLDraw.cpp b/rpcs3/Emu/RSX/GL/GLDraw.cpp index d0c2e233e9..0abf0111e6 100644 --- a/rpcs3/Emu/RSX/GL/GLDraw.cpp +++ b/rpcs3/Emu/RSX/GL/GLDraw.cpp @@ -384,7 +384,21 @@ void GLGSRender::load_texture_env() } } - m_fs_sampler_states[i].apply(tex, fs_sampler_state[i].get()); + u32 actual_mipcount = 1; + if (sampler_state->upload_context == rsx::texture_upload_context::shader_read) + { + actual_mipcount = tex.get_exact_mipmap_count(); + } + else if (sampler_state->external_subresource_desc.op == rsx::deferred_request_command::mipmap_gather) + { + actual_mipcount = sampler_state->external_subresource_desc.sections_to_copy.size(); + } + else if (sampler_state->external_subresource_desc.op == rsx::deferred_request_command::cubemap_unwrap) + { + actual_mipcount = sampler_state->external_subresource_desc.mipmaps; + } + + m_fs_sampler_states[i].apply(tex, fs_sampler_state[i].get(), actual_mipcount > 1); const auto texture_format = sampler_state->format_ex.format(); // Depth format redirected to BGRA8 resample stage. Do not filter to avoid bits leaking. 
diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index 27b455374e..93c0ba2f5c 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -586,7 +586,8 @@ namespace gl gl::texture_view* generate_cubemap_from_images(gl::command_context& cmd, u32 gcm_format, u16 size, const rsx::simple_array& sources, const rsx::texture_channel_remap_t& remap_vector) override { auto _template = get_template_from_collection_impl(sources); - auto result = create_temporary_subresource_impl(cmd, _template, GL_NONE, GL_TEXTURE_CUBE_MAP, gcm_format, 0, 0, size, size, 1, 1, remap_vector, false); + const u8 mip_count = 1 + sources.reduce(0, FN(std::max(x, y.level))); + auto result = create_temporary_subresource_impl(cmd, _template, GL_NONE, GL_TEXTURE_CUBE_MAP, gcm_format, 0, 0, size, size, 1, mip_count, remap_vector, false); copy_transfer_regions_impl(cmd, result->image(), sources); return result; diff --git a/rpcs3/Emu/RSX/GL/glutils/sampler.cpp b/rpcs3/Emu/RSX/GL/glutils/sampler.cpp index 387228983c..4b1b603fc6 100644 --- a/rpcs3/Emu/RSX/GL/glutils/sampler.cpp +++ b/rpcs3/Emu/RSX/GL/glutils/sampler.cpp @@ -72,7 +72,7 @@ namespace gl } // Apply sampler state settings - void sampler_state::apply(const rsx::fragment_texture& tex, const rsx::sampled_image_descriptor_base* sampled_image) + void sampler_state::apply(const rsx::fragment_texture& tex, const rsx::sampled_image_descriptor_base* sampled_image, bool allow_mipmaps) { set_parameteri(GL_TEXTURE_WRAP_S, wrap_mode(tex.wrap_s())); set_parameteri(GL_TEXTURE_WRAP_T, wrap_mode(tex.wrap_t())); @@ -114,8 +114,7 @@ namespace gl } } - if (sampled_image->upload_context != rsx::texture_upload_context::shader_read || - tex.get_exact_mipmap_count() == 1) + if (!allow_mipmaps || tex.get_exact_mipmap_count() == 1) { GLint min_filter = tex_min_filter(tex.min_filter()); diff --git a/rpcs3/Emu/RSX/GL/glutils/sampler.h b/rpcs3/Emu/RSX/GL/glutils/sampler.h index 89200915f8..8e8482f196 100644 
--- a/rpcs3/Emu/RSX/GL/glutils/sampler.h +++ b/rpcs3/Emu/RSX/GL/glutils/sampler.h @@ -75,7 +75,7 @@ namespace gl return (prop == m_propertiesf.end()) ? 0 : prop->second; } - void apply(const rsx::fragment_texture& tex, const rsx::sampled_image_descriptor_base* sampled_image); + void apply(const rsx::fragment_texture& tex, const rsx::sampled_image_descriptor_base* sampled_image, bool allow_mipmaps = true); void apply(const rsx::vertex_texture& tex, const rsx::sampled_image_descriptor_base* sampled_image); void apply_defaults(GLenum default_filter = GL_NEAREST); From 34c26eff68c948ff4a6520e886badcc2b58ddad0 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 5 Apr 2026 23:36:27 +0300 Subject: [PATCH 03/11] rsx: Extend cubemap_unwrap decode to handle cubemaps with mipmaps --- rpcs3/Emu/RSX/Common/texture_cache.h | 42 +++++++++++++++++----------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index 8aed0ccc34..295090a5bc 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -1731,24 +1731,34 @@ namespace rsx } case deferred_request_command::cubemap_unwrap: { - rsx::simple_array sections(6); - for (u16 n = 0; n < 6; ++n) + rsx::simple_array sections(6 * desc.mipmaps); + for (u16 n = 0, section_id = 0; n < 6; ++n) { - sections[n] = + u16 mip_w = desc.width, mip_h = desc.height; + u16 y_offset = static_cast(desc.slice_h * n); + + for (u8 mip = 0; mip < desc.mipmaps; ++mip) { - .src = desc.external_handle, - .xform = surface_transform::coordinate_transform, - .level = 0, - .src_x = 0, - .src_y = static_cast(desc.slice_h * n), - .dst_x = 0, - .dst_y = 0, - .dst_z = n, - .src_w = desc.width, - .src_h = desc.height, - .dst_w = desc.width, - .dst_h = desc.height - }; + sections[section_id++] = + { + .src = desc.external_handle, + .xform = surface_transform::coordinate_transform, + .level = mip, + .src_x = 0, + .src_y = y_offset, + .dst_x = 0, + 
.dst_y = 0, + .dst_z = n, + .src_w = mip_w, + .src_h = mip_h, + .dst_w = mip_w, + .dst_h = mip_h + }; + + y_offset += mip_h; + mip_w = std::max(mip_w / 2, 1); + mip_h = std::max(mip_h / 2, 1); + } } result = generate_cubemap_from_images(cmd, desc.gcm_format, desc.width, sections, desc.remap); From cb276f0da7f1a886044d3f191d95955230e7e32d Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 6 Apr 2026 22:55:06 +0300 Subject: [PATCH 04/11] vk: Insert transfer->transfer barriers before creating aggregates --- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 40 +++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index 206903bf64..004061d881 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -787,6 +787,14 @@ namespace vk vkCmdClearDepthStencilImage(cmd, image->value, image->current_layout, &clear, 1, &dst_range); } + vk::insert_image_memory_barrier( + cmd, + image->handle(), + image->current_layout, image->current_layout, + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, + dst_range); + copy_transfer_regions_impl(cmd, image, sections_to_copy); vk::change_image_layout(cmd, image, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, dst_range); @@ -822,6 +830,14 @@ namespace vk vkCmdClearDepthStencilImage(cmd, image->value, image->current_layout, &clear, 1, &dst_range); } + vk::insert_image_memory_barrier( + cmd, + image->handle(), + image->current_layout, image->current_layout, + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, + dst_range); + copy_transfer_regions_impl(cmd, image, sections_to_copy); vk::change_image_layout(cmd, image, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, dst_range); @@ -860,6 +876,14 @@ namespace vk } } + vk::insert_image_memory_barrier( + cmd, + image->handle(), + 
image->current_layout, image->current_layout, + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, + dst_range); + copy_transfer_regions_impl(cmd, image, sections_to_copy); vk::change_image_layout(cmd, image, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, dst_range); @@ -896,6 +920,14 @@ namespace vk vkCmdClearDepthStencilImage(cmd, image->value, image->current_layout, &clear, 1, &dst_range); } + vk::insert_image_memory_barrier( + cmd, + image->handle(), + image->current_layout, image->current_layout, + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, + dst_range); + copy_transfer_regions_impl(cmd, image, sections_to_copy); vk::change_image_layout(cmd, image, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, dst_range); @@ -1016,6 +1048,14 @@ namespace vk VkClearDepthStencilValue clear{ 1.f, 255 }; vkCmdClearDepthStencilImage(cmd, image->value, image->current_layout, &clear, 1, &range); } + + vk::insert_image_memory_barrier( + cmd, + image->handle(), + image->current_layout, image->current_layout, + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, + range); } } } From 5311f004d09e7771792405d74df510774a8e44e0 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 6 Apr 2026 22:55:46 +0300 Subject: [PATCH 05/11] vk: Insert barriers on the scratch buffer when detiling memory --- rpcs3/Emu/RSX/VK/VKTexture.cpp | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index bc6aabf2d4..c5fd4f37e9 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -1304,6 +1304,15 @@ namespace vk .image_bpp = bpp }; + // Pre-Transfer barrier + vk::insert_buffer_memory_barrier( + cmd, + scratch_buf->value, + tiled_data_scratch_offset, section_length, + 
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT + ); + // Transfer VkBufferCopy copy_rgn { @@ -1313,16 +1322,25 @@ namespace vk }; vkCmdCopyBuffer(cmd, dma_mapping.second->value, scratch_buf->value, 1, &copy_rgn); - // Barrier + // Post-Transfer barrier vk::insert_buffer_memory_barrier( - cmd, scratch_buf->value, linear_data_scratch_offset, section_length, + cmd, scratch_buf->value, tiled_data_scratch_offset, section_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); + // Pre-Compute barrier + vk::insert_buffer_memory_barrier( + cmd, + scratch_buf->value, + linear_data_scratch_offset, static_cast(width) * height * bpp, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_MEMORY_WRITE_BIT + ); + // Detile vk::get_compute_task>()->run(cmd, config); - // Barrier + // Post-Compute barrier vk::insert_buffer_memory_barrier( cmd, scratch_buf->value, linear_data_scratch_offset, static_cast(width) * height * bpp, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, From b9f05ba71b53d2074465b0b85ad9c8a21d9f70e1 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 6 Apr 2026 22:56:16 +0300 Subject: [PATCH 06/11] vk: Insert a all_commands->transfer barrier before copying query results to scratch --- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 5906c14824..d83e7490c8 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -2915,9 +2915,19 @@ void VKGSRender::begin_conditional_rendering(const std::vectorvalue, 0, num_hw_queries * 4, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, +
VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT); + } + const auto count = (query_range.last - query_range.first + 1); m_occlusion_query_manager->get_query_result_indirect(*m_current_command_buffer, query_range.first, count, scratch->value, dst_offset); dst_offset += count * 4; From 8ab0ceaa6762c9d807d7980b15abf44a7bdfc318 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 6 Apr 2026 23:50:44 +0300 Subject: [PATCH 07/11] vk: Wrap scratch buffer access with proper memory barriers --- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 15 ++++------- rpcs3/Emu/RSX/VK/VKRenderTargets.h | 2 +- rpcs3/Emu/RSX/VK/VKTexture.cpp | 39 +++++++--------------------- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 13 ++++++---- rpcs3/Emu/RSX/VK/vkutils/scratch.cpp | 8 +++++- rpcs3/Emu/RSX/VK/vkutils/scratch.h | 8 +++++- 6 files changed, 37 insertions(+), 48 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index d83e7490c8..8da72284d9 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -2115,7 +2115,7 @@ void VKGSRender::load_program_env() if (vk::emulate_conditional_rendering()) { - const vk::buffer& predicate = m_cond_render_buffer ? *m_cond_render_buffer : *vk::get_scratch_buffer(*m_current_command_buffer, 4); + const vk::buffer& predicate = m_cond_render_buffer ? *m_cond_render_buffer : *vk::get_scratch_buffer(*m_current_command_buffer, 4, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_ACCESS_NONE); const u32 offset = cond_render_ctrl.hw_cond_active ? 0 : 4; m_program->bind_uniform({ predicate, offset, 4 }, vk::glsl::binding_set_index_vertex, m_vs_binding_table->cr_pred_buffer_location); } @@ -2910,22 +2910,17 @@ void VKGSRender::begin_conditional_rendering(const std::vector 0) { // We'll need to do some result aggregation using a compute shader. - auto scratch = vk::get_scratch_buffer(*m_current_command_buffer, num_hw_queries * 4); + vk::buffer* scratch = nullptr; // Range latching. 
Because of how the query pool manages allocations using a stack, we get an inverse sequential set of handles/indices that we can easily group together. // This drastically boosts performance on some drivers like the NVIDIA proprietary one that seems to have a rather high cost for every individual query transer command. struct { u32 first, last; } query_range = { umax, 0 }; - bool need_barrier = true; auto copy_query_range_impl = [&]() { - if (need_barrier) + if (!scratch) { - need_barrier = false; - vk::insert_buffer_memory_barrier(*m_current_command_buffer, scratch->value, 0, num_hw_queries * 4, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT); + scratch = vk::get_scratch_buffer(*m_current_command_buffer, num_hw_queries * 4, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); } const auto count = (query_range.last - query_range.first + 1); @@ -2974,7 +2969,7 @@ void VKGSRender::begin_conditional_rendering(const std::vectorsize()); + ensure(scratch && dst_offset <= scratch->size()); if (!partial_eval) { diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.h b/rpcs3/Emu/RSX/VK/VKRenderTargets.h index c040e9bca0..7da26f3b5c 100644 --- a/rpcs3/Emu/RSX/VK/VKRenderTargets.h +++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.h @@ -598,7 +598,7 @@ namespace vk const auto transfer_size = surface->get_memory_range().length(); if (transfer_size > max_copy_length || src_offset_in_buffer || surface->is_depth_surface()) { - auto scratch = vk::get_scratch_buffer(cmd, transfer_size * 4); + auto scratch = vk::get_scratch_buffer(cmd, transfer_size * 4, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); dest = scratch; } diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index c5fd4f37e9..c69064dbe4 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -376,7 +376,7 @@ namespace vk const auto min_scratch_size = 
calculate_working_buffer_size(src_length, src->aspect() | dst->aspect()); // Initialize scratch memory - auto scratch_buf = vk::get_scratch_buffer(cmd, min_scratch_size); + auto scratch_buf = vk::get_scratch_buffer(cmd, min_scratch_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); for (u32 mip_level = 0; mip_level < mipmaps; ++mip_level) { @@ -601,7 +601,7 @@ namespace vk const auto dst_w = dst_rect.width(); const auto dst_h = dst_rect.height(); - auto scratch_buf = vk::get_scratch_buffer(cmd, std::max(src_w, dst_w) * std::max(src_h, dst_h) * 4); + auto scratch_buf = vk::get_scratch_buffer(cmd, std::max(src_w, dst_w) * std::max(src_h, dst_h) * 4, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); //1. Copy unscaled to typeless surface VkBufferImageCopy info{}; @@ -1124,7 +1124,7 @@ namespace vk scratch_buf_size += (image_linear_size * 5) / 4; } - scratch_buf = vk::get_scratch_buffer(cmd2, scratch_buf_size); + scratch_buf = vk::get_scratch_buffer(cmd2, scratch_buf_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); buffer_copies.reserve(subresource_layout.size()); } @@ -1183,13 +1183,6 @@ namespace vk { ensure(scratch_buf); - // WAW hazard - complete previous work before executing any transfers - insert_buffer_memory_barrier( - cmd2, scratch_buf->value, 0, scratch_offset, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT); - if (upload_commands.size() > 1) { auto range_ptr = buffer_copies.data(); @@ -1199,8 +1192,9 @@ namespace vk range_ptr += op.second; } } - else if (!buffer_copies.empty()) + else { + ensure(!buffer_copies.empty()); vkCmdCopyBuffer(cmd2, upload_buffer->value, scratch_buf->value, static_cast(buffer_copies.size()), buffer_copies.data()); } @@ -1279,7 +1273,10 @@ namespace vk vk::load_dma(range.start, section_length); // Allocate scratch and prepare for the GPU job - const auto scratch_buf = 
vk::get_scratch_buffer(cmd, section_length * 3); // 0 = linear data, 1 = padding (deswz), 2 = tiled data + const auto scratch_buf = vk::get_scratch_buffer(cmd, section_length * 3, // 0 = linear data, 1 = padding (deswz), 2 = tiled data + VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT); + const auto tiled_data_scratch_offset = section_length * 2; const auto linear_data_scratch_offset = 0u; @@ -1304,15 +1301,6 @@ namespace vk .image_bpp = bpp }; - // Pre-Transfer barrier - vk::insert_buffer_memory_barrier( - cmd, - scratch_buf->value, - tiled_data_scratch_offset, section_length, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT - ); - // Transfer VkBufferCopy copy_rgn { @@ -1328,15 +1316,6 @@ namespace vk VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); - // Pre-Compute barrier - vk::insert_buffer_memory_barrier( - cmd, - scratch_buf->value, - linear_data_scratch_offset, static_cast(width) * height * bpp, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_MEMORY_WRITE_BIT - ); - // Detile vk::get_compute_task>()->run(cmd, config); diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index 004061d881..a3fc048a27 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -130,9 +130,10 @@ namespace vk dma_sync_region = tiled_region.tile_align(dma_sync_region); } #endif - - auto working_buffer = vk::get_scratch_buffer(cmd, working_buffer_length); u32 result_offset = 0; + auto working_buffer = vk::get_scratch_buffer(cmd, working_buffer_length, + VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT | 
VK_ACCESS_SHADER_WRITE_BIT); VkBufferImageCopy region = {}; region.imageSubresource = { src->aspect(), 0, 0, 1 }; @@ -220,7 +221,7 @@ namespace vk // Transfer -> Compute barrier vk::insert_buffer_memory_barrier(cmd, working_buffer->value, dst_offset, dma_sync_region.length(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT); + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT); } // Prepare payload @@ -284,8 +285,10 @@ namespace vk if (require_rw_barrier) { vk::insert_buffer_memory_barrier(cmd, working_buffer->value, result_offset, dma_sync_region.length(), - VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); + VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT); } if (rsx_pitch == real_pitch) [[likely]] diff --git a/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp b/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp index 041067bea6..04cfded55c 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp @@ -177,7 +177,7 @@ namespace vk return { scratch_buffer.get(), is_new }; } - vk::buffer* get_scratch_buffer(const vk::command_buffer& cmd, u64 min_required_size, bool zero_memory) + vk::buffer* get_scratch_buffer(const vk::command_buffer& cmd, u64 min_required_size, VkPipelineStageFlags dst_stage_flags, VkAccessFlags dst_access, bool zero_memory) { const auto [buf, init_mem] = get_scratch_buffer(cmd.get_queue_family(), min_required_size); @@ -191,6 +191,12 @@ namespace vk VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, 
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT); } + else if (dst_access != VK_ACCESS_NONE) + { + insert_buffer_memory_barrier(cmd, buf->value, 0, min_required_size, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage_flags, + VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, dst_access); + } return buf; } diff --git a/rpcs3/Emu/RSX/VK/vkutils/scratch.h b/rpcs3/Emu/RSX/VK/vkutils/scratch.h index 312db68c8a..5b31289462 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/scratch.h +++ b/rpcs3/Emu/RSX/VK/vkutils/scratch.h @@ -6,7 +6,13 @@ namespace vk VkSampler null_sampler(); image_view* null_image_view(const command_buffer& cmd, VkImageViewType type); image* get_typeless_helper(VkFormat format, rsx::format_class format_class, u32 requested_width, u32 requested_height); - buffer* get_scratch_buffer(const command_buffer& cmd, u64 min_required_size, bool zero_memory = false); + + buffer* get_scratch_buffer( + const command_buffer& cmd, + u64 min_required_size, + VkPipelineStageFlags dst_stage_flags, + VkAccessFlags dst_access, + bool zero_memory = false); void clear_scratch_resources(); } From 59468f1e1ea06339212e41cd747df1ac3f9a0ac3 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 7 Apr 2026 01:12:25 +0300 Subject: [PATCH 08/11] vk: Handle WAW and RAW hazards when performing "flush" operations --- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index a3fc048a27..671529f87d 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -100,7 +100,7 @@ namespace vk auto dma_sync_region = valid_range; dma_mapping_handle dma_mapping = { 0, nullptr }; - auto dma_sync = [&dma_sync_region, &dma_mapping](bool load, bool force = false) + auto dma_sync = [&](bool load, bool force = false) { if (dma_mapping.second && 
!force) { @@ -335,6 +335,14 @@ namespace vk vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, dma_mapping.second->value, 1, &region); } + // Post-transfer barrier on dma layer + vk::insert_buffer_memory_barrier( + cmd, dma_mapping.second->value, + dma_mapping.first, dma_sync_region.length(), + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT + ); + src->pop_layout(cmd); VkBufferMemoryBarrier2KHR mem_barrier = From e0c3df5328d7beefc64e475535f714a385298886 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 7 Apr 2026 01:42:15 +0300 Subject: [PATCH 09/11] vk: Fix crash when running CPU detiler path --- rpcs3/Emu/RSX/VK/VKHelpers.h | 9 +++++---- rpcs3/Emu/RSX/VK/VKRenderTargets.cpp | 1 + rpcs3/Emu/RSX/VK/VKTexture.cpp | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index 9d07a4581e..fae75c0724 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -71,10 +71,11 @@ namespace vk enum image_upload_options { - upload_contents_async = 1, - initialize_image_layout = 2, - preserve_image_layout = 4, - source_is_gpu_resident = 8, + upload_contents_async = 0x0001, + initialize_image_layout = 0x0002, + preserve_image_layout = 0x0004, + source_is_gpu_resident = 0x0008, + source_is_userptr = 0x0010, // meta-flags upload_contents_inline = 0, diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp index 138f7e46aa..736b886a77 100644 --- a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp +++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp @@ -724,6 +724,7 @@ namespace vk subres.height_in_block ); subres.data = std::span(ext_data); + upload_flags |= source_is_userptr; #else const auto [scratch_buf, linear_data_scratch_offset] = vk::detile_memory_block(cmd, tiled_region, range, subres.width_in_block, subres.height_in_block, get_bpp()); diff --git
a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index c69064dbe4..eaf5c0b710 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -990,7 +990,7 @@ namespace vk auto pdev = vk::get_current_renderer(); rsx::texture_uploader_capabilities caps{ .supports_dxt = pdev->get_texture_compression_bc_support(), .alignment = heap_align }; rsx::texture_memory_info opt{}; - bool check_caps = true; + bool check_hw_caps = !(image_setup_flags & source_is_userptr); vk::buffer* scratch_buf = nullptr; u32 scratch_offset = 0; @@ -1015,13 +1015,13 @@ namespace vk image_linear_size = row_pitch * layout.depth * (rsx::is_compressed_host_format(caps, format) ? layout.height_in_block : layout.height_in_texel); // Only do GPU-side conversion if occupancy is good - if (check_caps) + if (check_hw_caps) { caps.supports_byteswap = (image_linear_size >= 1024) || (image_setup_flags & source_is_gpu_resident); caps.supports_hw_deswizzle = caps.supports_byteswap; caps.supports_zero_copy = caps.supports_byteswap; caps.supports_vtc_decoding = false; - check_caps = false; + check_hw_caps = false; } auto buf_allocator = [&](usz) -> std::tuple From d7f8e25cca86a0eae5f5f5d4338babee5936023c Mon Sep 17 00:00:00 2001 From: Ani Date: Tue, 7 Apr 2026 13:16:27 +0200 Subject: [PATCH 10/11] SPU: Remove RCHCNT loop handling of SPU_WrOutMbox Fixes freezing in Half-Life 2 Fixes #17958 --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 6837baaa97..87c61042a6 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -4357,11 +4357,6 @@ public: { switch (op.ra) { - case SPU_WrOutMbox: - { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_out_mbox), true); - break; - } case SPU_WrOutIntrMbox: { res.value = wait_rchcnt(::offset32(&spu_thread::ch_out_intr_mbox), true); From 
beac01d5d1bb4ccc67d8f3ab3c86e9cd374c4acf Mon Sep 17 00:00:00 2001 From: Ani Date: Tue, 7 Apr 2026 13:35:20 +0200 Subject: [PATCH 11/11] SPU: Remove RCHCNT loop handling of SPU_WrOutIntrMbox --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 87c61042a6..f0a8c9f7db 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -4357,11 +4357,6 @@ public: { switch (op.ra) { - case SPU_WrOutIntrMbox: - { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_out_intr_mbox), true); - break; - } case SPU_RdSigNotify1: { res.value = wait_rchcnt(::offset32(&spu_thread::ch_snr1));