diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index d83e7490c8..8da72284d9 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -2115,7 +2115,7 @@ void VKGSRender::load_program_env() if (vk::emulate_conditional_rendering()) { - const vk::buffer& predicate = m_cond_render_buffer ? *m_cond_render_buffer : *vk::get_scratch_buffer(*m_current_command_buffer, 4); + const vk::buffer& predicate = m_cond_render_buffer ? *m_cond_render_buffer : *vk::get_scratch_buffer(*m_current_command_buffer, 4, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_ACCESS_NONE); const u32 offset = cond_render_ctrl.hw_cond_active ? 0 : 4; m_program->bind_uniform({ predicate, offset, 4 }, vk::glsl::binding_set_index_vertex, m_vs_binding_table->cr_pred_buffer_location); } @@ -2910,22 +2910,17 @@ void VKGSRender::begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources) if (num_hw_queries > 0) { // We'll need to do some result aggregation using a compute shader. - auto scratch = vk::get_scratch_buffer(*m_current_command_buffer, num_hw_queries * 4); + vk::buffer* scratch = nullptr; // Range latching. Because of how the query pool manages allocations using a stack, we get an inverse sequential set of handles/indices that we can easily group together. // This drastically boosts performance on some drivers like the NVIDIA proprietary one that seems to have a rather high cost for every individual query transer command. 
struct { u32 first, last; } query_range = { umax, 0 }; - bool need_barrier = true; auto copy_query_range_impl = [&]() { - if (need_barrier) + if (!scratch) { - need_barrier = false; - vk::insert_buffer_memory_barrier(*m_current_command_buffer, scratch->value, 0, num_hw_queries * 4, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT); + scratch = vk::get_scratch_buffer(*m_current_command_buffer, num_hw_queries * 4, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); } const auto count = (query_range.last - query_range.first + 1); @@ -2974,7 +2969,7 @@ void VKGSRender::begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources) - ensure(dst_offset <= scratch->size()); + ensure(scratch && dst_offset <= scratch->size()); if (!partial_eval) { diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.h b/rpcs3/Emu/RSX/VK/VKRenderTargets.h index c040e9bca0..7da26f3b5c 100644 --- a/rpcs3/Emu/RSX/VK/VKRenderTargets.h +++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.h @@ -598,7 +598,7 @@ namespace vk const auto transfer_size = surface->get_memory_range().length(); if (transfer_size > max_copy_length || src_offset_in_buffer || surface->is_depth_surface()) { - auto scratch = vk::get_scratch_buffer(cmd, transfer_size * 4); + auto scratch = vk::get_scratch_buffer(cmd, transfer_size * 4, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); dest = scratch; } diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index c5fd4f37e9..c69064dbe4 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -376,7 +376,7 @@ namespace vk const auto min_scratch_size = calculate_working_buffer_size(src_length, src->aspect() | dst->aspect()); // Initialize scratch memory - auto scratch_buf = vk::get_scratch_buffer(cmd, min_scratch_size); + auto scratch_buf = vk::get_scratch_buffer(cmd, min_scratch_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); for (u32 mip_level = 0; 
mip_level < mipmaps; ++mip_level) { @@ -601,7 +601,7 @@ namespace vk const auto dst_w = dst_rect.width(); const auto dst_h = dst_rect.height(); - auto scratch_buf = vk::get_scratch_buffer(cmd, std::max(src_w, dst_w) * std::max(src_h, dst_h) * 4); + auto scratch_buf = vk::get_scratch_buffer(cmd, std::max(src_w, dst_w) * std::max(src_h, dst_h) * 4, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); //1. Copy unscaled to typeless surface VkBufferImageCopy info{}; @@ -1124,7 +1124,7 @@ namespace vk scratch_buf_size += (image_linear_size * 5) / 4; } - scratch_buf = vk::get_scratch_buffer(cmd2, scratch_buf_size); + scratch_buf = vk::get_scratch_buffer(cmd2, scratch_buf_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); buffer_copies.reserve(subresource_layout.size()); } @@ -1183,13 +1183,6 @@ namespace vk { ensure(scratch_buf); - // WAW hazard - complete previous work before executing any transfers - insert_buffer_memory_barrier( - cmd2, scratch_buf->value, 0, scratch_offset, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT); - if (upload_commands.size() > 1) { auto range_ptr = buffer_copies.data(); @@ -1199,8 +1192,9 @@ namespace vk range_ptr += op.second; } } - else if (!buffer_copies.empty()) + else { + ensure(!buffer_copies.empty()); vkCmdCopyBuffer(cmd2, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data()); } @@ -1279,7 +1273,10 @@ namespace vk vk::load_dma(range.start, section_length); // Allocate scratch and prepare for the GPU job - const auto scratch_buf = vk::get_scratch_buffer(cmd, section_length * 3); // 0 = linear data, 1 = padding (deswz), 2 = tiled data + const auto scratch_buf = vk::get_scratch_buffer(cmd, section_length * 3, // 0 = linear data, 1 = padding (deswz), 2 = tiled data + VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 
VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT); + const auto tiled_data_scratch_offset = section_length * 2; const auto linear_data_scratch_offset = 0u; @@ -1304,15 +1301,6 @@ namespace vk .image_bpp = bpp }; - // Pre-Transfer barrier - vk::insert_buffer_memory_barrier( - cmd, - scratch_buf->value, - tiled_data_scratch_offset, section_length, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT - ); - // Transfer VkBufferCopy copy_rgn { @@ -1328,15 +1316,6 @@ namespace vk VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); - // Pre-Compute barrier - vk::insert_buffer_memory_barrier( - cmd, - scratch_buf->value, - linear_data_scratch_offset, static_cast<u32>(width) * height * bpp, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_MEMORY_WRITE_BIT - ); - // Detile vk::get_compute_task<vk::cs_tile_memcpy<vk::RSX_detiler_op::decode>>()->run(cmd, config); diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index 004061d881..a3fc048a27 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -130,9 +130,10 @@ namespace vk dma_sync_region = tiled_region.tile_align(dma_sync_region); } #endif - - auto working_buffer = vk::get_scratch_buffer(cmd, working_buffer_length); u32 result_offset = 0; + auto working_buffer = vk::get_scratch_buffer(cmd, working_buffer_length, + VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT); VkBufferImageCopy region = {}; region.imageSubresource = { src->aspect(), 0, 0, 1 }; @@ -220,7 +221,7 @@ namespace vk // Transfer -> Compute barrier vk::insert_buffer_memory_barrier(cmd, working_buffer->value, dst_offset, dma_sync_region.length(), VK_PIPELINE_STAGE_TRANSFER_BIT, 
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT); + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT); } // Prepare payload @@ -284,8 +285,10 @@ namespace vk if (require_rw_barrier) { vk::insert_buffer_memory_barrier(cmd, working_buffer->value, result_offset, dma_sync_region.length(), - VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); + VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT); } if (rsx_pitch == real_pitch) [[likely]] diff --git a/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp b/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp index 041067bea6..04cfded55c 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp @@ -177,7 +177,7 @@ namespace vk return { scratch_buffer.get(), is_new }; } - vk::buffer* get_scratch_buffer(const vk::command_buffer& cmd, u64 min_required_size, bool zero_memory) + vk::buffer* get_scratch_buffer(const vk::command_buffer& cmd, u64 min_required_size, VkPipelineStageFlags dst_stage_flags, VkAccessFlags dst_access, bool zero_memory) { const auto [buf, init_mem] = get_scratch_buffer(cmd.get_queue_family(), min_required_size); @@ -191,6 +191,12 @@ namespace vk VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT); } + else if (dst_access != VK_ACCESS_NONE) + { + insert_buffer_memory_barrier(cmd, buf->value, 0, min_required_size, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage_flags, + 
VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, dst_access); + } return buf; } diff --git a/rpcs3/Emu/RSX/VK/vkutils/scratch.h b/rpcs3/Emu/RSX/VK/vkutils/scratch.h index 312db68c8a..5b31289462 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/scratch.h +++ b/rpcs3/Emu/RSX/VK/vkutils/scratch.h @@ -6,7 +6,13 @@ namespace vk VkSampler null_sampler(); image_view* null_image_view(const command_buffer& cmd, VkImageViewType type); image* get_typeless_helper(VkFormat format, rsx::format_class format_class, u32 requested_width, u32 requested_height); - buffer* get_scratch_buffer(const command_buffer& cmd, u64 min_required_size, bool zero_memory = false); + + buffer* get_scratch_buffer( + const command_buffer& cmd, + u64 min_required_size, + VkPipelineStageFlags dst_stage_flags, + VkAccessFlags dst_access, + bool zero_memory = false); void clear_scratch_resources(); }