diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp index 5ff80facd..52d0b5527 100644 --- a/src/video_core/buffer_cache/buffer.cpp +++ b/src/video_core/buffer_cache/buffer.cpp @@ -255,4 +255,31 @@ bool StreamBuffer::WaitPendingOperations(u64 requested_upper_bound, bool allow_w return true; } +StreamBufferMapping::StreamBufferMapping(StreamBuffer& stream_buffer, u64 size, u64 alignment, + bool allow_wait) { + const auto [data, offset] = stream_buffer.Map(size, alignment, allow_wait); + if (!data) { + // This happens if the size is too big or no waiting is allowed when it is required + is_temp_buffer = true; + this->buffer = new VideoCore::Buffer(*stream_buffer.instance, *stream_buffer.scheduler, + stream_buffer.usage, 0, AllFlags, size); + this->data = this->buffer->mapped_data.data(); + this->offset = 0; + ASSERT_MSG(this->data, "Failed to map temporary buffer"); + } else { + is_temp_buffer = false; + buffer = &stream_buffer; + this->data = data; + this->offset = offset; + } +} + +StreamBufferMapping::~StreamBufferMapping() { + if (is_temp_buffer) { + ASSERT(buffer); + auto scheduler = buffer->scheduler; + scheduler->DeferOperation([buffer = this->buffer]() mutable { delete buffer; }); + } +} + } // namespace VideoCore diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h index 1f661ba13..675ba943e 100644 --- a/src/video_core/buffer_cache/buffer.h +++ b/src/video_core/buffer_cache/buffer.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include #include #include @@ -210,4 +211,51 @@ private: u64 wait_bound{}; }; +class StreamBufferMapping { +public: + StreamBufferMapping(StreamBuffer& stream_buffer, u64 size, u64 alignment = 0, + bool allow_wait = true); + ~StreamBufferMapping(); + + StreamBufferMapping(const StreamBufferMapping&) = delete; + StreamBufferMapping& operator=(const StreamBufferMapping&) = delete; + + StreamBufferMapping(StreamBufferMapping&& other) + : buffer{std::exchange(other.buffer, nullptr)}, data{std::exchange(other.data, nullptr)}, + offset{std::exchange(other.offset, 0)}, + is_temp_buffer{std::exchange(other.is_temp_buffer, false)} {} + + StreamBufferMapping& operator=(StreamBufferMapping&& other) { + if (this != &other) { + buffer = std::exchange(other.buffer, nullptr); + data = std::exchange(other.data, nullptr); + offset = std::exchange(other.offset, 0); + is_temp_buffer = std::exchange(other.is_temp_buffer, false); + } + return *this; + } + + VideoCore::Buffer* Buffer() const { + return buffer; + } + + u8* Data() const { + return data; + } + + u64 Offset() const { + return offset; + } + + bool TemporaryBuffer() const { + return is_temp_buffer; + } + +private: + VideoCore::Buffer* buffer; + u8* data{}; + u64 offset{}; + bool is_temp_buffer{}; +}; + } // namespace VideoCore diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 7347e99a2..18ab2d7d0 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -73,12 +73,17 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s BufferCache::~BufferCache() = default; -void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) { +void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool download) { if (!IsRegionRegistered(device_addr, size)) { return; } - memory_tracker->InvalidateRegion( - device_addr, size, [this, device_addr, size] { ReadMemory(device_addr, size, true); }); + if (download) { 
+ memory_tracker->InvalidateRegion( + device_addr, size, [this, device_addr, size] { ReadMemory(device_addr, size, true); }); + } else { + memory_tracker->InvalidateRegion(device_addr, size); + gpu_modified_ranges.Subtract(device_addr, size); + } } void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) { @@ -122,11 +127,13 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si download_buffer.Commit(); scheduler.EndRendering(); const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies); - const auto write_data = [&]() { + cmdbuf.copyBuffer(buffer.Handle(), download_buffer.Handle(), copies); + + const auto write_func = [this, buf_addr = buffer.CpuAddr(), copies = std::move(copies), + download, offset, device_addr, size, is_write]() { auto* memory = Core::Memory::Instance(); for (const auto& copy : copies) { - const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset; + const VAddr copy_device_addr = buf_addr + copy.srcOffset; const u64 dst_offset = copy.dstOffset - offset; memory->TryWriteBacking(std::bit_cast(copy_device_addr), download + dst_offset, copy.size); @@ -136,12 +143,67 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si memory_tracker->MarkRegionAsCpuModified(device_addr, size); } }; + if constexpr (async) { - scheduler.DeferOperation(write_data); + scheduler.DeferOperation(write_func); } else { scheduler.Finish(); - write_data(); + write_func(); } + + return; +} + +void BufferCache::ReadEdgeImagePages(const Image& image) { + // It may happen that, after downloading the image and invalidating the region, + // there are GPU modified ranges that would be lost when the CPU re-uploads the data. + // This doesn't change tracker state; the caller is expected to call DownloadImageMemory after this.
+ const VAddr image_addr = image.info.guest_address; + const u64 image_size = image.info.guest_size; + const VAddr image_end = image_addr + image_size; + const VAddr page_start = PageManager::GetPageAddr(image_addr); + const VAddr page_end = PageManager::GetNextPageAddr(image_end - 1); + boost::container::small_vector copies; + u64 total_size_bytes = 0; + const auto [buffer, offset] = ObtainBufferForImage(image_addr, image_size); + const auto add_download = [&](VAddr start, VAddr end) { + const u64 new_offset = start - buffer->CpuAddr(); + const u64 new_size = end - start; + copies.push_back(vk::BufferCopy{ + .srcOffset = new_offset, + .dstOffset = total_size_bytes, + .size = new_size, + }); + // Align up to avoid cache conflicts + constexpr u64 align = 64ULL; + constexpr u64 mask = ~(align - 1ULL); + total_size_bytes += (new_size + align - 1) & mask; + }; + gpu_modified_ranges.ForEachInRange(page_start, image_addr - page_start, add_download); + gpu_modified_ranges.ForEachInRange(image_end, page_end - image_end, add_download); + gpu_modified_ranges.Subtract(page_start, page_end - page_start); + if (total_size_bytes == 0) { + return; + } + const auto [download, download_offset] = download_buffer.Map(total_size_bytes); + for (auto& copy : copies) { + // Modify copies to have the staging offset in mind + copy.dstOffset += download_offset; + } + download_buffer.Commit(); + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.copyBuffer(buffer->Handle(), download_buffer.Handle(), copies); + scheduler.DeferOperation([this, buf_addr = buffer->CpuAddr(), copies = std::move(copies), + download, download_offset, image_addr, image_size]() { + auto* memory = Core::Memory::Instance(); + for (const auto& copy : copies) { + const VAddr copy_device_addr = buf_addr + copy.srcOffset; + const u64 dst_offset = copy.dstOffset - download_offset; + memory->TryWriteBacking(std::bit_cast(copy_device_addr), download + dst_offset, + copy.size); + } + }); } void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) { @@ -203,7 +265,7 @@ void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) { // Map buffers for merged ranges for (auto& range : ranges_merged) { const u64 size = memory->ClampRangeSize(range.base_address, range.GetSize()); - const auto [buffer, offset] = ObtainBuffer(range.base_address, size, false); + const auto [buffer, offset] = ObtainBuffer(range.base_address, size); range.vk_buffer = buffer->buffer; range.offset = offset; } @@ -256,7 +318,7 @@ void BufferCache::BindIndexBuffer(u32 index_offset) { // Bind index buffer. 
const u32 index_buffer_size = regs.num_indices * index_size; - const auto [vk_buffer, offset] = ObtainBuffer(index_address, index_buffer_size, false); + const auto [vk_buffer, offset] = ObtainBuffer(index_address, index_buffer_size); const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.bindIndexBuffer(vk_buffer->Handle(), offset, index_type); } @@ -275,7 +337,8 @@ void BufferCache::FillBuffer(VAddr address, u32 num_bytes, u32 value, bool is_gd if (is_gds) { return &gds_buffer; } - const auto [buffer, offset] = ObtainBuffer(address, num_bytes, true); + const auto [buffer, offset] = + ObtainBuffer(address, num_bytes, ObtainBufferFlags::IsWritten); return buffer; }(); buffer->Fill(buffer->Offset(address), num_bytes, value); @@ -297,20 +360,19 @@ void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, if (src_gds) { return gds_buffer; } - const auto buffer_id = FindBuffer(src, num_bytes); - auto& buffer = slot_buffers[buffer_id]; - SynchronizeBuffer(buffer, src, num_bytes, false, true); - return buffer; + const auto [buffer, offset] = + ObtainBuffer(src, num_bytes, + ObtainBufferFlags::IgnoreStreamBuffer | ObtainBufferFlags::IsTexelBuffer | + ObtainBufferFlags::InvalidateTextureCache); + return *buffer; }(); auto& dst_buffer = [&] -> const Buffer& { if (dst_gds) { return gds_buffer; } - const auto buffer_id = FindBuffer(dst, num_bytes); - auto& buffer = slot_buffers[buffer_id]; - SynchronizeBuffer(buffer, dst, num_bytes, true, true); - gpu_modified_ranges.Add(dst, num_bytes); - return buffer; + const auto [buffer, offset] = ObtainBuffer( + dst, num_bytes, ObtainBufferFlags::IsWritten | ObtainBufferFlags::IsTexelBuffer); + return *buffer; }(); const vk::BufferCopy region = { .srcOffset = src_buffer.Offset(src), @@ -372,10 +434,14 @@ void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, }); } -std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written, - bool is_texel_buffer, BufferId buffer_id) { +std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, + ObtainBufferFlags flags, BufferId buffer_id) { // For read-only buffers use device local stream buffer to reduce renderpass breaks. 
- if (!is_written && size <= CACHING_PAGESIZE && !IsRegionGpuModified(device_addr, size)) { + const bool is_written = True(flags & ObtainBufferFlags::IsWritten); + const bool is_texel_buffer = True(flags & ObtainBufferFlags::IsTexelBuffer); + const bool skip_stream_buffer = True(flags & ObtainBufferFlags::IgnoreStreamBuffer); + if (!is_written && !skip_stream_buffer && size <= CACHING_PAGESIZE && + !IsRegionGpuModified(device_addr, size)) { const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment()); return {&stream_buffer, offset}; } @@ -383,9 +449,13 @@ std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b buffer_id = FindBuffer(device_addr, size); } Buffer& buffer = slot_buffers[buffer_id]; - SynchronizeBuffer(buffer, device_addr, size, is_written, is_texel_buffer); + const bool img_synced = + SynchronizeBuffer(buffer, device_addr, size, is_written, is_texel_buffer); + if (img_synced && True(flags & ObtainBufferFlags::InvalidateTextureCache)) { + texture_cache.InvalidateMemoryFromGPU(device_addr, size); + } if (is_written) { - gpu_modified_ranges.Add(device_addr, size); + MarkRegionAsGpuModified(device_addr, size); } return {&buffer, buffer.Offset(device_addr)}; } @@ -401,7 +471,7 @@ std::pair BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 si } // If some buffer within was GPU modified create a full buffer to avoid losing GPU data. if (IsRegionGpuModified(gpu_addr, size)) { - return ObtainBuffer(gpu_addr, size, false, false); + return ObtainBuffer(gpu_addr, size); } // In all other cases, just do a CPU copy to the staging buffer. const auto [data, offset] = staging_buffer.Map(size, 16); @@ -423,6 +493,12 @@ bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) { return memory_tracker->IsRegionGpuModified(addr, size); } +void BufferCache::MarkRegionAsGpuModified(VAddr addr, size_t size) { + gpu_modified_ranges.Add(addr, size); + memory_tracker->MarkRegionAsGpuModified(addr, size); + texture_cache.MarkAsMaybeReused(addr, size); +} + BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) { if (device_addr == 0) { return NULL_BUFFER_ID; @@ -640,6 +716,7 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, size_t total_size_bytes = 0; VAddr buffer_start = buffer.CpuAddr(); vk::Buffer src_buffer = VK_NULL_HANDLE; + TouchBuffer(buffer); memory_tracker->ForEachUploadRange( device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) { @@ -682,7 +759,6 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, .bufferMemoryBarrierCount = 1, .pBufferMemoryBarriers = &post_barrier, }); - TouchBuffer(buffer); } if (is_texel_buffer && !is_written) { return SynchronizeBufferFromImage(buffer, device_addr, size); @@ -852,7 +928,6 @@ void BufferCache::RunGarbageCollector() { } --max_deletions; Buffer& buffer = slot_buffers[buffer_id]; - // InvalidateMemory(buffer.CpuAddr(), buffer.SizeBytes()); DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes(), true); DeleteBuffer(buffer_id); }; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 73d70704e..ad5fa3179 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -4,6 +4,7 @@ #pragma once #include +#include "common/enum.h" #include "common/lru_cache.h" #include "common/slot_vector.h" #include "common/types.h" @@ -11,6 +12,7 @@ #include "video_core/buffer_cache/fault_manager.h" #include 
"video_core/buffer_cache/range_set.h" #include "video_core/multi_level_page_table.h" +#include "video_core/texture_cache/image.h" namespace AmdGpu { struct Liverpool; @@ -34,6 +36,15 @@ class TextureCache; class MemoryTracker; class PageManager; +enum class ObtainBufferFlags { + None = 0, + IsWritten = 1 << 0, + IsTexelBuffer = 1 << 1, + IgnoreStreamBuffer = 1 << 2, + InvalidateTextureCache = 1 << 3, +}; +DECLARE_ENUM_FLAG_OPERATORS(ObtainBufferFlags) + class BufferCache { public: static constexpr u32 CACHING_PAGEBITS = 14; @@ -106,11 +117,14 @@ public: } /// Invalidates any buffer in the logical page range. - void InvalidateMemory(VAddr device_addr, u64 size); + void InvalidateMemory(VAddr device_addr, u64 size, bool download); /// Flushes any GPU modified buffer in the logical page range back to CPU memory. void ReadMemory(VAddr device_addr, u64 size, bool is_write = false); + /// Flushes GPU modified ranges of the uncovered part of the edge pages of an image. + void ReadEdgeImagePages(const Image& image); + /// Binds host vertex buffers for the current draw. void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline); @@ -124,9 +138,9 @@ public: void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds); /// Obtains a buffer for the specified region. - [[nodiscard]] std::pair ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written, - bool is_texel_buffer = false, - BufferId buffer_id = {}); + [[nodiscard]] std::pair ObtainBuffer( + VAddr gpu_addr, u32 size, ObtainBufferFlags flags = ObtainBufferFlags::None, + BufferId buffer_id = {}); /// Attempts to obtain a buffer without modifying the cache contents. [[nodiscard]] std::pair ObtainBufferForImage(VAddr gpu_addr, u32 size); @@ -140,6 +154,9 @@ public: /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + /// Mark region as modified from the GPU + void MarkRegionAsGpuModified(VAddr addr, size_t size); + /// Return buffer id for the specified region BufferId FindBuffer(VAddr device_addr, u32 size); diff --git a/src/video_core/buffer_cache/memory_tracker.h b/src/video_core/buffer_cache/memory_tracker.h index a093be8dd..9c863fe0e 100644 --- a/src/video_core/buffer_cache/memory_tracker.h +++ b/src/video_core/buffer_cache/memory_tracker.h @@ -54,6 +54,15 @@ public: }); } + void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages(dirty_cpu_addr, query_size, + [this](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + /// Unmark region as modified from the host GPU void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { IteratePages(dirty_cpu_addr, query_size, @@ -78,6 +87,8 @@ public: manager->template IsRegionModified(offset, size)) { return true; } + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); manager->template ChangeRegionState( manager->GetCpuAddr() + offset, size); return false; @@ -88,6 +99,20 @@ public: }); } + /// Removes all protection from a page (lose any non downloaded GPU modifications) + void InvalidateRegion(VAddr cpu_addr, u64 size) noexcept { + IteratePages(cpu_addr, size, [](RegionManager* manager, u64 offset, size_t size) { + // Perform both the GPU modification check and CPU state change with the lock + // in case we are racing with GPU thread trying to mark the page as GPU + // modified. 
+ std::scoped_lock lk{manager->lock}; + manager->template ChangeRegionState(manager->GetCpuAddr() + offset, + size); + manager->template ChangeRegionState(manager->GetCpuAddr() + offset, + size); + }); + } + /// Call 'func' for each CPU modified range and unmark those pages as CPU modified void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func, auto&& on_upload) { diff --git a/src/video_core/buffer_cache/range_set.h b/src/video_core/buffer_cache/range_set.h index 5c8e78c7c..b86801bd7 100644 --- a/src/video_core/buffer_cache/range_set.h +++ b/src/video_core/buffer_cache/range_set.h @@ -72,7 +72,7 @@ struct RangeSet { template void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const { - if (m_ranges_set.empty()) { + if (m_ranges_set.empty() || size == 0) { return; } const VAddr start_address = base_addr; @@ -176,7 +176,7 @@ public: template void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const { - if (m_ranges_map.empty()) { + if (m_ranges_map.empty() || size == 0) { return; } const VAddr start_address = base_addr; @@ -280,7 +280,7 @@ public: template void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const { - if (m_ranges_map.empty()) { + if (m_ranges_map.empty() || size == 0) { return; } const VAddr start_address = base_addr; diff --git a/src/video_core/page_manager.h b/src/video_core/page_manager.h index fb53f7c98..d1f90a7a2 100644 --- a/src/video_core/page_manager.h +++ b/src/video_core/page_manager.h @@ -53,6 +53,10 @@ public: return Common::AlignUp(addr + 1, PM_PAGE_SIZE); } + static constexpr size_t GetPageSize() { + return PAGE_SIZE; + } + private: struct Impl; std::unique_ptr impl; diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 7a8a906d5..302cedd62 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -431,7 +431,8 @@ public: /// Returns the total memory budget available to the device. [[nodiscard]] u64 GetTotalMemoryBudget() const { - return total_memory_budget; + return 2_GB; // TODO: this is for better garbage collection testing, temporary + // return total_memory_budget; } /// Determines if a format is supported for a set of feature flags. 
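The new ObtainBufferFlags added to buffer_cache.h gets its bitwise operators from DECLARE_ENUM_FLAG_OPERATORS in common/enum.h, and call sites such as Rasterizer::BindBuffers below build the flag set from per-descriptor booleans before calling ObtainBuffer. The following is a minimal, self-contained sketch of that flag-enum pattern, not part of the patch: the operator definitions stand in for what the macro presumably generates, and True() mirrors the helper the patch uses to test bits.

```cpp
#include <cstdint>
#include <type_traits>

// Standalone sketch of the flag-enum pattern behind ObtainBufferFlags. The real code
// gets its operators from DECLARE_ENUM_FLAG_OPERATORS (common/enum.h); the definitions
// here are stand-ins so the example compiles on its own.
enum class ObtainBufferFlags : std::uint32_t {
    None = 0,
    IsWritten = 1 << 0,
    IsTexelBuffer = 1 << 1,
    IgnoreStreamBuffer = 1 << 2,
    InvalidateTextureCache = 1 << 3,
};

[[nodiscard]] constexpr ObtainBufferFlags operator|(ObtainBufferFlags a, ObtainBufferFlags b) {
    using U = std::underlying_type_t<ObtainBufferFlags>;
    return static_cast<ObtainBufferFlags>(static_cast<U>(a) | static_cast<U>(b));
}
[[nodiscard]] constexpr ObtainBufferFlags operator&(ObtainBufferFlags a, ObtainBufferFlags b) {
    using U = std::underlying_type_t<ObtainBufferFlags>;
    return static_cast<ObtainBufferFlags>(static_cast<U>(a) & static_cast<U>(b));
}
constexpr ObtainBufferFlags& operator|=(ObtainBufferFlags& a, ObtainBufferFlags b) {
    return a = a | b;
}

// Mirrors the True() helper the patch uses to test whether any flag bit is set.
[[nodiscard]] constexpr bool True(ObtainBufferFlags f) {
    return static_cast<std::underlying_type_t<ObtainBufferFlags>>(f) != 0;
}

int main() {
    // How a call site like BindBuffers builds its flag set from per-descriptor booleans.
    ObtainBufferFlags flags = ObtainBufferFlags::None;
    const bool is_written = true;
    const bool is_formatted = false;
    if (is_written) {
        flags |= ObtainBufferFlags::IsWritten;
    }
    if (is_formatted) {
        flags |= ObtainBufferFlags::IsTexelBuffer;
    }
    return True(flags & ObtainBufferFlags::IsWritten) ? 0 : 1;
}
```

Because the flags parameter defaults to ObtainBufferFlags::None, the many call sites that previously passed a trailing `false` can now simply drop the argument, as the rasterizer and HLE shader changes below do.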
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 800941fe3..4fc228267 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -260,12 +260,12 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 } const auto& [buffer, base] = - buffer_cache.ObtainBuffer(arg_address + offset, stride * max_count, false); + buffer_cache.ObtainBuffer(arg_address + offset, stride * max_count); VideoCore::Buffer* count_buffer{}; u32 count_base{}; if (count_address != 0) { - std::tie(count_buffer, count_base) = buffer_cache.ObtainBuffer(count_address, 4, false); + std::tie(count_buffer, count_base) = buffer_cache.ObtainBuffer(count_address, 4); } pipeline->BindResources(set_writes, buffer_barriers, push_data); @@ -346,7 +346,7 @@ void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) { return; } - const auto [buffer, base] = buffer_cache.ObtainBuffer(address + offset, size, false); + const auto [buffer, base] = buffer_cache.ObtainBuffer(address + offset, size); scheduler.EndRendering(); pipeline->BindResources(set_writes, buffer_barriers, push_data); @@ -629,8 +629,15 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding buffer_infos.emplace_back(null_buffer.Handle(), 0, VK_WHOLE_SIZE); } } else { - const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer( - vsharp.base_address, size, desc.is_written, desc.is_formatted, buffer_id); + VideoCore::ObtainBufferFlags flags = {}; + if (desc.is_written) { + flags |= VideoCore::ObtainBufferFlags::IsWritten; + } + if (desc.is_formatted) { + flags |= VideoCore::ObtainBufferFlags::IsTexelBuffer; + } + const auto [vk_buffer, offset] = + buffer_cache.ObtainBuffer(vsharp.base_address, size, flags, buffer_id); const u32 offset_aligned = Common::AlignDown(offset, alignment); const u32 adjust = offset - offset_aligned; ASSERT(adjust % 4 == 0); @@ -1031,7 +1038,7 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) { // Not GPU mapped memory, can skip invalidation logic entirely. return false; } - buffer_cache.InvalidateMemory(addr, size); + buffer_cache.InvalidateMemory(addr, size, true); texture_cache.InvalidateMemory(addr, size); return true; } @@ -1069,7 +1076,7 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) { } void Rasterizer::UnmapMemory(VAddr addr, u64 size) { - buffer_cache.InvalidateMemory(addr, size); + buffer_cache.InvalidateMemory(addr, size, true); texture_cache.UnmapMemory(addr, size); page_manager.OnGpuUnmap(addr, size); { diff --git a/src/video_core/renderer_vulkan/vk_shader_hle.cpp b/src/video_core/renderer_vulkan/vk_shader_hle.cpp index 61941892d..b3a0dabad 100644 --- a/src/video_core/renderer_vulkan/vk_shader_hle.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_hle.cpp @@ -95,9 +95,10 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Compute // Obtain buffers for the total source and destination ranges. 
const auto [src_buf, src_buf_offset] = buffer_cache.ObtainBuffer( - src_buf_sharp.base_address + src_offset_min, src_offset_max - src_offset_min, false); + src_buf_sharp.base_address + src_offset_min, src_offset_max - src_offset_min); const auto [dst_buf, dst_buf_offset] = buffer_cache.ObtainBuffer( - dst_buf_sharp.base_address + dst_offset_min, dst_offset_max - dst_offset_min, true); + dst_buf_sharp.base_address + dst_offset_min, dst_offset_max - dst_offset_min, + VideoCore::ObtainBufferFlags::IgnoreStreamBuffer); // Apply found buffer base. const auto vk_copies = std::span{copies}.subspan(batch_start, batch_end - batch_start); @@ -117,6 +118,14 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Compute vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {}); + // Mark destination regions as GPU modified. + for (u32 i = 0; i < cs_program.dim_x; i++) { + const auto& [dst_idx, src_idx, end] = ctl_buf[i]; + const VAddr dst_addr = dst_buf_sharp.base_address + (dst_idx * buf_stride); + const u32 size = (end + 1) * buf_stride; + buffer_cache.MarkRegionAsGpuModified(dst_addr, size); + } + return true; } diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index 0bf471dce..a3db31226 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -28,9 +28,11 @@ enum ImageFlagBits : u32 { Empty = 0, MaybeCpuDirty = 1 << 0, ///< The page this image is in was touched before the image address CpuDirty = 1 << 1, ///< Contents have been modified from the CPU - GpuDirty = 1 << 2, ///< Contents have been modified from the GPU (valid data in buffer cache) + GpuDirty = + 1 << 2, ///< Image contents have been modified from the GPU (valid data in buffer cache) Dirty = MaybeCpuDirty | CpuDirty | GpuDirty, GpuModified = 1 << 3, ///< Contents have been modified from the GPU + MaybeReused = 1 << 4, ///< Memory region containing this image was maybe reused by the GPU Registered = 1 << 6, ///< True when the image is registered Picked = 1 << 7, ///< Temporary flag to mark the image as picked }; diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 163712756..6006712a2 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -23,9 +23,9 @@ static constexpr u64 NumFramesBeforeRemoval = 32; TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, AmdGpu::Liverpool* liverpool_, BufferCache& buffer_cache_, - PageManager& tracker_) + PageManager& page_manager_) : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, - buffer_cache{buffer_cache_}, tracker{tracker_}, blit_helper{instance, scheduler}, + buffer_cache{buffer_cache_}, page_manager{page_manager_}, blit_helper{instance, scheduler}, tile_manager{instance, scheduler, buffer_cache.GetUtilityBuffer(MemoryUsage::Stream)} { // Create basic null image at fixed image ID. 
const auto null_id = GetNullImage(vk::Format::eR8G8B8A8Unorm); @@ -85,48 +85,70 @@ ImageId TextureCache::GetNullImage(const vk::Format format) { void TextureCache::ProcessDownloadImages() { for (const ImageId image_id : download_images) { - DownloadImageMemory(image_id); + DownloadImageMemory(image_id); } download_images.clear(); } +template void TextureCache::DownloadImageMemory(ImageId image_id) { Image& image = slot_images[image_id]; if (False(image.flags & ImageFlagBits::GpuModified)) { return; } auto& download_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::Download); - const u32 download_size = image.info.pitch * image.info.size.height * - image.info.resources.layers * (image.info.num_bits / 8); - ASSERT(download_size <= image.info.guest_size); - const auto [download, offset] = download_buffer.Map(download_size); - download_buffer.Commit(); - const vk::BufferImageCopy image_download = { - .bufferOffset = offset, - .bufferRowLength = image.info.pitch, - .bufferImageHeight = image.info.size.height, - .imageSubresource = - { - .aspectMask = image.info.props.is_depth ? vk::ImageAspectFlagBits::eDepth - : vk::ImageAspectFlagBits::eColor, - .mipLevel = 0, + const auto image_addr = image.info.guest_address; + const auto image_size = image.info.guest_size; + const auto image_mips = image.info.resources.levels; + u32 copy_size = 0; + boost::container::small_vector buffer_copies; + for (u32 mip = 0; mip < image_mips; ++mip) { + const auto& width = std::max(image.info.size.width >> mip, 1u); + const auto& height = std::max(image.info.size.height >> mip, 1u); + const auto& depth = + image.info.props.is_volume ? std::max(image.info.size.depth >> mip, 1u) : 1u; + const auto [mip_size, mip_pitch, mip_height, mip_offset] = image.info.mips_layout[mip]; + const u32 extent_width = mip_pitch ? std::min(mip_pitch, width) : width; + const u32 extent_height = mip_height ? 
std::min(mip_height, height) : height; + buffer_copies.push_back(vk::BufferImageCopy{ + .bufferOffset = mip_offset, + .bufferRowLength = mip_pitch, + .bufferImageHeight = mip_height, + .imageSubresource{ + .aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil, + .mipLevel = mip, .baseArrayLayer = 0, .layerCount = image.info.resources.layers, }, - .imageOffset = {0, 0, 0}, - .imageExtent = {image.info.size.width, image.info.size.height, 1}, - }; - scheduler.EndRendering(); - const auto cmdbuf = scheduler.CommandBuffer(); - image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); - cmdbuf.copyImageToBuffer(image.GetImage(), vk::ImageLayout::eTransferSrcOptimal, - download_buffer.Handle(), image_download); - - scheduler.DeferPriorityOperation( - [this, device_addr = image.info.guest_address, download, download_size] { - Core::Memory::Instance()->TryWriteBacking(std::bit_cast(device_addr), download, - download_size); + .imageOffset = {0, 0, 0}, + .imageExtent = {extent_width, extent_height, depth}, }); + copy_size += mip_size; + } + if (buffer_copies.empty()) { + return; + } + StreamBufferMapping mapping(download_buffer, image_size); + download_buffer.Commit(); + scheduler.EndRendering(); + image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + tile_manager.TileImage(image, buffer_copies, mapping.Buffer()->Handle(), mapping.Offset(), + copy_size); + + const auto operation = [this, device_addr = image.info.guest_address, download = mapping.Data(), + image_size] { + Core::Memory::Instance()->TryWriteBacking(std::bit_cast(device_addr), download, + image_size); + if constexpr (!priority) { + buffer_cache.InvalidateMemory(device_addr, image_size, false); + } + }; + + if constexpr (priority) { + scheduler.DeferPriorityOperation(std::move(operation)); + } else { + scheduler.DeferOperation(std::move(operation)); + } } void TextureCache::MarkAsMaybeDirty(ImageId image_id, Image& image) { @@ -183,6 +205,13 @@ void TextureCache::InvalidateMemoryFromGPU(VAddr address, size_t max_size) { }); } +void TextureCache::MarkAsMaybeReused(VAddr addr, size_t size) { + std::scoped_lock lock{mutex}; + ForEachImageInRegion(addr, size, [&](ImageId image_id, Image& image) { + image.flags |= ImageFlagBits::MaybeReused; + }); +} + void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) { std::scoped_lock lk{mutex}; @@ -516,6 +545,7 @@ ImageId TextureCache::ExpandImage(const ImageInfo& info, ImageId image_id) { TrackImage(new_image_id); new_image.flags &= ~ImageFlagBits::Dirty; + new_image.flags |= src_image.flags & ImageFlagBits::GpuModified; return new_image_id; } @@ -621,12 +651,15 @@ ImageId TextureCache::FindImageFromRange(VAddr address, size_t size, bool ensure if (image_ids.size() == 1) { // Sometimes image size might not exactly match with requested buffer size // If we only found 1 candidate image use it without too many questions. 
+ Image& image = slot_images[image_ids[0]]; + TouchImage(image); return image_ids.back(); } if (!image_ids.empty()) { for (s32 i = 0; i < image_ids.size(); ++i) { Image& image = slot_images[image_ids[i]]; if (image.info.guest_size == size) { + TouchImage(image); return image_ids[i]; } } @@ -863,7 +896,7 @@ void TextureCache::TrackImage(ImageId image_id) { // Re-track the whole image image.track_addr = image_begin; image.track_addr_end = image_end; - tracker.UpdatePageWatchers<1>(image_begin, image.info.guest_size); + page_manager.UpdatePageWatchers<1>(image_begin, image.info.guest_size); } else { if (image_begin < image.track_addr) { TrackImageHead(image_id); @@ -886,7 +919,7 @@ void TextureCache::TrackImageHead(ImageId image_id) { ASSERT(image.track_addr != 0 && image_begin < image.track_addr); const auto size = image.track_addr - image_begin; image.track_addr = image_begin; - tracker.UpdatePageWatchers<1>(image_begin, size); + page_manager.UpdatePageWatchers<1>(image_begin, size); } void TextureCache::TrackImageTail(ImageId image_id) { @@ -902,7 +935,7 @@ void TextureCache::TrackImageTail(ImageId image_id) { const auto addr = image.track_addr_end; const auto size = image_end - image.track_addr_end; image.track_addr_end = image_end; - tracker.UpdatePageWatchers<1>(addr, size); + page_manager.UpdatePageWatchers<1>(addr, size); } void TextureCache::UntrackImage(ImageId image_id) { @@ -915,7 +948,7 @@ void TextureCache::UntrackImage(ImageId image_id) { image.track_addr = 0; image.track_addr_end = 0; if (size != 0) { - tracker.UpdatePageWatchers(addr, size); + page_manager.UpdatePageWatchers(addr, size); } } @@ -925,7 +958,7 @@ void TextureCache::UntrackImageHead(ImageId image_id) { if (!image.IsTracked() || image_begin < image.track_addr) { return; } - const auto addr = tracker.GetNextPageAddr(image_begin); + const auto addr = page_manager.GetNextPageAddr(image_begin); const auto size = addr - image_begin; image.track_addr = addr; if (image.track_addr == image.track_addr_end) { @@ -934,7 +967,7 @@ void TextureCache::UntrackImageHead(ImageId image_id) { // Cehck its hash later. MarkAsMaybeDirty(image_id, image); } - tracker.UpdatePageWatchers(image_begin, size); + page_manager.UpdatePageWatchers(image_begin, size); } void TextureCache::UntrackImageTail(ImageId image_id) { @@ -944,7 +977,7 @@ void TextureCache::UntrackImageTail(ImageId image_id) { return; } ASSERT(image.track_addr_end != 0); - const auto addr = tracker.GetPageAddr(image_end); + const auto addr = page_manager.GetPageAddr(image_end); const auto size = image_end - addr; image.track_addr_end = addr; if (image.track_addr == image.track_addr_end) { @@ -953,7 +986,7 @@ void TextureCache::UntrackImageTail(ImageId image_id) { // Cehck its hash later. MarkAsMaybeDirty(image_id, image); } - tracker.UpdatePageWatchers(addr, size); + page_manager.UpdatePageWatchers(addr, size); } void TextureCache::RunGarbageCollector() { @@ -971,6 +1004,7 @@ void TextureCache::RunGarbageCollector() { bool aggresive = false; u64 ticks_to_destroy = 0; size_t num_deletions = 0; + boost::container::small_vector download_pending; const auto configure = [&](bool allow_aggressive) { pressured = total_used_memory >= pressure_gc_memory; @@ -985,19 +1019,19 @@ void TextureCache::RunGarbageCollector() { } --num_deletions; auto& image = slot_images[image_id]; - const bool download = image.SafeToDownload(); - const bool tiled = image.info.IsTiled(); - if (tiled && download) { - // This is a workaround for now. We can't handle non-linear image downloads. 
- return false; - } + const bool download = + image.SafeToDownload() && False(image.flags & ImageFlagBits::MaybeReused); if (download && !pressured) { return false; } if (download) { - DownloadImageMemory(image_id); + download_pending.push_back(image_id); + buffer_cache.ReadEdgeImagePages(image); + UntrackImage(image_id); + UnregisterImage(image_id); + } else { + FreeImage(image_id); } - FreeImage(image_id); if (total_used_memory < critical_gc_memory) { if (aggresive) { num_deletions >>= 2; @@ -1021,10 +1055,26 @@ void TextureCache::RunGarbageCollector() { configure(true); lru_cache.ForEachItemBelow(gc_tick - ticks_to_destroy, clean_up); } + + for (const auto& image_id : download_pending) { + DownloadImageMemory(image_id); + DeleteImage(image_id); + } + + if (!download_pending.empty()) { + // We need to make downloads synchronous. It is possible that the contents + // of the image are requested before they are downloaded in which case + // outdated buffer cache contents are used instead. + scheduler.Finish(); + scheduler.PopPendingOperations(); + } } -void TextureCache::TouchImage(const Image& image) { +void TextureCache::TouchImage(Image& image) { lru_cache.Touch(image.lru_id, gc_tick); + + // Image is still valid + image.flags &= ~ImageFlagBits::MaybeReused; } void TextureCache::DeleteImage(ImageId image_id) { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 141ac938f..713a6a7b8 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -3,12 +3,9 @@ #pragma once -#include #include -#include #include #include -#include #include #include "common/lru_cache.h" @@ -77,7 +74,8 @@ public: public: TextureCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, - AmdGpu::Liverpool* liverpool, BufferCache& buffer_cache, PageManager& tracker); + AmdGpu::Liverpool* liverpool, BufferCache& buffer_cache, + PageManager& page_manager); ~TextureCache(); TileManager& GetTileManager() noexcept { @@ -90,6 +88,9 @@ public: /// Marks an image as dirty if it exists at the provided address. void InvalidateMemoryFromGPU(VAddr address, size_t max_size); + /// Marks an image as maybe reused if it exists within the provided range. + void MarkAsMaybeReused(VAddr addr, size_t size); + /// Evicts any images that overlap the unmapped range. void UnmapMemory(VAddr cpu_addr, size_t size); @@ -255,6 +256,7 @@ private: ImageId GetNullImage(vk::Format format); /// Copies image memory back to CPU. + template void DownloadImageMemory(ImageId image_id); /// Thread function for copying downloaded images out to CPU memory. @@ -285,7 +287,7 @@ private: void DeleteImage(ImageId image_id); /// Touch the image in the LRU cache. - void TouchImage(const Image& image); + void TouchImage(Image& image); void FreeImage(ImageId image_id) { UntrackImage(image_id); @@ -298,7 +300,7 @@ private: Vulkan::Scheduler& scheduler; AmdGpu::Liverpool* liverpool; BufferCache& buffer_cache; - PageManager& tracker; + PageManager& page_manager; BlitHelper blit_helper; TileManager tile_manager; Common::SlotVector slot_images;
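The texture cache changes above turn DownloadImageMemory into `template <bool priority>`, so the same body can defer its CPU write-back either through DeferPriorityOperation or DeferOperation, chosen at compile time with `if constexpr`. Below is a small self-contained sketch of that dispatch, not part of the patch: the Scheduler is a stand-in for Vulkan::Scheduler, and which call sites instantiate which variant is not spelled out here, so main() is purely illustrative.

```cpp
#include <functional>
#include <utility>
#include <vector>

// Stand-in for Vulkan::Scheduler: records deferred operations in two queues.
struct Scheduler {
    std::vector<std::function<void()>> priority_ops;
    std::vector<std::function<void()>> normal_ops;
    void DeferPriorityOperation(std::function<void()> op) {
        priority_ops.push_back(std::move(op));
    }
    void DeferOperation(std::function<void()> op) {
        normal_ops.push_back(std::move(op));
    }
};

// Compile-time choice of deferral queue, mirroring the shape of the patched
// TextureCache::DownloadImageMemory<priority>().
template <bool priority>
void DownloadImageMemory(Scheduler& scheduler, std::function<void()> write_back) {
    if constexpr (priority) {
        scheduler.DeferPriorityOperation(std::move(write_back));
    } else {
        scheduler.DeferOperation(std::move(write_back));
    }
}

int main() {
    Scheduler scheduler;
    int written = 0;
    DownloadImageMemory<true>(scheduler, [&] { ++written; });  // priority write-back
    DownloadImageMemory<false>(scheduler, [&] { ++written; }); // regular deferred write-back
    for (auto& op : scheduler.priority_ops) op();
    for (auto& op : scheduler.normal_ops) op();
    return written == 2 ? 0 : 1;
}
```

In the patch itself, the non-priority path additionally calls buffer_cache.InvalidateMemory(device_addr, image_size, false) inside the deferred operation, so once the image bytes have been written back the stale buffer-cache state for that region is dropped without triggering another download.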