// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include <algorithm>

#include "shader_recompiler/frontend/control_flow_graph.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/breadth_first_search.h"
#include "shader_recompiler/ir/ir_emitter.h"
#include "shader_recompiler/ir/operand_helper.h"
#include "shader_recompiler/ir/program.h"
#include "shader_recompiler/ir/reinterpret.h"
#include "shader_recompiler/profile.h"
#include "video_core/amdgpu/resource.h"

namespace Shader::Optimization {
namespace {

using SharpLocation = u32;

bool IsBufferAtomic(const IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::BufferAtomicIAdd32:
    case IR::Opcode::BufferAtomicIAdd64:
    case IR::Opcode::BufferAtomicISub32:
    case IR::Opcode::BufferAtomicSMin32:
    case IR::Opcode::BufferAtomicSMin64:
    case IR::Opcode::BufferAtomicUMin32:
    case IR::Opcode::BufferAtomicUMin64:
    case IR::Opcode::BufferAtomicFMin32:
    case IR::Opcode::BufferAtomicSMax32:
    case IR::Opcode::BufferAtomicSMax64:
    case IR::Opcode::BufferAtomicUMax32:
    case IR::Opcode::BufferAtomicUMax64:
    case IR::Opcode::BufferAtomicFMax32:
    case IR::Opcode::BufferAtomicInc32:
    case IR::Opcode::BufferAtomicDec32:
    case IR::Opcode::BufferAtomicAnd32:
    case IR::Opcode::BufferAtomicOr32:
    case IR::Opcode::BufferAtomicXor32:
    case IR::Opcode::BufferAtomicSwap32:
    case IR::Opcode::BufferAtomicCmpSwap32:
    case IR::Opcode::BufferAtomicFCmpSwap32:
        return true;
    default:
        return false;
    }
}

bool IsBufferStore(const IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::StoreBufferU8:
    case IR::Opcode::StoreBufferU16:
    case IR::Opcode::StoreBufferU32:
    case IR::Opcode::StoreBufferU32x2:
    case IR::Opcode::StoreBufferU32x3:
    case IR::Opcode::StoreBufferU32x4:
    case IR::Opcode::StoreBufferU64:
    case IR::Opcode::StoreBufferF32:
    case IR::Opcode::StoreBufferF32x2:
    case IR::Opcode::StoreBufferF32x3:
    case IR::Opcode::StoreBufferF32x4:
    case IR::Opcode::StoreBufferFormatF32:
        return true;
    default:
        return IsBufferAtomic(inst);
    }
}

bool IsBufferInstruction(const IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::LoadBufferU8:
    case IR::Opcode::LoadBufferU16:
    case IR::Opcode::LoadBufferU32:
    case IR::Opcode::LoadBufferU32x2:
    case IR::Opcode::LoadBufferU32x3:
    case IR::Opcode::LoadBufferU32x4:
    case IR::Opcode::LoadBufferU64:
    case IR::Opcode::LoadBufferF32:
    case IR::Opcode::LoadBufferF32x2:
    case IR::Opcode::LoadBufferF32x3:
    case IR::Opcode::LoadBufferF32x4:
    case IR::Opcode::LoadBufferFormatF32:
    case IR::Opcode::ReadConstBuffer:
        return true;
    default:
        return IsBufferStore(inst);
    }
}

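// Returns the log2 of the element size in bytes implied by the opcode (and, for
// typed format accesses, by the data format). The shift is used below to convert
// byte offsets into element offsets.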
u32 BufferAddressShift(const IR::Inst& inst, AmdGpu::DataFormat data_format) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::LoadBufferU8:
    case IR::Opcode::StoreBufferU8:
        return 0;
    case IR::Opcode::LoadBufferU16:
    case IR::Opcode::StoreBufferU16:
        return 1;
    case IR::Opcode::LoadBufferU64:
    case IR::Opcode::StoreBufferU64:
    case IR::Opcode::BufferAtomicIAdd64:
    case IR::Opcode::BufferAtomicSMax64:
    case IR::Opcode::BufferAtomicSMin64:
    case IR::Opcode::BufferAtomicUMax64:
    case IR::Opcode::BufferAtomicUMin64:
        return 3;
    case IR::Opcode::LoadBufferFormatF32:
    case IR::Opcode::StoreBufferFormatF32: {
        switch (data_format) {
        case AmdGpu::DataFormat::Format8:
            return 0;
        case AmdGpu::DataFormat::Format8_8:
        case AmdGpu::DataFormat::Format16:
            return 1;
        case AmdGpu::DataFormat::Format8_8_8_8:
        case AmdGpu::DataFormat::Format16_16:
        case AmdGpu::DataFormat::Format10_11_11:
        case AmdGpu::DataFormat::Format2_10_10_10:
        case AmdGpu::DataFormat::Format16_16_16_16:
        case AmdGpu::DataFormat::Format32:
        case AmdGpu::DataFormat::Format32_32:
        case AmdGpu::DataFormat::Format32_32_32:
        case AmdGpu::DataFormat::Format32_32_32_32:
            return 2;
        default:
            return 0;
        }
        break;
    }
    case IR::Opcode::ReadConstBuffer:
        // Provided address is already in dwords
        return 0;
    default:
        return 2;
    }
}

class Descriptors {
public:
    explicit Descriptors(Info& info_)
        : info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images},
          sampler_resources{info_.samplers}, fmask_resources(info_.fmasks) {}

    u32 Add(const BufferResource& desc) {
        const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) {
            return desc.sharp_idx == existing.sharp_idx &&
                   desc.inline_cbuf == existing.inline_cbuf &&
                   desc.buffer_type == existing.buffer_type;
        })};
        auto& buffer = buffer_resources[index];
        buffer.used_types |= desc.used_types;
        buffer.is_written |= desc.is_written;
        buffer.is_formatted |= desc.is_formatted;
        return index;
    }

    u32 Add(const ImageResource& desc) {
        const u32 index{Add(image_resources, desc, [&desc](const auto& existing) {
            return desc.sharp_idx == existing.sharp_idx && desc.is_array == existing.is_array &&
                   desc.mip_fallback_mode == existing.mip_fallback_mode &&
                   desc.constant_mip_index == existing.constant_mip_index;
        })};
        auto& image = image_resources[index];
        image.is_atomic |= desc.is_atomic;
        image.is_written |= desc.is_written;
        return index;
    }

    u32 Add(const SamplerResource& desc) {
        const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) {
            return desc.sharp_idx == existing.sharp_idx &&
                   desc.is_inline_sampler == existing.is_inline_sampler &&
                   desc.inline_sampler == existing.inline_sampler;
        })};
        return index;
    }

    u32 Add(const FMaskResource& desc) {
        u32 index = Add(fmask_resources, desc, [&desc](const auto& existing) {
            return desc.sharp_idx == existing.sharp_idx;
        });
        return index;
    }

private:
    // Deduplicating insert: returns the index of an existing descriptor that matches
    // the predicate, or appends the new descriptor and returns its index.
    template <typename Descriptors, typename Descriptor, typename Func>
    static u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) {
        const auto it{std::ranges::find_if(descriptors, pred)};
        if (it != descriptors.end()) {
            return static_cast<u32>(std::distance(descriptors.begin(), it));
        }
        descriptors.push_back(desc);
        return static_cast<u32>(descriptors.size()) - 1;
    }

    const Info& info;
    BufferResourceList& buffer_resources;
    ImageResourceList& image_resources;
    SamplerResourceList& sampler_resources;
    FMaskResourceList& fmask_resources;
};
} // Anonymous namespace

void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors,
                      const Profile& profile) {
    u32 buffer_binding = descriptors.Add(BufferResource{.sharp_idx = 0,
                                                        .used_types = IR::Type::U32,
                                                        .buffer_type = BufferType::Guest,
                                                        .is_written = true,
                                                        .is_formatted = false});

    // Replace handle with binding index in buffer resource list.
    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
    inst.SetArg(0, ir.Imm32(buffer_binding));
}

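// Computes the element offset of a buffer access from the instruction's index/offset
// operands and the buffer sharp (V#) fields, handling thread-id indexing and swizzled
// buffers, and applying the element-size shift at the end.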
IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
                               const AmdGpu::Buffer& buffer, u32 stride) {
    const auto inst_info = inst.Flags<IR::BufferInstInfo>();
    const u32 inst_offset = inst_info.inst_offset.Value();
    const auto is_inst_typed = inst_info.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
    const auto data_format = is_inst_typed
                                 ? AmdGpu::RemapDataFormat(inst_info.inst_data_fmt.Value())
                                 : buffer.GetDataFmt();
    const u32 shift = BufferAddressShift(inst, data_format);
    const u32 mask = (1 << shift) - 1;

    const IR::U32 soffset = IR::GetBufferSOffsetArg(&inst);

    // If the address calculation is of the form "index * const_stride + offset" with a
    // constant offset, and both const_stride and offset are divisible by the element
    // size, apply the shift directly.
    if (inst_info.index_enable && !inst_info.voffset_enable && soffset.IsImmediate() &&
        !buffer.swizzle_enable && !buffer.add_tid_enable && (stride & mask) == 0) {
        const u32 total_offset = soffset.U32() + inst_offset;
        if ((total_offset & mask) == 0) {
            // buffer_offset = index * (const_stride >> shift) + (offset >> shift)
            const IR::U32 index = IR::GetBufferIndexArg(&inst);
            return ir.IAdd(ir.IMul(index, ir.Imm32(stride >> shift)),
                           ir.Imm32(total_offset >> shift));
        }
    }

    // index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
    IR::U32 index = ir.Imm32(0U);
    if (inst_info.index_enable) {
        const IR::U32 vgpr_index = IR::GetBufferIndexArg(&inst);
        index = ir.IAdd(index, vgpr_index);
    }
    if (buffer.add_tid_enable) {
        ASSERT_MSG(info.l_stage == LogicalStage::Compute,
                   "Thread ID buffer addressing is not supported outside of compute.");
        const IR::U32 thread_id{ir.LaneId()};
        index = ir.IAdd(index, thread_id);
    }

    // offset = (inst_offen ? vgpr_offset : 0) + inst_offset + soffset
    IR::U32 offset = ir.Imm32(inst_offset);
    offset = ir.IAdd(offset, soffset);
    if (inst_info.voffset_enable) {
        const IR::U32 voffset = IR::GetBufferVOffsetArg(&inst);
        offset = ir.IAdd(offset, voffset);
    }

    const IR::U32 const_stride = ir.Imm32(stride);
    IR::U32 buffer_offset;
    if (buffer.swizzle_enable) {
        const IR::U32 const_index_stride = ir.Imm32(buffer.GetIndexStride());
        const IR::U32 const_element_size = ir.Imm32(buffer.GetElementSize());
        // index_msb = index / const_index_stride
        const IR::U32 index_msb{ir.IDiv(index, const_index_stride)};
        // index_lsb = index % const_index_stride
        const IR::U32 index_lsb{ir.IMod(index, const_index_stride)};
        // offset_msb = offset / const_element_size
        const IR::U32 offset_msb{ir.IDiv(offset, const_element_size)};
        // offset_lsb = offset % const_element_size
        const IR::U32 offset_lsb{ir.IMod(offset, const_element_size)};
        // buffer_offset =
        //     (index_msb * const_stride + offset_msb * const_element_size) * const_index_stride
        //     + index_lsb * const_element_size + offset_lsb
        const IR::U32 buffer_offset_msb = ir.IMul(
            ir.IAdd(ir.IMul(index_msb, const_stride), ir.IMul(offset_msb, const_element_size)),
            const_index_stride);
        const IR::U32 buffer_offset_lsb =
            ir.IAdd(ir.IMul(index_lsb, const_element_size), offset_lsb);
        buffer_offset = ir.IAdd(buffer_offset_msb, buffer_offset_lsb);
    } else {
        // buffer_offset = index * const_stride + offset
        buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
    }
    if (shift != 0) {
        buffer_offset = ir.ShiftRightLogical(buffer_offset, ir.Imm32(shift));
    }
    return buffer_offset;
}

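// Replaces the raw index/offset operands of a buffer instruction with a single
// precomputed element offset. Note that this stub uses a null V# (and its stride)
// instead of the sharp tracked for the binding.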
void PatchBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
    const auto handle = inst.Arg(0);
    const auto buffer_res = info.buffers[handle.U32()];
    const auto buffer = AmdGpu::Buffer::Null();

    // Address of constant buffer reads can be calculated at IR emission time.
    if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer) {
        return;
    }

    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
    inst.SetArg(IR::LoadBufferArgs::Address,
                CalculateBufferAddress(ir, inst, info, buffer, buffer.stride));
}

void ResourceTrackingPassStub(IR::Program& program, const Profile& profile) {
    // Iterate resource instructions and patch them after finding the sharp.
    auto& info = program.info;

    // Pass 1: Track resource sharps
    Descriptors descriptors{info};
    for (IR::Block* const block : program.blocks) {
        for (IR::Inst& inst : block->Instructions()) {
            if (IsBufferInstruction(inst)) {
                PatchBufferSharp(*block, inst, info, descriptors, profile);
            }
        }
    }

    // Pass 2: Patch instruction args
    for (IR::Block* const block : program.blocks) {
        for (IR::Inst& inst : block->Instructions()) {
            if (IsBufferInstruction(inst)) {
                PatchBufferArgs(*block, inst, info);
            }
        }
    }
}

} // namespace Shader::Optimization