diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
index 6155bdb29..55fa8ede5 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
@@ -445,19 +445,11 @@ Id EmitImageAtomicCmpSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coo
 }
 
 Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
-    const auto& buffer = ctx.buffers[binding];
-    const auto [id, pointer_type] = buffer.Alias(PointerType::U32);
-    const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
-    const auto [scope, semantics]{AtomicArgs(ctx)};
-    return ctx.OpAtomicIIncrement(ctx.U32[1], ptr, scope, semantics);
+    UNREACHABLE_MSG("SPIR-V Instruction");
 }
 
 Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding) {
-    const auto& buffer = ctx.buffers[binding];
-    const auto [id, pointer_type] = buffer.Alias(PointerType::U32);
-    const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
-    const auto [scope, semantics]{AtomicArgs(ctx)};
-    return ctx.OpAtomicIDecrement(ctx.U32[1], ptr, scope, semantics);
+    UNREACHABLE_MSG("SPIR-V Instruction");
 }
 
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp
index 3c6fd3968..3edafd86c 100644
--- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp
@@ -70,13 +70,17 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
         }
         return ir.ShiftRightLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), ir.Imm32(2));
     }();
+
+    IR::BufferInstInfo buffer_info{};
+    buffer_info.pc.Assign(pc);
+
     const IR::Value vsharp =
         ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1),
                               ir.GetScalarReg(sbase + 2), ir.GetScalarReg(sbase + 3));
     IR::ScalarReg dst_reg{inst.dst[0].code};
 
     for (u32 i = 0; i < num_dwords; i++) {
         const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i));
-        ir.SetScalarReg(dst_reg + i, ir.ReadConstBuffer(vsharp, index));
+        ir.SetScalarReg(dst_reg + i, ir.ReadConstBuffer(vsharp, index, buffer_info));
     }
 }
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index 85ce2311a..e7030eca1 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -229,6 +229,7 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_
     } else {
         buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
     }
+    buffer_info.pc.Assign(pc);
 
     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
@@ -296,6 +297,7 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer
     } else {
         buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
     }
+    buffer_info.pc.Assign(pc);
 
     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
@@ -355,6 +357,7 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
     buffer_info.inst_offset.Assign(mubuf.offset);
     buffer_info.globally_coherent.Assign(mubuf.glc);
     buffer_info.system_coherent.Assign(mubuf.slc);
+    buffer_info.pc.Assign(pc);
 
     IR::Value vdata_val = [&] {
         if constexpr (std::is_same_v<T, IR::U32>) {
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 922f01b1f..e45bcc15e 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -427,8 +427,8 @@ U32 IREmitter::ReadConst(const Value& base, const U32& offset) {
     return Inst(Opcode::ReadConst, base, offset);
 }
 
-U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) {
-    return Inst(Opcode::ReadConstBuffer, handle, index);
+U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index, BufferInstInfo info) {
+    return Inst(Opcode::ReadConstBuffer, Flags{info}, handle, index);
 }
 
 U8 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) {
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index ec0edfed4..a7e45c69f 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -115,7 +115,7 @@ public:
     [[nodiscard]] T SharedAtomicDec(const U32& address, bool is_gds);
 
     [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset);
-    [[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index);
+    [[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index, BufferInstInfo info);
 
     [[nodiscard]] U8 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info);
     [[nodiscard]] U16 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info);
@@ -422,7 +422,7 @@ private:
     }
 
     template <typename T>
-        requires(sizeof(T) <= sizeof(u32) && std::is_trivially_copyable_v<T>)
+        requires(sizeof(T) <= sizeof(u64) && std::is_trivially_copyable_v<T>)
     struct Flags {
         Flags() = default;
         Flags(T proxy_) : proxy{proxy_} {}
@@ -432,7 +432,7 @@ private:
 
     template <typename T = Value, typename FlagType, typename... Args>
     T Inst(Opcode op, Flags<FlagType> flags, Args... args) {
-        u32 raw_flags{};
+        u64 raw_flags{};
         std::memcpy(&raw_flags, &flags.proxy, sizeof(flags.proxy));
         auto it{block->PrependNewInst(insertion_point, op, {Value{args}...}, raw_flags)};
         it->SetParent(block);
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index 52942338b..3c70969a1 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -126,7 +126,8 @@ bool IsDataRingInstruction(const IR::Inst& inst) {
     }
 }
 
-IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
+IR::Type BufferDataType(const IR::Inst& inst, const Profile& profile,
+                        AmdGpu::NumberFormat num_format) {
     switch (inst.GetOpcode()) {
     case IR::Opcode::LoadBufferU8:
     case IR::Opcode::StoreBufferU8:
@@ -144,7 +145,7 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
         return IR::Type::U64;
     case IR::Opcode::BufferAtomicFMax32:
     case IR::Opcode::BufferAtomicFMin32:
-        return IR::Type::F32;
+        return profile.supports_buffer_fp32_atomic_min_max ? IR::Type::F32 : IR::Type::U32;
     case IR::Opcode::LoadBufferFormatF32:
     case IR::Opcode::StoreBufferFormatF32:
         // Formatted buffer loads can use a variety of types.
@@ -489,7 +490,8 @@ SharpLocation TrackSharp(const IR::Inst* inst, const IR::Block& current_parent,
     return SharpLocationFromSource(sources[0]);
 }
 
-void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
+void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors,
+                      const Profile& profile) {
     IR::Inst* handle = inst.Arg(0).InstRecursive();
     u32 buffer_binding = 0;
     if (handle->AreAllArgsImmediates()) {
@@ -509,18 +511,19 @@ void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
         const auto buffer = std::bit_cast<AmdGpu::Buffer>(raw);
         buffer_binding = descriptors.Add(BufferResource{
             .sharp_idx = std::numeric_limits<u32>::max(),
-            .used_types = BufferDataType(inst, buffer.GetNumberFmt()),
+            .used_types = BufferDataType(inst, profile, buffer.GetNumberFmt()),
             .inline_cbuf = buffer,
             .buffer_type = BufferType::Guest,
         });
     } else {
         // Normal buffer resource.
         IR::Inst* buffer_handle = handle->Arg(0).InstRecursive();
-        const auto sharp_idx = TrackSharp(buffer_handle, block);
+        const auto inst_info = inst.Flags<IR::BufferInstInfo>();
+        const auto sharp_idx = TrackSharp(buffer_handle, block, inst_info.pc);
         const auto buffer = info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
         buffer_binding = descriptors.Add(BufferResource{
             .sharp_idx = sharp_idx,
-            .used_types = BufferDataType(inst, buffer.GetNumberFmt()),
+            .used_types = BufferDataType(inst, profile, buffer.GetNumberFmt()),
             .buffer_type = BufferType::Guest,
             .is_written = IsBufferStore(inst),
             .is_formatted = inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
@@ -662,7 +665,7 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
 }
 
 void PatchGlobalDataShareAccess(IR::Block& block, IR::Inst& inst, Info& info,
-                                Descriptors& descriptors) {
+                                Descriptors& descriptors, const Profile& profile) {
     const u32 binding = descriptors.Add(BufferResource{
         .used_types = IR::Type::U32,
         .inline_cbuf = AmdGpu::Buffer::Null(),
@@ -700,9 +703,13 @@ void PatchGlobalDataShareAccess(IR::Block& block, IR::Inst& inst, Info& info,
             gds_addr = m0_val & 0xFFFF;
         }
 
-        // Patch instruction.
-        inst.SetArg(0, ir.Imm32(gds_addr >> 2));
-        inst.SetArg(1, ir.Imm32(binding));
+        // Patch instruction to GDS buffer atomic increment/decrement.
+        const IR::U32 handle = ir.Imm32(binding);
+        const IR::U32 index = ir.Imm32(gds_addr >> 2);
+        const bool is_append = inst.GetOpcode() == IR::Opcode::DataAppend;
+        const IR::Value prev = is_append ? ir.BufferAtomicInc(handle, index, {})
+                                         : ir.BufferAtomicDec(handle, index, {});
+        inst.ReplaceUsesWithAndRemove(prev);
     } else {
         // Convert shared memory opcode to storage buffer atomic to GDS buffer.
         auto& buffer = info.buffers[binding];
@@ -1187,7 +1194,7 @@ void ResourceTrackingPass(IR::Program& program, const Profile& profile) {
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
             if (IsBufferInstruction(inst)) {
-                PatchBufferSharp(*block, inst, info, descriptors);
+                PatchBufferSharp(*block, inst, info, descriptors, profile);
             } else if (IsImageInstruction(inst)) {
                 PatchImageSharp(*block, inst, info, descriptors, profile);
             }
@@ -1202,7 +1209,7 @@ void ResourceTrackingPass(IR::Program& program, const Profile& profile) {
             } else if (IsImageInstruction(inst)) {
                 PatchImageArgs(*block, inst, info);
             } else if (IsDataRingInstruction(inst)) {
-                PatchGlobalDataShareAccess(*block, inst, info, descriptors);
+                PatchGlobalDataShareAccess(*block, inst, info, descriptors, profile);
             }
         }
     }
diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp
index 2a54002f0..a75d36b3a 100644
--- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp
+++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp
@@ -73,7 +73,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim
                 const auto attrib =
                     IR::Value{offset < 16 ? IR::Attribute::Position0
                                           : IR::Attribute::Param0 + (offset / 16 - 1)};
-                const auto comp = (offset / 4) % 4;
+                const u32 comp = (offset / 4) % 4;
 
                 inst.ReplaceOpcode(IR::Opcode::SetAttribute);
                 inst.ClearArgs();
diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h
index ee71cfc4e..51c959710 100644
--- a/src/shader_recompiler/ir/reg.h
+++ b/src/shader_recompiler/ir/reg.h
@@ -49,15 +49,16 @@ union TextureInstInfo {
 };
 
 union BufferInstInfo {
-    u32 raw;
-    BitField<0, 1, u32> index_enable;
-    BitField<1, 1, u32> voffset_enable;
-    BitField<2, 12, u32> inst_offset;
-    BitField<14, 1, u32> system_coherent;
-    BitField<15, 1, u32> globally_coherent;
-    BitField<16, 1, u32> typed;
+    u64 raw;
+    BitField<0, 1, u64> index_enable;
+    BitField<1, 1, u64> voffset_enable;
+    BitField<2, 12, u64> inst_offset;
+    BitField<14, 1, u64> system_coherent;
+    BitField<15, 1, u64> globally_coherent;
+    BitField<16, 1, u64> typed;
     BitField<17, 4, AmdGpu::DataFormat> inst_data_fmt;
     BitField<21, 3, AmdGpu::NumberFormat> inst_num_fmt;
+    BitField<32, 16, u64> pc;
 };
 
 enum class ScalarReg : u32 {
diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h
index bca44125b..6ecd8ae0e 100644
--- a/src/shader_recompiler/ir/value.h
+++ b/src/shader_recompiler/ir/value.h
@@ -200,7 +200,7 @@ public:
     void ReplaceOpcode(IR::Opcode opcode);
 
     template <typename FlagsType>
-        requires(sizeof(FlagsType) <= sizeof(u32) && std::is_trivially_copyable_v<FlagsType>)
+        requires(sizeof(FlagsType) <= sizeof(u64) && std::is_trivially_copyable_v<FlagsType>)
     [[nodiscard]] FlagsType Flags() const noexcept {
         FlagsType ret;
         std::memcpy(reinterpret_cast<char*>(&ret), &flags, sizeof(ret));
@@ -208,7 +208,7 @@ public:
     }
 
     template <typename FlagsType>
-        requires(sizeof(FlagsType) <= sizeof(u32) && std::is_trivially_copyable_v<FlagsType>)
+        requires(sizeof(FlagsType) <= sizeof(u64) && std::is_trivially_copyable_v<FlagsType>)
     void SetFlags(FlagsType value) noexcept {
         std::memcpy(&flags, &value, sizeof(value));
     }
@@ -239,8 +239,8 @@ private:
     void ReplaceUsesWith(Value replacement, bool preserve);
 
     IR::Opcode op{};
-    u32 flags{};
     u32 definition{};
+    u64 flags{};
    IR::Block* parent{};
     union {
         NonTriviallyDummy dummy{};