mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2026-04-18 08:51:28 -06:00
shader_recompiler: Minor improvements to buffer atomics (#4242)
* resource_tracking_pass: Adjust the buffer type if the host doesn't support float buffer atomics
* resource_tracking_pass: Implement data append/consume as buffer atomics at the IR level. This was previously done in the SPIR-V backend; the implementation was exactly the same as for buffer atomics, so unify them
* ir: Bump instruction flags to 8 bytes
* frontend: Pass pc to buffer flags for better debugging when sharp tracking fails
* clang format
---------
Co-authored-by: georgemoralis <giorgosmrls@gmail.com>
This commit is contained in:
parent
3381f5d7d0
commit
0d3b6f7dd0
@ -445,19 +445,11 @@ Id EmitImageAtomicCmpSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coo
|
||||
}
|
||||
|
||||
Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
|
||||
const auto& buffer = ctx.buffers[binding];
|
||||
const auto [id, pointer_type] = buffer.Alias(PointerType::U32);
|
||||
const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
|
||||
const auto [scope, semantics]{AtomicArgs(ctx)};
|
||||
return ctx.OpAtomicIIncrement(ctx.U32[1], ptr, scope, semantics);
|
||||
UNREACHABLE_MSG("SPIR-V Instruction");
|
||||
}
|
||||
|
||||
Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding) {
|
||||
const auto& buffer = ctx.buffers[binding];
|
||||
const auto [id, pointer_type] = buffer.Alias(PointerType::U32);
|
||||
const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
|
||||
const auto [scope, semantics]{AtomicArgs(ctx)};
|
||||
return ctx.OpAtomicIDecrement(ctx.U32[1], ptr, scope, semantics);
|
||||
UNREACHABLE_MSG("SPIR-V Instruction");
|
||||
}
|
||||
|
||||
} // namespace Shader::Backend::SPIRV
|
||||
|
||||
@ -70,13 +70,17 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
|
||||
}
|
||||
return ir.ShiftRightLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), ir.Imm32(2));
|
||||
}();
|
||||
|
||||
IR::BufferInstInfo buffer_info{};
|
||||
buffer_info.pc.Assign(pc);
|
||||
|
||||
const IR::Value vsharp =
|
||||
ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1),
|
||||
ir.GetScalarReg(sbase + 2), ir.GetScalarReg(sbase + 3));
|
||||
IR::ScalarReg dst_reg{inst.dst[0].code};
|
||||
for (u32 i = 0; i < num_dwords; i++) {
|
||||
const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i));
|
||||
ir.SetScalarReg(dst_reg + i, ir.ReadConstBuffer(vsharp, index));
|
||||
ir.SetScalarReg(dst_reg + i, ir.ReadConstBuffer(vsharp, index, buffer_info));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -229,6 +229,7 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_
|
||||
} else {
|
||||
buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
|
||||
}
|
||||
buffer_info.pc.Assign(pc);
|
||||
|
||||
const IR::Value handle =
|
||||
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
|
||||
@ -296,6 +297,7 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer
|
||||
} else {
|
||||
buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
|
||||
}
|
||||
buffer_info.pc.Assign(pc);
|
||||
|
||||
const IR::Value handle =
|
||||
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
|
||||
@ -355,6 +357,7 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
|
||||
buffer_info.inst_offset.Assign(mubuf.offset);
|
||||
buffer_info.globally_coherent.Assign(mubuf.glc);
|
||||
buffer_info.system_coherent.Assign(mubuf.slc);
|
||||
buffer_info.pc.Assign(pc);
|
||||
|
||||
IR::Value vdata_val = [&] {
|
||||
if constexpr (std::is_same_v<T, IR::U32>) {
|
||||
|
||||
@ -427,8 +427,8 @@ U32 IREmitter::ReadConst(const Value& base, const U32& offset) {
|
||||
return Inst<U32>(Opcode::ReadConst, base, offset);
|
||||
}
|
||||
|
||||
U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) {
|
||||
return Inst<U32>(Opcode::ReadConstBuffer, handle, index);
|
||||
U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index, BufferInstInfo info) {
|
||||
return Inst<U32>(Opcode::ReadConstBuffer, Flags{info}, handle, index);
|
||||
}
|
||||
|
||||
U8 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) {
|
||||
|
||||
@ -115,7 +115,7 @@ public:
|
||||
[[nodiscard]] T SharedAtomicDec(const U32& address, bool is_gds);
|
||||
|
||||
[[nodiscard]] U32 ReadConst(const Value& base, const U32& offset);
|
||||
[[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index);
|
||||
[[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index, BufferInstInfo info);
|
||||
|
||||
[[nodiscard]] U8 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info);
|
||||
[[nodiscard]] U16 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info);
|
||||
@ -422,7 +422,7 @@ private:
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
requires(sizeof(T) <= sizeof(u32) && std::is_trivially_copyable_v<T>)
|
||||
requires(sizeof(T) <= sizeof(u64) && std::is_trivially_copyable_v<T>)
|
||||
struct Flags {
|
||||
Flags() = default;
|
||||
Flags(T proxy_) : proxy{proxy_} {}
|
||||
@ -432,7 +432,7 @@ private:
|
||||
|
||||
template <typename T = Value, typename FlagType, typename... Args>
|
||||
T Inst(Opcode op, Flags<FlagType> flags, Args... args) {
|
||||
u32 raw_flags{};
|
||||
u64 raw_flags{};
|
||||
std::memcpy(&raw_flags, &flags.proxy, sizeof(flags.proxy));
|
||||
auto it{block->PrependNewInst(insertion_point, op, {Value{args}...}, raw_flags)};
|
||||
it->SetParent(block);
|
||||
|
||||
@ -126,7 +126,8 @@ bool IsDataRingInstruction(const IR::Inst& inst) {
|
||||
}
|
||||
}
|
||||
|
||||
IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
|
||||
IR::Type BufferDataType(const IR::Inst& inst, const Profile& profile,
|
||||
AmdGpu::NumberFormat num_format) {
|
||||
switch (inst.GetOpcode()) {
|
||||
case IR::Opcode::LoadBufferU8:
|
||||
case IR::Opcode::StoreBufferU8:
|
||||
@ -144,7 +145,7 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
|
||||
return IR::Type::U64;
|
||||
case IR::Opcode::BufferAtomicFMax32:
|
||||
case IR::Opcode::BufferAtomicFMin32:
|
||||
return IR::Type::F32;
|
||||
return profile.supports_buffer_fp32_atomic_min_max ? IR::Type::F32 : IR::Type::U32;
|
||||
case IR::Opcode::LoadBufferFormatF32:
|
||||
case IR::Opcode::StoreBufferFormatF32:
|
||||
// Formatted buffer loads can use a variety of types.
|
||||
@ -489,7 +490,8 @@ SharpLocation TrackSharp(const IR::Inst* inst, const IR::Block& current_parent,
|
||||
return SharpLocationFromSource(sources[0]);
|
||||
}
|
||||
|
||||
void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
|
||||
void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors,
|
||||
const Profile& profile) {
|
||||
IR::Inst* handle = inst.Arg(0).InstRecursive();
|
||||
u32 buffer_binding = 0;
|
||||
if (handle->AreAllArgsImmediates()) {
|
||||
@ -509,18 +511,19 @@ void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
|
||||
const auto buffer = std::bit_cast<AmdGpu::Buffer>(raw);
|
||||
buffer_binding = descriptors.Add(BufferResource{
|
||||
.sharp_idx = std::numeric_limits<u32>::max(),
|
||||
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
|
||||
.used_types = BufferDataType(inst, profile, buffer.GetNumberFmt()),
|
||||
.inline_cbuf = buffer,
|
||||
.buffer_type = BufferType::Guest,
|
||||
});
|
||||
} else {
|
||||
// Normal buffer resource.
|
||||
IR::Inst* buffer_handle = handle->Arg(0).InstRecursive();
|
||||
const auto sharp_idx = TrackSharp(buffer_handle, block);
|
||||
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
|
||||
const auto sharp_idx = TrackSharp(buffer_handle, block, inst_info.pc);
|
||||
const auto buffer = info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
|
||||
buffer_binding = descriptors.Add(BufferResource{
|
||||
.sharp_idx = sharp_idx,
|
||||
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
|
||||
.used_types = BufferDataType(inst, profile, buffer.GetNumberFmt()),
|
||||
.buffer_type = BufferType::Guest,
|
||||
.is_written = IsBufferStore(inst),
|
||||
.is_formatted = inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
|
||||
@ -662,7 +665,7 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
|
||||
}
|
||||
|
||||
void PatchGlobalDataShareAccess(IR::Block& block, IR::Inst& inst, Info& info,
|
||||
Descriptors& descriptors) {
|
||||
Descriptors& descriptors, const Profile& profile) {
|
||||
const u32 binding = descriptors.Add(BufferResource{
|
||||
.used_types = IR::Type::U32,
|
||||
.inline_cbuf = AmdGpu::Buffer::Null(),
|
||||
@ -700,9 +703,13 @@ void PatchGlobalDataShareAccess(IR::Block& block, IR::Inst& inst, Info& info,
|
||||
gds_addr = m0_val & 0xFFFF;
|
||||
}
|
||||
|
||||
// Patch instruction.
|
||||
inst.SetArg(0, ir.Imm32(gds_addr >> 2));
|
||||
inst.SetArg(1, ir.Imm32(binding));
|
||||
// Patch instruction to GDS buffer atomic increment/decrement.
|
||||
const IR::U32 handle = ir.Imm32(binding);
|
||||
const IR::U32 index = ir.Imm32(gds_addr >> 2);
|
||||
const bool is_append = inst.GetOpcode() == IR::Opcode::DataAppend;
|
||||
const IR::Value prev = is_append ? ir.BufferAtomicInc(handle, index, {})
|
||||
: ir.BufferAtomicDec(handle, index, {});
|
||||
inst.ReplaceUsesWithAndRemove(prev);
|
||||
} else {
|
||||
// Convert shared memory opcode to storage buffer atomic to GDS buffer.
|
||||
auto& buffer = info.buffers[binding];
|
||||
@ -1187,7 +1194,7 @@ void ResourceTrackingPass(IR::Program& program, const Profile& profile) {
|
||||
for (IR::Block* const block : program.blocks) {
|
||||
for (IR::Inst& inst : block->Instructions()) {
|
||||
if (IsBufferInstruction(inst)) {
|
||||
PatchBufferSharp(*block, inst, info, descriptors);
|
||||
PatchBufferSharp(*block, inst, info, descriptors, profile);
|
||||
} else if (IsImageInstruction(inst)) {
|
||||
PatchImageSharp(*block, inst, info, descriptors, profile);
|
||||
}
|
||||
@ -1202,7 +1209,7 @@ void ResourceTrackingPass(IR::Program& program, const Profile& profile) {
|
||||
} else if (IsImageInstruction(inst)) {
|
||||
PatchImageArgs(*block, inst, info);
|
||||
} else if (IsDataRingInstruction(inst)) {
|
||||
PatchGlobalDataShareAccess(*block, inst, info, descriptors);
|
||||
PatchGlobalDataShareAccess(*block, inst, info, descriptors, profile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -73,7 +73,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim
|
||||
const auto attrib =
|
||||
IR::Value{offset < 16 ? IR::Attribute::Position0
|
||||
: IR::Attribute::Param0 + (offset / 16 - 1)};
|
||||
const auto comp = (offset / 4) % 4;
|
||||
const u32 comp = (offset / 4) % 4;
|
||||
|
||||
inst.ReplaceOpcode(IR::Opcode::SetAttribute);
|
||||
inst.ClearArgs();
|
||||
|
||||
@ -49,15 +49,16 @@ union TextureInstInfo {
|
||||
};
|
||||
|
||||
union BufferInstInfo {
|
||||
u32 raw;
|
||||
BitField<0, 1, u32> index_enable;
|
||||
BitField<1, 1, u32> voffset_enable;
|
||||
BitField<2, 12, u32> inst_offset;
|
||||
BitField<14, 1, u32> system_coherent;
|
||||
BitField<15, 1, u32> globally_coherent;
|
||||
BitField<16, 1, u32> typed;
|
||||
u64 raw;
|
||||
BitField<0, 1, u64> index_enable;
|
||||
BitField<1, 1, u64> voffset_enable;
|
||||
BitField<2, 12, u64> inst_offset;
|
||||
BitField<14, 1, u64> system_coherent;
|
||||
BitField<15, 1, u64> globally_coherent;
|
||||
BitField<16, 1, u64> typed;
|
||||
BitField<17, 4, AmdGpu::DataFormat> inst_data_fmt;
|
||||
BitField<21, 3, AmdGpu::NumberFormat> inst_num_fmt;
|
||||
BitField<32, 16, u64> pc;
|
||||
};
|
||||
|
||||
enum class ScalarReg : u32 {
|
||||
|
||||
@ -200,7 +200,7 @@ public:
|
||||
void ReplaceOpcode(IR::Opcode opcode);
|
||||
|
||||
template <typename FlagsType>
|
||||
requires(sizeof(FlagsType) <= sizeof(u32) && std::is_trivially_copyable_v<FlagsType>)
|
||||
requires(sizeof(FlagsType) <= sizeof(u64) && std::is_trivially_copyable_v<FlagsType>)
|
||||
[[nodiscard]] FlagsType Flags() const noexcept {
|
||||
FlagsType ret;
|
||||
std::memcpy(reinterpret_cast<char*>(&ret), &flags, sizeof(ret));
|
||||
@ -208,7 +208,7 @@ public:
|
||||
}
|
||||
|
||||
template <typename FlagsType>
|
||||
requires(sizeof(FlagsType) <= sizeof(u32) && std::is_trivially_copyable_v<FlagsType>)
|
||||
requires(sizeof(FlagsType) <= sizeof(u64) && std::is_trivially_copyable_v<FlagsType>)
|
||||
void SetFlags(FlagsType value) noexcept {
|
||||
std::memcpy(&flags, &value, sizeof(value));
|
||||
}
|
||||
@ -239,8 +239,8 @@ private:
|
||||
void ReplaceUsesWith(Value replacement, bool preserve);
|
||||
|
||||
IR::Opcode op{};
|
||||
u32 flags{};
|
||||
u32 definition{};
|
||||
u64 flags{};
|
||||
IR::Block* parent{};
|
||||
union {
|
||||
NonTriviallyDummy dummy{};
|
||||
|
||||
Loading…
Reference in New Issue
Block a user