mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2026-04-29 23:41:19 -06:00
336 lines
13 KiB
C++
336 lines
13 KiB
C++
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
#include "shader_recompiler/frontend/control_flow_graph.h"
|
|
#include "shader_recompiler/info.h"
|
|
#include "shader_recompiler/ir/basic_block.h"
|
|
#include "shader_recompiler/ir/breadth_first_search.h"
|
|
#include "shader_recompiler/ir/ir_emitter.h"
|
|
#include "shader_recompiler/ir/operand_helper.h"
|
|
#include "shader_recompiler/ir/program.h"
|
|
#include "shader_recompiler/ir/reinterpret.h"
|
|
#include "shader_recompiler/profile.h"
|
|
#include "video_core/amdgpu/resource.h"
|
|
|
|
namespace Shader::Optimization {
|
|
namespace {
|
|
|
|
using SharpLocation = u32;
|
|
|
|
bool IsBufferAtomic(const IR::Inst& inst) {
|
|
switch (inst.GetOpcode()) {
|
|
case IR::Opcode::BufferAtomicIAdd32:
|
|
case IR::Opcode::BufferAtomicIAdd64:
|
|
case IR::Opcode::BufferAtomicISub32:
|
|
case IR::Opcode::BufferAtomicSMin32:
|
|
case IR::Opcode::BufferAtomicSMin64:
|
|
case IR::Opcode::BufferAtomicUMin32:
|
|
case IR::Opcode::BufferAtomicUMin64:
|
|
case IR::Opcode::BufferAtomicFMin32:
|
|
case IR::Opcode::BufferAtomicSMax32:
|
|
case IR::Opcode::BufferAtomicSMax64:
|
|
case IR::Opcode::BufferAtomicUMax32:
|
|
case IR::Opcode::BufferAtomicUMax64:
|
|
case IR::Opcode::BufferAtomicFMax32:
|
|
case IR::Opcode::BufferAtomicInc32:
|
|
case IR::Opcode::BufferAtomicDec32:
|
|
case IR::Opcode::BufferAtomicAnd32:
|
|
case IR::Opcode::BufferAtomicOr32:
|
|
case IR::Opcode::BufferAtomicXor32:
|
|
case IR::Opcode::BufferAtomicSwap32:
|
|
case IR::Opcode::BufferAtomicCmpSwap32:
|
|
case IR::Opcode::BufferAtomicFCmpSwap32:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool IsBufferStore(const IR::Inst& inst) {
|
|
switch (inst.GetOpcode()) {
|
|
case IR::Opcode::StoreBufferU8:
|
|
case IR::Opcode::StoreBufferU16:
|
|
case IR::Opcode::StoreBufferU32:
|
|
case IR::Opcode::StoreBufferU32x2:
|
|
case IR::Opcode::StoreBufferU32x3:
|
|
case IR::Opcode::StoreBufferU32x4:
|
|
case IR::Opcode::StoreBufferU64:
|
|
case IR::Opcode::StoreBufferF32:
|
|
case IR::Opcode::StoreBufferF32x2:
|
|
case IR::Opcode::StoreBufferF32x3:
|
|
case IR::Opcode::StoreBufferF32x4:
|
|
case IR::Opcode::StoreBufferFormatF32:
|
|
return true;
|
|
default:
|
|
return IsBufferAtomic(inst);
|
|
}
|
|
}
|
|
|
|
bool IsBufferInstruction(const IR::Inst& inst) {
|
|
switch (inst.GetOpcode()) {
|
|
case IR::Opcode::LoadBufferU8:
|
|
case IR::Opcode::LoadBufferU16:
|
|
case IR::Opcode::LoadBufferU32:
|
|
case IR::Opcode::LoadBufferU32x2:
|
|
case IR::Opcode::LoadBufferU32x3:
|
|
case IR::Opcode::LoadBufferU32x4:
|
|
case IR::Opcode::LoadBufferU64:
|
|
case IR::Opcode::LoadBufferF32:
|
|
case IR::Opcode::LoadBufferF32x2:
|
|
case IR::Opcode::LoadBufferF32x3:
|
|
case IR::Opcode::LoadBufferF32x4:
|
|
case IR::Opcode::LoadBufferFormatF32:
|
|
case IR::Opcode::ReadConstBuffer:
|
|
return true;
|
|
default:
|
|
return IsBufferStore(inst);
|
|
}
|
|
}
|
|
|
|
u32 BufferAddressShift(const IR::Inst& inst, AmdGpu::DataFormat data_format) {
|
|
switch (inst.GetOpcode()) {
|
|
case IR::Opcode::LoadBufferU8:
|
|
case IR::Opcode::StoreBufferU8:
|
|
return 0;
|
|
case IR::Opcode::LoadBufferU16:
|
|
case IR::Opcode::StoreBufferU16:
|
|
return 1;
|
|
case IR::Opcode::LoadBufferU64:
|
|
case IR::Opcode::StoreBufferU64:
|
|
case IR::Opcode::BufferAtomicIAdd64:
|
|
case IR::Opcode::BufferAtomicSMax64:
|
|
case IR::Opcode::BufferAtomicSMin64:
|
|
case IR::Opcode::BufferAtomicUMax64:
|
|
case IR::Opcode::BufferAtomicUMin64:
|
|
return 3;
|
|
case IR::Opcode::LoadBufferFormatF32:
|
|
case IR::Opcode::StoreBufferFormatF32: {
|
|
switch (data_format) {
|
|
case AmdGpu::DataFormat::Format8:
|
|
return 0;
|
|
case AmdGpu::DataFormat::Format8_8:
|
|
case AmdGpu::DataFormat::Format16:
|
|
return 1;
|
|
case AmdGpu::DataFormat::Format8_8_8_8:
|
|
case AmdGpu::DataFormat::Format16_16:
|
|
case AmdGpu::DataFormat::Format10_11_11:
|
|
case AmdGpu::DataFormat::Format2_10_10_10:
|
|
case AmdGpu::DataFormat::Format16_16_16_16:
|
|
case AmdGpu::DataFormat::Format32:
|
|
case AmdGpu::DataFormat::Format32_32:
|
|
case AmdGpu::DataFormat::Format32_32_32:
|
|
case AmdGpu::DataFormat::Format32_32_32_32:
|
|
return 2;
|
|
default:
|
|
return 0;
|
|
}
|
|
break;
|
|
}
|
|
case IR::Opcode::ReadConstBuffer:
|
|
// Provided address is already in dwords
|
|
return 0;
|
|
default:
|
|
return 2;
|
|
}
|
|
}
|
|
|
|
class Descriptors {
|
|
public:
|
|
explicit Descriptors(Info& info_)
|
|
: info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images},
|
|
sampler_resources{info_.samplers}, fmask_resources(info_.fmasks) {}
|
|
|
|
u32 Add(const BufferResource& desc) {
|
|
const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) {
|
|
return desc.sharp_idx == existing.sharp_idx &&
|
|
desc.inline_cbuf == existing.inline_cbuf &&
|
|
desc.buffer_type == existing.buffer_type;
|
|
})};
|
|
auto& buffer = buffer_resources[index];
|
|
buffer.used_types |= desc.used_types;
|
|
buffer.is_written |= desc.is_written;
|
|
buffer.is_formatted |= desc.is_formatted;
|
|
return index;
|
|
}
|
|
|
|
u32 Add(const ImageResource& desc) {
|
|
const u32 index{Add(image_resources, desc, [&desc](const auto& existing) {
|
|
return desc.sharp_idx == existing.sharp_idx && desc.is_array == existing.is_array &&
|
|
desc.mip_fallback_mode == existing.mip_fallback_mode &&
|
|
desc.constant_mip_index == existing.constant_mip_index;
|
|
})};
|
|
auto& image = image_resources[index];
|
|
image.is_atomic |= desc.is_atomic;
|
|
image.is_written |= desc.is_written;
|
|
return index;
|
|
}
|
|
|
|
u32 Add(const SamplerResource& desc) {
|
|
const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) {
|
|
return desc.sharp_idx == existing.sharp_idx &&
|
|
desc.is_inline_sampler == existing.is_inline_sampler &&
|
|
desc.inline_sampler == existing.inline_sampler;
|
|
})};
|
|
return index;
|
|
}
|
|
|
|
u32 Add(const FMaskResource& desc) {
|
|
u32 index = Add(fmask_resources, desc, [&desc](const auto& existing) {
|
|
return desc.sharp_idx == existing.sharp_idx;
|
|
});
|
|
return index;
|
|
}
|
|
|
|
private:
|
|
template <typename Descriptors, typename Descriptor, typename Func>
|
|
static u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) {
|
|
const auto it{std::ranges::find_if(descriptors, pred)};
|
|
if (it != descriptors.end()) {
|
|
return static_cast<u32>(std::distance(descriptors.begin(), it));
|
|
}
|
|
descriptors.push_back(desc);
|
|
return static_cast<u32>(descriptors.size()) - 1;
|
|
}
|
|
|
|
const Info& info;
|
|
BufferResourceList& buffer_resources;
|
|
ImageResourceList& image_resources;
|
|
SamplerResourceList& sampler_resources;
|
|
FMaskResourceList& fmask_resources;
|
|
};
|
|
|
|
} // Anonymous namespace
|
|
|
|
void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors,
|
|
const Profile& profile) {
|
|
u32 buffer_binding = descriptors.Add(BufferResource{.sharp_idx = 0,
|
|
.used_types = IR::Type::U32,
|
|
.buffer_type = BufferType::Guest,
|
|
.is_written = true,
|
|
.is_formatted = false});
|
|
|
|
// Replace handle with binding index in buffer resource list.
|
|
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
|
|
inst.SetArg(0, ir.Imm32(buffer_binding));
|
|
}
|
|
|
|
IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
|
|
const AmdGpu::Buffer& buffer, u32 stride) {
|
|
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
|
|
const u32 inst_offset = inst_info.inst_offset.Value();
|
|
const auto is_inst_typed = inst_info.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
|
|
const auto data_format = is_inst_typed
|
|
? AmdGpu::RemapDataFormat(inst_info.inst_data_fmt.Value())
|
|
: buffer.GetDataFmt();
|
|
const u32 shift = BufferAddressShift(inst, data_format);
|
|
const u32 mask = (1 << shift) - 1;
|
|
const IR::U32 soffset = IR::GetBufferSOffsetArg(&inst);
|
|
|
|
// If address calculation is of the form "index * const_stride + offset" with
|
|
// offset constant and both const_stride and offset are divisible with the
|
|
// element size, apply shift directly.
|
|
if (inst_info.index_enable && !inst_info.voffset_enable && soffset.IsImmediate() &&
|
|
!buffer.swizzle_enable && !buffer.add_tid_enable && (stride & mask) == 0) {
|
|
const u32 total_offset = soffset.U32() + inst_offset;
|
|
if ((total_offset & mask) == 0) {
|
|
// buffer_offset = index * (const_stride >> shift) + (offset >> shift)
|
|
const IR::U32 index = IR::GetBufferIndexArg(&inst);
|
|
return ir.IAdd(ir.IMul(index, ir.Imm32(stride >> shift)),
|
|
ir.Imm32(total_offset >> shift));
|
|
}
|
|
}
|
|
|
|
// index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ?
|
|
// thread_id[5:0] : 0)
|
|
IR::U32 index = ir.Imm32(0U);
|
|
if (inst_info.index_enable) {
|
|
const IR::U32 vgpr_index = IR::GetBufferIndexArg(&inst);
|
|
index = ir.IAdd(index, vgpr_index);
|
|
}
|
|
if (buffer.add_tid_enable) {
|
|
ASSERT_MSG(info.l_stage == LogicalStage::Compute,
|
|
"Thread ID buffer addressing is not supported outside of compute.");
|
|
const IR::U32 thread_id{ir.LaneId()};
|
|
index = ir.IAdd(index, thread_id);
|
|
}
|
|
// offset = (inst_offen ? vgpr_offset : 0) + inst_offset
|
|
IR::U32 offset = ir.Imm32(inst_offset);
|
|
offset = ir.IAdd(offset, soffset);
|
|
if (inst_info.voffset_enable) {
|
|
const IR::U32 voffset = IR::GetBufferVOffsetArg(&inst);
|
|
offset = ir.IAdd(offset, voffset);
|
|
}
|
|
const IR::U32 const_stride = ir.Imm32(stride);
|
|
IR::U32 buffer_offset;
|
|
if (buffer.swizzle_enable) {
|
|
const IR::U32 const_index_stride = ir.Imm32(buffer.GetIndexStride());
|
|
const IR::U32 const_element_size = ir.Imm32(buffer.GetElementSize());
|
|
// index_msb = index / const_index_stride
|
|
const IR::U32 index_msb{ir.IDiv(index, const_index_stride)};
|
|
// index_lsb = index % const_index_stride
|
|
const IR::U32 index_lsb{ir.IMod(index, const_index_stride)};
|
|
// offset_msb = offset / const_element_size
|
|
const IR::U32 offset_msb{ir.IDiv(offset, const_element_size)};
|
|
// offset_lsb = offset % const_element_size
|
|
const IR::U32 offset_lsb{ir.IMod(offset, const_element_size)};
|
|
// buffer_offset =
|
|
// (index_msb * const_stride + offset_msb * const_element_size) *
|
|
// const_index_stride
|
|
// + index_lsb * const_element_size + offset_lsb
|
|
const IR::U32 buffer_offset_msb = ir.IMul(
|
|
ir.IAdd(ir.IMul(index_msb, const_stride), ir.IMul(offset_msb, const_element_size)),
|
|
const_index_stride);
|
|
const IR::U32 buffer_offset_lsb =
|
|
ir.IAdd(ir.IMul(index_lsb, const_element_size), offset_lsb);
|
|
buffer_offset = ir.IAdd(buffer_offset_msb, buffer_offset_lsb);
|
|
} else {
|
|
// buffer_offset = index * const_stride + offset
|
|
buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
|
|
}
|
|
if (shift != 0) {
|
|
buffer_offset = ir.ShiftRightLogical(buffer_offset, ir.Imm32(shift));
|
|
}
|
|
return buffer_offset;
|
|
}
|
|
|
|
void PatchBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
|
|
const auto handle = inst.Arg(0);
|
|
const auto buffer_res = info.buffers[handle.U32()];
|
|
const auto buffer = AmdGpu::Buffer::Null();
|
|
|
|
// Address of constant buffer reads can be calculated at IR emission time.
|
|
if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer) {
|
|
return;
|
|
}
|
|
|
|
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
|
|
inst.SetArg(IR::LoadBufferArgs::Address,
|
|
CalculateBufferAddress(ir, inst, info, buffer, buffer.stride));
|
|
}
|
|
|
|
void ResourceTrackingPassStub(IR::Program& program, const Profile& profile) {
|
|
// Iterate resource instructions and patch them after finding the sharp.
|
|
auto& info = program.info;
|
|
|
|
// Pass 1: Track resource sharps
|
|
Descriptors descriptors{info};
|
|
for (IR::Block* const block : program.blocks) {
|
|
for (IR::Inst& inst : block->Instructions()) {
|
|
if (IsBufferInstruction(inst)) {
|
|
PatchBufferSharp(*block, inst, info, descriptors, profile);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Pass 2: Patch instruction args
|
|
for (IR::Block* const block : program.blocks) {
|
|
for (IR::Inst& inst : block->Instructions()) {
|
|
if (IsBufferInstruction(inst)) {
|
|
PatchBufferArgs(*block, inst, info);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace Shader::Optimization
|