shader_recompiler: VS clip distance emulation for NVIDIA GPUs (#3958)
Some checks are pending
Build and Release / reuse (push) Waiting to run
Build and Release / clang-format (push) Waiting to run
Build and Release / get-info (push) Waiting to run
Build and Release / windows-sdl (push) Blocked by required conditions
Build and Release / macos-sdl (push) Blocked by required conditions
Build and Release / linux-sdl (push) Blocked by required conditions
Build and Release / linux-sdl-gcc (push) Blocked by required conditions
Build and Release / pre-release (push) Blocked by required conditions

This commit is contained in:
psucien 2026-01-26 21:17:51 +01:00 committed by GitHub
parent fa497f6bfd
commit 1e99c4b506
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 116 additions and 24 deletions

View File

@ -916,6 +916,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/profile.h
src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp
src/shader_recompiler/ir/passes/hull_shader_transform.cpp src/shader_recompiler/ir/passes/hull_shader_transform.cpp
src/shader_recompiler/ir/passes/identity_removal_pass.cpp src/shader_recompiler/ir/passes/identity_removal_pass.cpp
src/shader_recompiler/ir/passes/inject_clip_distance_attributes.cpp
src/shader_recompiler/ir/passes/ir_passes.h src/shader_recompiler/ir/passes/ir_passes.h
src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
src/shader_recompiler/ir/passes/lower_fp64_to_fp32.cpp src/shader_recompiler/ir/passes/lower_fp64_to_fp32.cpp

View File

@ -204,6 +204,7 @@ add_subdirectory(tracy)
# pugixml # pugixml
if (NOT TARGET pugixml::pugixml) if (NOT TARGET pugixml::pugixml)
option(PUGIXML_NO_EXCEPTIONS "" ON)
add_subdirectory(pugixml) add_subdirectory(pugixml)
endif() endif()

View File

@ -364,7 +364,7 @@ void EmitContext::DefineInputs() {
} }
break; break;
} }
case LogicalStage::Fragment: case LogicalStage::Fragment: {
if (info.loads.GetAny(IR::Attribute::FragCoord)) { if (info.loads.GetAny(IR::Attribute::FragCoord)) {
frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input); frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
} }
@ -418,7 +418,13 @@ void EmitContext::DefineInputs() {
spv::StorageClass::Input); spv::StorageClass::Input);
} }
} }
for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) {
const bool has_clip_distance_inputs = runtime_info.fs_info.clip_distance_emulation;
// The clip distances attribute vector is the last entry in the inputs array
const auto num_inputs =
runtime_info.fs_info.num_inputs - (has_clip_distance_inputs ? 1 : 0);
for (s32 i = 0; i < num_inputs; i++) {
const auto& input = runtime_info.fs_info.inputs[i]; const auto& input = runtime_info.fs_info.inputs[i];
if (input.IsDefault()) { if (input.IsDefault()) {
continue; continue;
@ -428,12 +434,13 @@ void EmitContext::DefineInputs() {
const auto [primary, auxiliary] = info.fs_interpolation[i]; const auto [primary, auxiliary] = info.fs_interpolation[i];
const Id type = F32[num_components]; const Id type = F32[num_components];
const Id attr_id = [&] { const Id attr_id = [&] {
const auto bind_location = input.param_index + (has_clip_distance_inputs ? 1 : 0);
if (primary == Qualifier::PerVertex && if (primary == Qualifier::PerVertex &&
profile.supports_fragment_shader_barycentric) { profile.supports_fragment_shader_barycentric) {
return Name(DefineInput(TypeArray(type, ConstU32(3U)), input.param_index), return Name(DefineInput(TypeArray(type, ConstU32(3U)), bind_location),
fmt::format("fs_in_attr{}_p", i)); fmt::format("fs_in_attr{}_p", i));
} }
return Name(DefineInput(type, input.param_index), fmt::format("fs_in_attr{}", i)); return Name(DefineInput(type, bind_location), fmt::format("fs_in_attr{}", i));
}(); }();
if (primary == Qualifier::PerVertex) { if (primary == Qualifier::PerVertex) {
Decorate(attr_id, profile.supports_amd_shader_explicit_vertex_parameter Decorate(attr_id, profile.supports_amd_shader_explicit_vertex_parameter
@ -450,7 +457,15 @@ void EmitContext::DefineInputs() {
input_params[i] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id, num_components, input_params[i] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id, num_components,
false, false, primary == Qualifier::PerVertex); false, false, primary == Qualifier::PerVertex);
} }
if (has_clip_distance_inputs) {
const auto type = F32[MaxEmulatedClipDistances];
const auto attr_id = Name(DefineInput(type, 0), fmt::format("cldist_attr{}", 0));
input_params[num_inputs] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id,
MaxEmulatedClipDistances, false);
}
break; break;
}
case LogicalStage::Compute: case LogicalStage::Compute:
if (info.loads.GetAny(IR::Attribute::WorkgroupIndex) || if (info.loads.GetAny(IR::Attribute::WorkgroupIndex) ||
info.loads.GetAny(IR::Attribute::WorkgroupId)) { info.loads.GetAny(IR::Attribute::WorkgroupId)) {
@ -546,11 +561,16 @@ void EmitContext::DefineVertexBlock() {
const std::array<Id, 8> zero{f32_zero_value, f32_zero_value, f32_zero_value, f32_zero_value, const std::array<Id, 8> zero{f32_zero_value, f32_zero_value, f32_zero_value, f32_zero_value,
f32_zero_value, f32_zero_value, f32_zero_value, f32_zero_value}; f32_zero_value, f32_zero_value, f32_zero_value, f32_zero_value};
output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output);
if (info.stores.GetAny(IR::Attribute::ClipDistance)) { const bool needs_clip_distance_emulation = l_stage == LogicalStage::Vertex &&
const Id type{TypeArray(F32[1], ConstU32(8U))}; stage == Stage::Vertex &&
const Id initializer{ConstantComposite(type, zero)}; profile.needs_clip_distance_emulation;
clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output, if (!needs_clip_distance_emulation) {
initializer); if (info.stores.GetAny(IR::Attribute::ClipDistance)) {
const Id type{TypeArray(F32[1], ConstU32(8U))};
const Id initializer{ConstantComposite(type, zero)};
clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance,
spv::StorageClass::Output, initializer);
}
} }
if (info.stores.GetAny(IR::Attribute::CullDistance)) { if (info.stores.GetAny(IR::Attribute::CullDistance)) {
const Id type{TypeArray(F32[1], ConstU32(8U))}; const Id type{TypeArray(F32[1], ConstU32(8U))};
@ -583,16 +603,27 @@ void EmitContext::DefineOutputs() {
Name(output_attr_array, "out_attrs"); Name(output_attr_array, "out_attrs");
} }
} else { } else {
const auto has_clip_distance_outputs = info.stores.GetAny(IR::Attribute::ClipDistance);
u32 num_attrs = 0u;
for (u32 i = 0; i < IR::NumParams; i++) { for (u32 i = 0; i < IR::NumParams; i++) {
const IR::Attribute param{IR::Attribute::Param0 + i}; const IR::Attribute param{IR::Attribute::Param0 + i};
if (!info.stores.GetAny(param)) { if (!info.stores.GetAny(param)) {
continue; continue;
} }
const u32 num_components = info.stores.NumComponents(param); const u32 num_components = info.stores.NumComponents(param);
const Id id{DefineOutput(F32[num_components], i)}; const Id id{
DefineOutput(F32[num_components], i + (has_clip_distance_outputs ? 1 : 0))};
Name(id, fmt::format("out_attr{}", i)); Name(id, fmt::format("out_attr{}", i));
output_params[i] = output_params[i] =
GetAttributeInfo(AmdGpu::NumberFormat::Float, id, num_components, true); GetAttributeInfo(AmdGpu::NumberFormat::Float, id, num_components, true);
++num_attrs;
}
if (has_clip_distance_outputs) {
clip_distances = Id{DefineOutput(F32[MaxEmulatedClipDistances], 0)};
output_params[num_attrs] = GetAttributeInfo(
AmdGpu::NumberFormat::Float, clip_distances, MaxEmulatedClipDistances, true);
Name(clip_distances, fmt::format("cldist_attr{}", 0));
} }
} }
break; break;

View File

@ -101,7 +101,7 @@ std::string NameOf(Attribute attribute) {
case Attribute::Param31: case Attribute::Param31:
return "Param31"; return "Param31";
case Attribute::ClipDistance: case Attribute::ClipDistance:
return "ClipDistanace"; return "ClipDistance";
case Attribute::CullDistance: case Attribute::CullDistance:
return "CullDistance"; return "CullDistance";
case Attribute::RenderTargetIndex: case Attribute::RenderTargetIndex:

View File

@ -0,0 +1,41 @@
// SPDX-FileCopyrightText: Copyright 2026 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/ir_emitter.h"
#include "shader_recompiler/ir/program.h"
namespace Shader {
/// Injects clip distance emulation code into a fragment shader program.
///
/// When the program is a fragment shader and `runtime_info.fs_info.clip_distance_emulation` is
/// set, this reads MaxEmulatedClipDistances components from one extra input attribute located
/// right after the existing fragment inputs and emits a conditional discard for each component
/// that is less than zero. The extra attribute is then accounted for by incrementing
/// `fs_info.num_inputs`. In every other case the program and runtime info are left untouched.
///
/// @param program       Translated program; the new instructions go into its first block.
/// @param runtime_info  Runtime info of the shader being translated; `fs_info.num_inputs` is
///                      incremented when the emulation code is injected.
void InjectClipDistanceAttributes(IR::Program& program, RuntimeInfo& runtime_info) {
    // Test the stage first so that fs_info is only inspected for fragment shaders.
    if (program.info.l_stage != LogicalStage::Fragment) {
        return;
    }
    auto& info = runtime_info.fs_info;
    if (!info.clip_distance_emulation) {
        return;
    }

    // The clip distances claim the slot right after the current inputs, so one must be free.
    // Without this check a shader already using every input slot would address an attribute
    // past the parameter range.
    ASSERT(info.num_inputs < info.inputs.size());

    ASSERT(program.blocks.begin() != program.blocks.end());
    auto* first_block = *program.blocks.begin();
    auto it = std::ranges::find_if(first_block->Instructions(), [](const IR::Inst& inst) {
        return inst.GetOpcode() == IR::Opcode::Prologue;
    });
    // The insertion point is two instructions past the Prologue; both the Prologue and the
    // instruction that follows it must exist.
    // NOTE(review): which instruction is expected right after Prologue is not visible here.
    ASSERT(it != first_block->end());
    ++it;
    ASSERT(it != first_block->end());
    ++it;
    IR::IREmitter ir{*first_block, it};

    // We don't know how many clip distances the VS exports because it has not been processed
    // yet at this point. We assume there are no more than 4 of them (the maximum is 8), which
    // saves one attribute export slot.
    const auto attrib = IR::Attribute::Param0 + info.num_inputs;
    for (u32 comp = 0; comp < MaxEmulatedClipDistances; ++comp) {
        const auto attr_read = ir.GetAttribute(attrib, comp);
        // Discard the fragment when this clip distance component is negative.
        const auto cond_id = ir.FPLessThan(attr_read, ir.Imm32(0.0f));
        ir.Discard(cond_id);
    }
    // Account for the extra input attribute that now carries the clip distances.
    ++info.num_inputs;
}
} // namespace Shader

View File

@ -8,7 +8,8 @@
namespace Shader { namespace Shader {
struct Profile; struct Profile;
} void InjectClipDistanceAttributes(IR::Program& program, RuntimeInfo& runtime_info);
} // namespace Shader
namespace Shader::Optimization { namespace Shader::Optimization {

View File

@ -41,7 +41,7 @@ struct Profile {
bool needs_lds_barriers{}; bool needs_lds_barriers{};
bool needs_buffer_offsets{}; bool needs_buffer_offsets{};
bool needs_unorm_fixup{}; bool needs_unorm_fixup{};
bool _pad0{}; bool needs_clip_distance_emulation{};
}; };
} // namespace Shader } // namespace Shader

View File

@ -13,17 +13,16 @@ namespace Shader {
IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) { IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) {
size_t num_syntax_blocks{}; size_t num_syntax_blocks{};
for (const auto& node : syntax_list) { for (const auto& [_, type] : syntax_list) {
if (node.type == IR::AbstractSyntaxNode::Type::Block) { if (type == IR::AbstractSyntaxNode::Type::Block) {
++num_syntax_blocks; ++num_syntax_blocks;
} }
} }
IR::BlockList blocks; IR::BlockList blocks{};
blocks.reserve(num_syntax_blocks); blocks.reserve(num_syntax_blocks);
u32 order_index{}; for (const auto& [data, type] : syntax_list) {
for (const auto& node : syntax_list) { if (type == IR::AbstractSyntaxNode::Type::Block) {
if (node.type == IR::AbstractSyntaxNode::Type::Block) { blocks.push_back(data.block);
blocks.push_back(node.data.block);
} }
} }
return blocks; return blocks;
@ -60,6 +59,10 @@ IR::Program TranslateProgram(const std::span<const u32>& code, Pools& pools, Inf
program.blocks = GenerateBlocks(program.syntax_list); program.blocks = GenerateBlocks(program.syntax_list);
program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front()); program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front());
// On NVIDIA GPUs HW interpolation of clip distance values seems broken, and we need to emulate
// it with expensive discard in PS.
Shader::InjectClipDistanceAttributes(program, runtime_info);
// Run optimization passes // Run optimization passes
if (!profile.support_float64) { if (!profile.support_float64) {
Shader::Optimization::LowerFp64ToFp32(program); Shader::Optimization::LowerFp64ToFp32(program);

View File

@ -34,6 +34,7 @@ enum class LogicalStage : u32 {
}; };
constexpr u32 MaxStageTypes = static_cast<u32>(LogicalStage::NumLogicalStages); constexpr u32 MaxStageTypes = static_cast<u32>(LogicalStage::NumLogicalStages);
constexpr auto MaxEmulatedClipDistances = 4u;
constexpr Stage StageFromIndex(size_t index) noexcept { constexpr Stage StageFromIndex(size_t index) noexcept {
return static_cast<Stage>(index); return static_cast<Stage>(index);
@ -201,14 +202,16 @@ struct FragmentRuntimeInfo {
std::array<PsInput, 32> inputs; std::array<PsInput, 32> inputs;
std::array<PsColorBuffer, MaxColorBuffers> color_buffers; std::array<PsColorBuffer, MaxColorBuffers> color_buffers;
AmdGpu::ShaderExportFormat z_export_format; AmdGpu::ShaderExportFormat z_export_format;
u8 mrtz_mask; u8 mrtz_mask{};
bool dual_source_blending; bool dual_source_blending{false};
bool clip_distance_emulation{false};
bool operator==(const FragmentRuntimeInfo& other) const noexcept { bool operator==(const FragmentRuntimeInfo& other) const noexcept {
return std::ranges::equal(color_buffers, other.color_buffers) && return std::ranges::equal(color_buffers, other.color_buffers) &&
en_flags == other.en_flags && addr_flags == other.addr_flags && en_flags == other.en_flags && addr_flags == other.addr_flags &&
num_inputs == other.num_inputs && z_export_format == other.z_export_format && num_inputs == other.num_inputs && z_export_format == other.z_export_format &&
mrtz_mask == other.mrtz_mask && dual_source_blending == other.dual_source_blending && mrtz_mask == other.mrtz_mask && dual_source_blending == other.dual_source_blending &&
clip_distance_emulation == other.clip_distance_emulation &&
std::ranges::equal(inputs.begin(), inputs.begin() + num_inputs, other.inputs.begin(), std::ranges::equal(inputs.begin(), inputs.begin() + num_inputs, other.inputs.begin(),
other.inputs.begin() + num_inputs); other.inputs.begin() + num_inputs);
} }

View File

@ -101,7 +101,7 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
switch (stage) { switch (stage) {
case Stage::Local: { case Stage::Local: {
BuildCommon(regs.ls_program); BuildCommon(regs.ls_program);
Shader::TessellationDataConstantBuffer tess_constants; Shader::TessellationDataConstantBuffer tess_constants{};
const auto* hull_info = infos[u32(Shader::LogicalStage::TessellationControl)]; const auto* hull_info = infos[u32(Shader::LogicalStage::TessellationControl)];
hull_info->ReadTessConstantBuffer(tess_constants); hull_info->ReadTessConstantBuffer(tess_constants);
info.ls_info.ls_stride = tess_constants.ls_stride; info.ls_info.ls_stride = tess_constants.ls_stride;
@ -199,6 +199,10 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
for (u32 i = 0; i < Shader::MaxColorBuffers; i++) { for (u32 i = 0; i < Shader::MaxColorBuffers; i++) {
info.fs_info.color_buffers[i] = graphics_key.color_buffers[i]; info.fs_info.color_buffers[i] = graphics_key.color_buffers[i];
} }
info.fs_info.clip_distance_emulation =
regs.vs_output_control.clip_distance_enable &&
!regs.stage_enable.IsStageEnabled(static_cast<u32>(Stage::Local)) &&
profile.needs_clip_distance_emulation;
break; break;
} }
case Stage::Compute: { case Stage::Compute: {
@ -266,6 +270,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
instance.GetDriverID() == vk::DriverId::eMoltenvk, instance.GetDriverID() == vk::DriverId::eMoltenvk,
.needs_buffer_offsets = instance.StorageMinAlignment() > 4, .needs_buffer_offsets = instance.StorageMinAlignment() > 4,
.needs_unorm_fixup = instance.GetDriverID() == vk::DriverId::eMoltenvk, .needs_unorm_fixup = instance.GetDriverID() == vk::DriverId::eMoltenvk,
.needs_clip_distance_emulation = instance.GetDriverID() == vk::DriverId::eNvidiaProprietary,
}; };
WarmUp(); WarmUp();
@ -460,7 +465,13 @@ bool PipelineCache::RefreshGraphicsStages() {
infos.fill(nullptr); infos.fill(nullptr);
modules.fill(nullptr); modules.fill(nullptr);
bind_stage(Stage::Fragment, LogicalStage::Fragment); const auto result = bind_stage(Stage::Fragment, LogicalStage::Fragment);
if (!result && regs.vs_output_control.clip_distance_enable &&
profile.needs_clip_distance_emulation) {
// TODO: need to implement a discard only fallback shader
LOG_WARNING(Render_Vulkan,
"Clip distance emulation is ineffective due to absense of fragment shader");
}
const auto* fs_info = infos[static_cast<u32>(LogicalStage::Fragment)]; const auto* fs_info = infos[static_cast<u32>(LogicalStage::Fragment)];
key.mrt_mask = fs_info ? fs_info->mrt_mask : 0u; key.mrt_mask = fs_info ? fs_info->mrt_mask : 0u;