diff --git a/rpcs3/CMakeLists.txt b/rpcs3/CMakeLists.txt index 796351e16c..02a6bfeed4 100644 --- a/rpcs3/CMakeLists.txt +++ b/rpcs3/CMakeLists.txt @@ -193,6 +193,7 @@ if(BUILD_RPCS3_TESTS) tests/test_simple_array.cpp tests/test_address_range.cpp tests/test_rsx_cfg.cpp + tests/test_rsx_fp_asm.cpp ) target_link_libraries(rpcs3_test diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index 5feef8f8ca..057c7a5489 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -518,12 +518,15 @@ target_sources(rpcs3_emu PRIVATE RSX/Overlays/overlay_video.cpp RSX/Overlays/Shaders/shader_loading_dialog.cpp RSX/Overlays/Shaders/shader_loading_dialog_native.cpp + RSX/Program/Assembler/FPASM.cpp + RSX/Program/Assembler/FPOpcodes.cpp RSX/Program/Assembler/FPToCFG.cpp + RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp + RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp RSX/Program/CgBinaryProgram.cpp RSX/Program/CgBinaryFragmentProgram.cpp RSX/Program/CgBinaryVertexProgram.cpp RSX/Program/FragmentProgramDecompiler.cpp - RSX/Program/FragmentProgramRegister.cpp RSX/Program/GLSLCommon.cpp RSX/Program/ProgramStateCache.cpp RSX/Program/program_util.cpp diff --git a/rpcs3/Emu/RSX/Common/simple_array.hpp b/rpcs3/Emu/RSX/Common/simple_array.hpp index 00dd6e7d95..6852e670fb 100644 --- a/rpcs3/Emu/RSX/Common/simple_array.hpp +++ b/rpcs3/Emu/RSX/Common/simple_array.hpp @@ -337,7 +337,7 @@ namespace rsx AUDIT(_loc < _size); const auto remaining = (_size - _loc); - memmove(pos + 1, pos, remaining * sizeof(Ty)); + std::memmove(pos + 1, pos, remaining * sizeof(Ty)); *pos = val; _size++; @@ -365,7 +365,7 @@ namespace rsx AUDIT(_loc < _size); const u32 remaining = (_size - _loc); - memmove(pos + 1, pos, remaining * sizeof(Ty)); + std::memmove(pos + 1, pos, remaining * sizeof(Ty)); *pos = val; _size++; @@ -373,6 +373,31 @@ namespace rsx return pos; } + iterator insert(iterator where, span_like auto const& values) + { + ensure(where >= _data); + const auto 
_loc = offset(where); + const auto in_size = static_cast(values.size()); + const auto in_size_bytes = in_size * sizeof(Ty); + + reserve(_size + in_size); + + if (_loc >= _size) + { + where = _data + _size; + std::memcpy(where, values.data(), in_size_bytes); + _size += in_size; + return where; + } + + const u32 remaining_bytes = (_size - _loc) * sizeof(Ty); + where = _data + _loc; + std::memmove(where + in_size, where, remaining_bytes); + std::memmove(where, values.data(), in_size_bytes); + _size += in_size; + return where; + } + void operator += (const rsx::simple_array& that) { const auto old_size = _size; diff --git a/rpcs3/Emu/RSX/Program/Assembler/CFG.h b/rpcs3/Emu/RSX/Program/Assembler/CFG.h index 9bc44a22d1..818bc2a018 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/CFG.h +++ b/rpcs3/Emu/RSX/Program/Assembler/CFG.h @@ -34,6 +34,11 @@ namespace rsx::assembler } }; + struct CFGPass + { + virtual void run(FlowGraph& graph) = 0; + }; + FlowGraph deconstruct_fragment_program(const RSXFragmentProgram& prog); } diff --git a/rpcs3/Emu/RSX/Program/Assembler/FPASM.cpp b/rpcs3/Emu/RSX/Program/Assembler/FPASM.cpp new file mode 100644 index 0000000000..2d74fafc73 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/Assembler/FPASM.cpp @@ -0,0 +1,455 @@ +#include "stdafx.h" +#include "FPASM.h" +#include "Emu/RSX/Program/RSXFragmentProgram.h" + +#include + +#ifndef _WIN32 +#define sscanf_s sscanf +#endif + +namespace rsx::assembler +{ + struct FP_opcode_encoding_t + { + FP_opcode op; + bool exec_if_lt; + bool exec_if_eq; + bool exec_if_gt; + bool set_cond; + }; + + static std::unordered_map s_opcode_lookup + { + // Arithmetic + { "NOP", { .op = RSX_FP_OPCODE_NOP, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "MOV", { .op = RSX_FP_OPCODE_MOV, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "MUL", { .op = RSX_FP_OPCODE_MUL, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { 
"ADD", { .op = RSX_FP_OPCODE_ADD, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "MAD", { .op = RSX_FP_OPCODE_MAD, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "FMA", { .op = RSX_FP_OPCODE_MAD, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "DP3", { .op = RSX_FP_OPCODE_DP3, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "DP4", { .op = RSX_FP_OPCODE_DP4, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + + // Constant load + { "SFL", {.op = RSX_FP_OPCODE_SFL, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "STR", {.op = RSX_FP_OPCODE_STR, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + + // Pack-unpack operations are great for testing dependencies + { "PKH", { .op = RSX_FP_OPCODE_PK2, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "UPH", { .op = RSX_FP_OPCODE_UP2, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "PK16U", { .op = RSX_FP_OPCODE_PK16, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "UP16U", { .op = RSX_FP_OPCODE_UP16, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "PK8U", { .op = RSX_FP_OPCODE_PKB, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "UP8U", { .op = RSX_FP_OPCODE_UPB, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "PK8G", { .op = RSX_FP_OPCODE_PKG, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "UP8G", { .op = RSX_FP_OPCODE_UPG, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "PK8S", { .op = RSX_FP_OPCODE_PK4, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = 
true, .set_cond = false } }, + { "UP8S", { .op = RSX_FP_OPCODE_UP4, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + + // Basic conditionals + { "IF.LT", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = true, .exec_if_eq = false, .exec_if_gt = false, .set_cond = false } }, + { "IF.LE", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = false, .set_cond = false } }, + { "IF.EQ", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = false, .exec_if_eq = true, .exec_if_gt = false, .set_cond = false } }, + { "IF.GE", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = false, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } }, + { "IF.GT", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = false, .exec_if_eq = false, .exec_if_gt = true, .set_cond = false } }, + + { "SLT", { .op = RSX_FP_OPCODE_SLT, .exec_if_lt = false, .exec_if_eq = false, .exec_if_gt = false, .set_cond = true } }, + { "SEQ", { .op = RSX_FP_OPCODE_SEQ, .exec_if_lt = false, .exec_if_eq = false, .exec_if_gt = false, .set_cond = true } }, + { "SGT", { .op = RSX_FP_OPCODE_SGT, .exec_if_lt = false, .exec_if_eq = false, .exec_if_gt = false, .set_cond = true } }, + + // TODO: Add more + + }; + + Instruction* FPIR::load(const RegisterRef& ref, int operand, Instruction* prev) + { + Instruction* target = prev; + if (!target) + { + m_instructions.push_back({}); + target = &m_instructions.back(); + } + + SRC_Common src{ .HEX = target->bytecode[operand + 1] }; + src.reg_type = RSX_FP_REGISTER_TYPE_TEMP; + src.fp16 = ref.reg.f16 ? 
1 : 0; + src.tmp_reg_index = static_cast(ref.reg.id); + + src.swizzle_x = 0; + src.swizzle_y = 1; + src.swizzle_z = 2; + src.swizzle_w = 3; + + target->bytecode[operand + 1] = src.HEX; + return target; + } + + Instruction* FPIR::load(const std::array& constants, int operand, Instruction* prev) + { + Instruction* target = prev; + if (!target) + { + m_instructions.push_back({}); + target = &m_instructions.back(); + } + + // Unsupported for now + ensure(target->length == 4, "FPIR cannot encode more than one constant load per instruction"); + + SRC_Common src{ .HEX = target->bytecode[operand + 1] }; + src.reg_type = RSX_FP_REGISTER_TYPE_CONSTANT; + target->bytecode[operand + 1] = src.HEX; + + src.swizzle_x = 0; + src.swizzle_y = 1; + src.swizzle_z = 2; + src.swizzle_w = 3; + + // Embed literal constant + std::memcpy(&target->bytecode[4], constants.data(), 4 * sizeof(u32)); + target->length = 8; + return target; + } + + Instruction* FPIR::store(const RegisterRef& ref, Instruction* prev) + { + Instruction* target = prev; + if (!target) + { + m_instructions.push_back({}); + target = &m_instructions.back(); + } + + OPDEST dst{ .HEX = target->bytecode[0] }; + dst.dest_reg = static_cast(ref.reg.id); + dst.fp16 = ref.reg.f16 ? 1 : 0; + dst.write_mask = ref.mask; + dst.prec = ref.reg.f16 ? 
RSX_FP_PRECISION_HALF : RSX_FP_PRECISION_REAL; + + target->bytecode[0] = dst.HEX; + return target; + } + + void FPIR::mov(const RegisterRef& dst, f32 constant) + { + Instruction* inst = store(dst); + inst = load(std::array{ constant, constant, constant, constant }, 0); + inst->opcode = RSX_FP_OPCODE_MOV; + } + + void FPIR::mov(const RegisterRef& dst, const RegisterRef& src) + { + Instruction* inst = store(dst); + inst = load(src, 0); + inst->opcode = RSX_FP_OPCODE_MOV; + } + + void FPIR::add(const RegisterRef& dst, const std::array& constants) + { + Instruction* inst = store(dst); + inst = load(constants, 0); + inst->opcode = RSX_FP_OPCODE_ADD; + } + + void FPIR::add(const RegisterRef& dst, const RegisterRef& src) + { + Instruction* inst = store(dst); + inst = load(src, 0); + inst->opcode = RSX_FP_OPCODE_ADD; + } + + const std::vector& FPIR::instructions() const + { + return m_instructions; + } + + std::vector FPIR::compile() const + { + std::vector result; + result.reserve(m_instructions.size() * 4); + + for (const auto& inst : m_instructions) + { + const auto src = reinterpret_cast*>(inst.bytecode); + for (u32 j = 0; j < inst.length; ++j) + { + const u16 low = src[j * 2]; + const u16 hi = src[j * 2 + 1]; + const u32 word = static_cast(low) | (static_cast(hi) << 16u); + result.push_back(word); + } + } + + return result; + } + + FPIR FPIR::from_source(std::string_view asm_) + { + std::vector instructions = fmt::split(asm_, { "\n", ";" }); + if (instructions.empty()) + { + return {}; + } + + auto transform_inst = [](std::string_view s) + { + std::string result; + result.reserve(s.size()); + + bool literal = false; + for (const auto& c : s) + { + if (c == ' ') + { + if (!literal && !result.empty() && result.back() != ',') + { + result += ','; // Replace token separator space with comma + } + continue; + } + + if (std::isspace(c)) + { + continue; + } + + if (!literal && c == '{') + { + literal = true; + } + + if (literal && c == '}') + { + literal = false; + } + + if 
(c == ',') + { + result += (literal ? '|' : ','); + continue; + } + + result += c; + } + return result; + }; + + auto decode_instruction = [&](std::string_view inst, std::string& op, std::string& dst, std::vector& sources) + { + const auto i = transform_inst(inst); + if (i.empty()) + { + return; + } + + const auto tokens = fmt::split(i, { "," }); + ensure(!tokens.empty(), "Invalid input"); + + op = tokens.front(); + + if (tokens.size() > 1) + { + dst = tokens[1]; + } + + for (size_t n = 2; n < tokens.size(); ++n) + { + sources.push_back(tokens[n]); + } + }; + + auto get_ref = [](std::string_view reg) + { + ensure(reg.length() > 1, "Invalid register specifier"); + + const auto parts = fmt::split(reg, { "." }); + ensure(parts.size() > 0 && parts.size() <= 2); + + const auto index = std::stoi(parts[0].substr(1)); + RegisterRef ref + { + .reg { .id = index, .f16 = false }, + .mask = 0x0F + }; + + if (parts.size() > 1 && parts[1].length() > 0) + { + // FIXME: No swizzles for now, just lane masking + ref.mask = 0; + if (parts[1].find("x") != std::string::npos) ref.mask |= (1u << 0); + if (parts[1].find("y") != std::string::npos) ref.mask |= (1u << 1); + if (parts[1].find("z") != std::string::npos) ref.mask |= (1u << 2); + if (parts[1].find("w") != std::string::npos) ref.mask |= (1u << 3); + } + + if (reg[0] == 'H' || reg[0] == 'h') + { + ref.reg.f16 = true; + } + + return ref; + }; + + auto get_constants = [](std::string_view reg) -> std::array + { + float x, y, z, w; + if (sscanf_s(reg.data(), "#{%f|%f|%f|%f}", &x, &y, &z, &w) == 4) + { + return { x, y, z, w }; + } + + if (sscanf_s(reg.data(), "#{%f}", &x) == 1) + { + return { x, x, x, x }; + } + + fmt::throw_exception("Invalid constant literal"); + }; + + auto encode_branch_else = [](Instruction* inst, u32 end) + { + SRC1 src1{ .HEX = inst->bytecode[2] }; + src1.else_offset = static_cast(end); + inst->bytecode[2] = src1.HEX; + }; + + auto encode_branch_end = [](Instruction *inst, u32 end) + { + SRC2 src2 { .HEX = 
inst->bytecode[3] }; + src2.end_offset = static_cast(end); + inst->bytecode[3] = src2.HEX; + + SRC1 src1{ .HEX = inst->bytecode[2] }; + if (!src1.else_offset) + { + src1.else_offset = static_cast(end); + inst->bytecode[2] = src1.HEX; + } + }; + + auto encode_opcode = [](std::string_view op, Instruction* inst) + { + OPDEST d0 { .HEX = inst->bytecode[0] }; + SRC0 s0 { .HEX = inst->bytecode[1] }; + SRC1 s1 { .HEX = inst->bytecode[2] }; + + const auto found = s_opcode_lookup.find(op); + if (found == s_opcode_lookup.end()) + { + fmt::throw_exception("Unhandled instruction '%s'", op); + } + const auto& encoding = found->second; + + inst->opcode = encoding.op; + d0.opcode = encoding.op & 0x3F; + s1.opcode_hi = (encoding.op > 0x3F)? 1 : 0; + s0.exec_if_eq = encoding.exec_if_eq ? 1 : 0; + s0.exec_if_gr = encoding.exec_if_gt ? 1 : 0; + s0.exec_if_lt = encoding.exec_if_lt ? 1 : 0; + d0.set_cond = encoding.set_cond ? 1 : 0; + inst->bytecode[0] = d0.HEX; + inst->bytecode[1] = s0.HEX; + inst->bytecode[2] = s1.HEX; + }; + + std::string op, dst; + std::vector sources; + + std::stack if_ops; + std::stack loop_ops; + u32 pc = 0; + + FPIR ir{}; + + for (const auto& instruction : instructions) + { + op.clear(); + dst.clear(); + sources.clear(); + decode_instruction(instruction, op, dst, sources); + + if (op.empty()) + { + continue; + } + + if (op.starts_with("IF.")) + { + if_ops.push(ir.m_instructions.size()); + } + else if (op == "LOOP") + { + loop_ops.push(ir.m_instructions.size()); + } + else if (op == "ELSE") + { + ensure(!if_ops.empty()); + encode_branch_else(&ir.m_instructions[if_ops.top()], pc); + continue; + } + else if (op == "ENDIF") + { + ensure(!if_ops.empty()); + encode_branch_end(&ir.m_instructions[if_ops.top()], pc); + if_ops.pop(); + continue; + } + else if (op == "ENDLOOP") + { + ensure(!loop_ops.empty()); + encode_branch_end(&ir.m_instructions[loop_ops.top()], pc); + loop_ops.pop(); + continue; + } + + ir.m_instructions.push_back({}); + Instruction* target = 
// Container and builder for a sequence of fragment program IR instructions.
// Instructions can be appended through the helper methods or produced from
// textual assembly with from_source().
class FPIR
{
public:
	// Append a MOV whose source is `constant` replicated into all four lanes.
	void mov(const RegisterRef& dst, f32 constant);
	// Append a MOV from register `src` to register `dst`.
	void mov(const RegisterRef& dst, const RegisterRef& src);

	// Append an ADD whose operand 0 is the given literal constant vector.
	void add(const RegisterRef& dst, const std::array<f32, 4>& constants);
	// Append an ADD whose operand 0 is register `src`.
	void add(const RegisterRef& dst, const RegisterRef& src);

	// Read-only access to the instructions collected so far.
	const std::vector<Instruction>& instructions() const;
	// Flattens the bytecode words of all instructions into one buffer,
	// emitting `length` words per instruction.
	std::vector<u32> compile() const;

	// Builds an FPIR from assembly text. Statements are separated by
	// newlines or ';'. An unknown mnemonic raises an exception, and the last
	// emitted instruction is flagged as the program end.
	static FPIR from_source(std::string_view asm_);

private:
	// Encode `reg` as source operand number `operand`. When `target` is null
	// a new instruction is appended and used. Returns the instruction written.
	Instruction* load(const RegisterRef& reg, int operand, Instruction* target = nullptr);
	// Encode a literal constant as source operand number `operand`; the
	// literal is embedded after the instruction words (length becomes 8).
	Instruction* load(const std::array<f32, 4>& constants, int operand, Instruction* target = nullptr);
	// Encode `reg` (index, precision, write mask) as the destination. When
	// `target` is null a new instruction is appended and used.
	Instruction* store(const RegisterRef& reg, Instruction* target = nullptr);

	std::vector<Instruction> m_instructions;
};
file mode 100644 index 0000000000..2dc58a471b --- /dev/null +++ b/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.cpp @@ -0,0 +1,428 @@ +#include "stdafx.h" +#include "FPOpcodes.h" + +#include "Emu/RSX/Common/simple_array.hpp" +#include "Emu/RSX/Program/RSXFragmentProgram.h" + +#include + +namespace rsx::assembler::FP +{ + u8 get_operand_count(FP_opcode opcode) + { + switch (opcode) + { + case RSX_FP_OPCODE_NOP: + return 0; + case RSX_FP_OPCODE_MOV: + return 1; + case RSX_FP_OPCODE_MUL: + case RSX_FP_OPCODE_ADD: + return 2; + case RSX_FP_OPCODE_MAD: + return 3; + case RSX_FP_OPCODE_DP3: + case RSX_FP_OPCODE_DP4: + return 2; + case RSX_FP_OPCODE_DST: + return 2; + case RSX_FP_OPCODE_MIN: + case RSX_FP_OPCODE_MAX: + return 2; + case RSX_FP_OPCODE_SLT: + case RSX_FP_OPCODE_SGE: + case RSX_FP_OPCODE_SLE: + case RSX_FP_OPCODE_SGT: + case RSX_FP_OPCODE_SNE: + case RSX_FP_OPCODE_SEQ: + return 2; + case RSX_FP_OPCODE_FRC: + case RSX_FP_OPCODE_FLR: + return 1; + case RSX_FP_OPCODE_KIL: + return 0; + case RSX_FP_OPCODE_PK4: + case RSX_FP_OPCODE_UP4: + return 1; + case RSX_FP_OPCODE_DDX: + case RSX_FP_OPCODE_DDY: + return 1; + case RSX_FP_OPCODE_TEX: + case RSX_FP_OPCODE_TXD: + case RSX_FP_OPCODE_TXP: + return 1; + case RSX_FP_OPCODE_RCP: + case RSX_FP_OPCODE_RSQ: + case RSX_FP_OPCODE_EX2: + case RSX_FP_OPCODE_LG2: + return 1; + case RSX_FP_OPCODE_LIT: + return 1; + case RSX_FP_OPCODE_LRP: + return 3; + case RSX_FP_OPCODE_STR: + case RSX_FP_OPCODE_SFL: + return 0; + case RSX_FP_OPCODE_COS: + case RSX_FP_OPCODE_SIN: + return 1; + case RSX_FP_OPCODE_PK2: + case RSX_FP_OPCODE_UP2: + return 1; + case RSX_FP_OPCODE_PKB: + case RSX_FP_OPCODE_UPB: + case RSX_FP_OPCODE_PK16: + case RSX_FP_OPCODE_UP16: + case RSX_FP_OPCODE_PKG: + case RSX_FP_OPCODE_UPG: + return 1; + case RSX_FP_OPCODE_DP2A: + return 3; + case RSX_FP_OPCODE_TXL: + case RSX_FP_OPCODE_TXB: + return 2; + case RSX_FP_OPCODE_DP2: + return 2; + case RSX_FP_OPCODE_NRM: + return 1; + case RSX_FP_OPCODE_DIV: + case 
RSX_FP_OPCODE_DIVSQ: + return 2; + case RSX_FP_OPCODE_LIF: + return 1; + case RSX_FP_OPCODE_FENCT: + case RSX_FP_OPCODE_FENCB: + case RSX_FP_OPCODE_BRK: + case RSX_FP_OPCODE_CAL: + case RSX_FP_OPCODE_IFE: + case RSX_FP_OPCODE_LOOP: + case RSX_FP_OPCODE_REP: + case RSX_FP_OPCODE_RET: + // Flow control. Special registers are provided for these outside the common file + return 0; + + // The rest are unimplemented and not encountered in real software. + // TODO: Probe these on real PS3 and figure out what they actually do. + case RSX_FP_OPCODE_POW: + fmt::throw_exception("Unimplemented POW instruction."); // Unused + case RSX_FP_OPCODE_BEM: + case RSX_FP_OPCODE_TEXBEM: + case RSX_FP_OPCODE_TXPBEM: + case RSX_FP_OPCODE_BEMLUM: + fmt::throw_exception("Unimplemented BEM class instruction"); // Unused + case RSX_FP_OPCODE_REFL: + return 2; + case RSX_FP_OPCODE_TIMESWTEX: + fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused + default: + break; + } + + return 0; + } + + // Returns a lane mask for the given operand. + // The lane mask is the fixed function hardware lane so swizzles need to be applied on top to resolve the real data channel. + u32 get_src_vector_lane_mask(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand) + { + constexpr u32 x = 0b0001; + constexpr u32 y = 0b0010; + constexpr u32 z = 0b0100; + constexpr u32 w = 0b1000; + constexpr u32 xy = 0b0011; + constexpr u32 xyz = 0b0111; + constexpr u32 xyzw = 0b1111; + + const auto decode = [&](const rsx::simple_array& masks) -> u32 + { + return operand < masks.size() + ? masks[operand] + : 0u; + }; + + auto opcode = static_cast(instruction->opcode); + if (operand >= get_operand_count(opcode)) + { + return 0; + } + + OPDEST d0 { .HEX = instruction->bytecode[0] }; + const u32 dst_write_mask = d0.no_dest ? 
0 : d0.write_mask; + + switch (opcode) + { + case RSX_FP_OPCODE_NOP: + return 0; + case RSX_FP_OPCODE_MOV: + case RSX_FP_OPCODE_MUL: + case RSX_FP_OPCODE_ADD: + case RSX_FP_OPCODE_MAD: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_DP3: + return xyz; + case RSX_FP_OPCODE_DP4: + return xyzw; + case RSX_FP_OPCODE_DST: + return decode({ y | z, y | w }); + case RSX_FP_OPCODE_MIN: + case RSX_FP_OPCODE_MAX: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_SLT: + case RSX_FP_OPCODE_SGE: + case RSX_FP_OPCODE_SLE: + case RSX_FP_OPCODE_SGT: + case RSX_FP_OPCODE_SNE: + case RSX_FP_OPCODE_SEQ: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_FRC: + case RSX_FP_OPCODE_FLR: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_KIL: + return 0; + case RSX_FP_OPCODE_PK4: + return xyzw; + case RSX_FP_OPCODE_UP4: + return x; + case RSX_FP_OPCODE_DDX: + case RSX_FP_OPCODE_DDY: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_TEX: + case RSX_FP_OPCODE_TXD: + switch (prog.get_texture_dimension(d0.tex_num)) + { + case rsx::texture_dimension_extended::texture_dimension_1d: + return x; + case rsx::texture_dimension_extended::texture_dimension_2d: + return xy; + case rsx::texture_dimension_extended::texture_dimension_3d: + case rsx::texture_dimension_extended::texture_dimension_cubemap: + return xyz; + default: + return 0; + } + case RSX_FP_OPCODE_TXP: + switch (prog.get_texture_dimension(d0.tex_num)) + { + case rsx::texture_dimension_extended::texture_dimension_1d: + return xy; + case rsx::texture_dimension_extended::texture_dimension_2d: + return xyz; + case rsx::texture_dimension_extended::texture_dimension_3d: + case rsx::texture_dimension_extended::texture_dimension_cubemap: + return xyzw; + default: + return 0; + } + case RSX_FP_OPCODE_RCP: + case RSX_FP_OPCODE_RSQ: + case RSX_FP_OPCODE_EX2: + case RSX_FP_OPCODE_LG2: + return x; + case RSX_FP_OPCODE_LIT: + return xyzw; + case RSX_FP_OPCODE_LRP: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_STR: + case 
RSX_FP_OPCODE_SFL: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_COS: + case RSX_FP_OPCODE_SIN: + return x; + case RSX_FP_OPCODE_PK2: + return xy; + case RSX_FP_OPCODE_UP2: + return x; + case RSX_FP_OPCODE_PKB: + return xyzw; + case RSX_FP_OPCODE_UPB: + return x; + case RSX_FP_OPCODE_PK16: + return xy; + case RSX_FP_OPCODE_UP16: + return x; + case RSX_FP_OPCODE_PKG: + return xyzw; + case RSX_FP_OPCODE_UPG: + return x; + case RSX_FP_OPCODE_DP2A: + return decode({ xy, xy, x }); + case RSX_FP_OPCODE_TXL: + case RSX_FP_OPCODE_TXB: + return decode({ xy, x }); + case RSX_FP_OPCODE_REFL: + return xyzw; + case RSX_FP_OPCODE_DP2: + return xy; + case RSX_FP_OPCODE_NRM: + return xyz; + case RSX_FP_OPCODE_DIV: + case RSX_FP_OPCODE_DIVSQ: + return decode({ xyzw, x }) & dst_write_mask; + case RSX_FP_OPCODE_LIF: + return decode({ y | w }); + case RSX_FP_OPCODE_FENCT: + case RSX_FP_OPCODE_FENCB: + case RSX_FP_OPCODE_BRK: + case RSX_FP_OPCODE_CAL: + case RSX_FP_OPCODE_IFE: + case RSX_FP_OPCODE_LOOP: + case RSX_FP_OPCODE_REP: + case RSX_FP_OPCODE_RET: + // Flow control. Special registers are provided for these outside the common file + return 0; + + case RSX_FP_OPCODE_POW: + fmt::throw_exception("Unimplemented POW instruction."); // Unused ?? + case RSX_FP_OPCODE_BEM: + case RSX_FP_OPCODE_TEXBEM: + case RSX_FP_OPCODE_TXPBEM: + case RSX_FP_OPCODE_BEMLUM: + fmt::throw_exception("Unimplemented BEM class instruction"); // Unused + case RSX_FP_OPCODE_TIMESWTEX: + fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused + default: + break; + } + + return 0; + } + + // Resolved vector lane mask with swizzles applied. + u32 get_src_vector_lane_mask_shuffled(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand) + { + // Brute-force this. There's only 16 permutations. 
+ constexpr u32 x = 0b0001; + constexpr u32 y = 0b0010; + constexpr u32 z = 0b0100; + constexpr u32 w = 0b1000; + + const u32 lane_mask = get_src_vector_lane_mask(prog, instruction, operand); + if (!lane_mask) + { + return lane_mask; + } + + // Now we resolve matching lanes. + // This sequence can be drastically sped up using lookup tables but that will come later. + std::unordered_set inputs; + SRC_Common src { .HEX = instruction->bytecode[operand + 1] }; + + if (src.reg_type != RSX_FP_REGISTER_TYPE_TEMP) + { + return 0; + } + + if (lane_mask & x) inputs.insert(src.swizzle_x); + if (lane_mask & y) inputs.insert(src.swizzle_y); + if (lane_mask & z) inputs.insert(src.swizzle_z); + if (lane_mask & w) inputs.insert(src.swizzle_w); + + u32 result = 0; + if (inputs.contains(0)) result |= x; + if (inputs.contains(1)) result |= y; + if (inputs.contains(2)) result |= z; + if (inputs.contains(3)) result |= w; + + return result; + } + + bool is_delay_slot(const Instruction* instruction) + { + OPDEST dst { .HEX = instruction->bytecode[0] }; + SRC0 src0 { .HEX = instruction->bytecode[1] }; + SRC1 src1{ .HEX = instruction->bytecode[2] }; + + if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV + dst.no_dest || // Must have a sink + src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg + dst.dest_reg != src0.tmp_reg_index || // Must be a write-to-self + dst.fp16 || // Always full lane. 
We need to collect more data on this but it won't matter + dst.saturate || // Precision modifier + (dst.prec != RSX_FP_PRECISION_REAL && + dst.prec != RSX_FP_PRECISION_UNKNOWN)) // Cannot have precision modifiers + { + return false; + } + + // Check if we have precision modifiers on the source + if (src0.abs || src0.neg || src1.scale) + { + return false; + } + + if (dst.mask_x && src0.swizzle_x != 0) return false; + if (dst.mask_y && src0.swizzle_y != 1) return false; + if (dst.mask_z && src0.swizzle_z != 2) return false; + if (dst.mask_w && src0.swizzle_w != 3) return false; + + return true; + } + + RegisterRef get_src_register(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand) + { + SRC_Common src{ .HEX = instruction->bytecode[operand + 1] }; + if (src.reg_type != RSX_FP_REGISTER_TYPE_TEMP) + { + return {}; + } + + const u32 read_lanes = get_src_vector_lane_mask_shuffled(prog, instruction, operand); + if (!read_lanes) + { + return {}; + } + + RegisterRef ref{ .mask = read_lanes }; + Register& reg = ref.reg; + + reg.f16 = !!src.fp16; + reg.id = src.tmp_reg_index; + return ref; + } + + RegisterRef get_dst_register(const Instruction* instruction) + { + OPDEST dst { .HEX = instruction->bytecode[0] }; + if (dst.no_dest) + { + return {}; + } + + RegisterRef ref{ .mask = dst.write_mask }; + ref.reg.f16 = dst.fp16; + ref.reg.id = dst.dest_reg; + return ref; + } + + // Convert vector mask to file range + rsx::simple_array get_register_file_range(const RegisterRef& reg) + { + if (!reg.mask) + { + return {}; + } + + constexpr u32 register_file_max_len = 48 * 8; // H0 - H47, R0 - R23 + + const u32 lane_width = reg.reg.f16 ? 
// Fragment program opcodes.
// The encoder and decoder in this module split the value across two bytecode
// fields: the low 6 bits go to OPDEST.opcode and values above 0x3F
// additionally set SRC1.opcode_hi.
enum FP_opcode
{
	RSX_FP_OPCODE_NOP        = 0x00, // No-Operation
	RSX_FP_OPCODE_MOV        = 0x01, // Move
	RSX_FP_OPCODE_MUL        = 0x02, // Multiply
	RSX_FP_OPCODE_ADD        = 0x03, // Add
	RSX_FP_OPCODE_MAD        = 0x04, // Multiply-Add
	RSX_FP_OPCODE_DP3        = 0x05, // 3-component Dot Product
	RSX_FP_OPCODE_DP4        = 0x06, // 4-component Dot Product
	RSX_FP_OPCODE_DST        = 0x07, // Distance
	RSX_FP_OPCODE_MIN        = 0x08, // Minimum
	RSX_FP_OPCODE_MAX        = 0x09, // Maximum
	RSX_FP_OPCODE_SLT        = 0x0A, // Set-If-LessThan
	RSX_FP_OPCODE_SGE        = 0x0B, // Set-If-GreaterEqual
	RSX_FP_OPCODE_SLE        = 0x0C, // Set-If-LessEqual
	RSX_FP_OPCODE_SGT        = 0x0D, // Set-If-GreaterThan
	RSX_FP_OPCODE_SNE        = 0x0E, // Set-If-NotEqual
	RSX_FP_OPCODE_SEQ        = 0x0F, // Set-If-Equal
	RSX_FP_OPCODE_FRC        = 0x10, // Fraction (fract)
	RSX_FP_OPCODE_FLR        = 0x11, // Floor
	RSX_FP_OPCODE_KIL        = 0x12, // Kill fragment
	RSX_FP_OPCODE_PK4        = 0x13, // Pack four signed 8-bit values
	RSX_FP_OPCODE_UP4        = 0x14, // Unpack four signed 8-bit values
	RSX_FP_OPCODE_DDX        = 0x15, // Partial-derivative in x (Screen space derivative w.r.t. x)
	RSX_FP_OPCODE_DDY        = 0x16, // Partial-derivative in y (Screen space derivative w.r.t. y)
	RSX_FP_OPCODE_TEX        = 0x17, // Texture lookup
	RSX_FP_OPCODE_TXP        = 0x18, // Texture sample with projection (Projective texture lookup)
	RSX_FP_OPCODE_TXD        = 0x19, // Texture sample with partial differentiation (Texture lookup with derivatives)
	RSX_FP_OPCODE_RCP        = 0x1A, // Reciprocal
	RSX_FP_OPCODE_RSQ        = 0x1B, // Reciprocal Square Root
	RSX_FP_OPCODE_EX2        = 0x1C, // Exponentiation base 2
	RSX_FP_OPCODE_LG2        = 0x1D, // Log base 2
	RSX_FP_OPCODE_LIT        = 0x1E, // Lighting coefficients
	RSX_FP_OPCODE_LRP        = 0x1F, // Linear Interpolation
	RSX_FP_OPCODE_STR        = 0x20, // Set-If-True
	RSX_FP_OPCODE_SFL        = 0x21, // Set-If-False
	RSX_FP_OPCODE_COS        = 0x22, // Cosine
	RSX_FP_OPCODE_SIN        = 0x23, // Sine
	RSX_FP_OPCODE_PK2        = 0x24, // Pack two 16-bit floats
	RSX_FP_OPCODE_UP2        = 0x25, // Unpack two 16-bit floats
	RSX_FP_OPCODE_POW        = 0x26, // Power
	RSX_FP_OPCODE_PKB        = 0x27, // Pack bytes
	RSX_FP_OPCODE_UPB        = 0x28, // Unpack bytes
	RSX_FP_OPCODE_PK16       = 0x29, // Pack 16 bits
	RSX_FP_OPCODE_UP16       = 0x2A, // Unpack 16
	RSX_FP_OPCODE_BEM        = 0x2B, // Bump-environment map (a.k.a. 2D coordinate transform)
	RSX_FP_OPCODE_PKG        = 0x2C, // Pack with sRGB transformation
	RSX_FP_OPCODE_UPG        = 0x2D, // Unpack gamma
	RSX_FP_OPCODE_DP2A       = 0x2E, // 2-component dot product with scalar addition
	RSX_FP_OPCODE_TXL        = 0x2F, // Texture sample with explicit LOD
	RSX_FP_OPCODE_TXB        = 0x31, // Texture sample with bias
	// The next three are handled together with BEM as the "BEM class" by the
	// helpers in FPOpcodes.cpp, which treat them as unimplemented.
	RSX_FP_OPCODE_TEXBEM     = 0x33,
	RSX_FP_OPCODE_TXPBEM     = 0x34,
	RSX_FP_OPCODE_BEMLUM     = 0x35,
	RSX_FP_OPCODE_REFL       = 0x36, // Reflection vector
	RSX_FP_OPCODE_TIMESWTEX  = 0x37, // Treated as unimplemented by the helpers in FPOpcodes.cpp
	RSX_FP_OPCODE_DP2        = 0x38, // 2-component dot product
	RSX_FP_OPCODE_NRM        = 0x39, // Normalize
	RSX_FP_OPCODE_DIV        = 0x3A, // Division
	RSX_FP_OPCODE_DIVSQ      = 0x3B, // Divide by Square Root
	RSX_FP_OPCODE_LIF        = 0x3C, // Final part of LIT
	RSX_FP_OPCODE_FENCT      = 0x3D, // Fence T?
	RSX_FP_OPCODE_FENCB      = 0x3E, // Fence B?

	// Flow control (values above 0x3F)
	RSX_FP_OPCODE_BRK        = 0x40, // Break
	RSX_FP_OPCODE_CAL        = 0x41, // Subroutine call
	RSX_FP_OPCODE_IFE        = 0x42, // If
	RSX_FP_OPCODE_LOOP       = 0x43, // Loop
	RSX_FP_OPCODE_REP        = 0x44, // Repeat
	RSX_FP_OPCODE_RET        = 0x45, // Return


	// Custom opcodes for dependency injection
	RSX_FP_OPCODE_OR16_LO    = 0x46, // Performs a 16-bit OR, taking one register channel as input and overwriting low 16 bits of the output
	RSX_FP_OPCODE_OR16_HI    = 0x47, // Same as the lo variant but now overwrites the high 16-bit block
};
+ bool is_delay_slot(const Instruction* instruction); + + // Generate register references + RegisterRef get_src_register(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand); + RegisterRef get_dst_register(const Instruction* instruction); + + // Convert vector mask to file ranges + rsx::simple_array get_register_file_range(const RegisterRef& reg); + + // Compile a register file annotated blob to register references + std::vector compile_register_file(const std::array& file); + } +} diff --git a/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp b/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp index d8de4eda0b..1f1e5b3678 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp +++ b/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp @@ -1,5 +1,4 @@ #include "stdafx.h" - #include "CFG.h" #include "Emu/RSX/Common/simple_array.hpp" @@ -75,8 +74,19 @@ namespace rsx::assembler { if (auto found = find_block_for_pc(id)) { - parent->insert_succ(found, edge_type); - found->insert_pred(parent, edge_type); + auto succ = found; + if (found->is_of_type(EdgeType::ELSE) && + (edge_type == EdgeType::ENDIF || edge_type == EdgeType::ENDLOOP)) + { + // If we landed on an "ELSE" node, link to its "ENDIF" counterpart + auto if_parent = found->pred.front().from; + auto endif_edge = std::find_if(if_parent->succ.begin(), if_parent->succ.end(), FN(x.type == EdgeType::ENDIF)); + ensure(endif_edge != if_parent->succ.end(), "CFG: Invalid ELSE node"); + succ = endif_edge->to; + } + + parent->insert_succ(succ, edge_type); + succ->insert_pred(parent, edge_type); return found; } @@ -101,6 +111,43 @@ namespace rsx::assembler if (found) { + auto front_edge = std::find_if(bb->pred.begin(), bb->pred.end(), FN(x.type != EdgeType::ENDIF && x.type != EdgeType::ENDLOOP)); + if (front_edge != bb->pred.end()) + { + auto parent = ensure(front_edge->from); + switch (front_edge->type) + { + case EdgeType::IF: + case EdgeType::ELSE: + { + // Find the merge node from the parent. 
+ auto succ = std::find_if(parent->succ.begin(), parent->succ.end(), FN(x.type == EdgeType::ENDIF)); + ensure(succ != parent->succ.end(), "CFG: Broken IF linkage. Please report to developers."); + bb->insert_succ(succ->to, EdgeType::ENDIF); + succ->to->insert_pred(bb, EdgeType::ENDIF); + break; + } + case EdgeType::LOOP: + { + // Find the merge node from the parent + auto succ = std::find_if(parent->succ.begin(), parent->succ.end(), FN(x.type == EdgeType::ENDLOOP)); + ensure(succ != parent->succ.end(), "CFG: Broken LOOP linkage. Please report to developers."); + bb->insert_succ(succ->to, EdgeType::ENDLOOP); + succ->to->insert_pred(bb, EdgeType::ENDLOOP); + break; + } + default: + // Missing an edge type? + rsx_log.error("CFG: Unexpected block exit. Report to developers."); + break; + } + } + else if (bb->pred.empty()) + { + // Impossible situation. + rsx_log.error("CFG: Child block has no parent but has successor! Report to developers."); + } + bb = *found; } @@ -113,7 +160,7 @@ namespace rsx::assembler src2.HEX = decoded._u32[3]; end = !!dst.end; - const u32 opcode = dst.opcode | (src1.opcode_is_branch << 6); + const u32 opcode = dst.opcode | (src1.opcode_hi << 6); if (opcode == RSX_FP_OPCODE_NOP) { @@ -126,6 +173,7 @@ namespace rsx::assembler std::memcpy(ir_inst.bytecode, &decoded._u32[0], 16); ir_inst.length = 4; ir_inst.addr = pc * 16; + ir_inst.opcode = opcode; switch (opcode) { @@ -174,6 +222,7 @@ namespace rsx::assembler ir_inst.length += 4; pc++; } + break; } pc++; diff --git a/rpcs3/Emu/RSX/Program/Assembler/IR.h b/rpcs3/Emu/RSX/Program/Assembler/IR.h index 65960f3d99..635aec7209 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/IR.h +++ b/rpcs3/Emu/RSX/Program/Assembler/IR.h @@ -10,6 +10,16 @@ namespace rsx::assembler { int id = 0; bool f16 = false; + + bool operator == (const Register& other) const + { + return id == other.id && f16 == other.f16; + } + + std::string to_string() const + { + return std::string(f16 ? 
"H" : "R") + std::to_string(id); + } }; struct RegisterRef @@ -19,7 +29,7 @@ namespace rsx::assembler // Vector information union { - u32 mask; + u32 mask = 0; struct { @@ -29,6 +39,16 @@ namespace rsx::assembler bool w : 1; }; }; + + operator bool() const + { + return !!mask; + } + + bool operator == (const RegisterRef& other) const + { + return reg == other.reg && mask == other.mask; + } }; struct Instruction @@ -71,6 +91,7 @@ namespace rsx::assembler struct BasicBlock { u32 id = 0; + std::vector instructions; // Program instructions for the RSX processor std::vector succ; // Forward edges. Sorted closest first. std::vector pred; // Back edges. Sorted closest first. @@ -78,6 +99,9 @@ namespace rsx::assembler std::vector prologue; // Prologue, created by passes std::vector epilogue; // Epilogue, created by passes + std::vector input_list; // Register inputs. + std::vector clobber_list; // Clobbered outputs + FlowEdge* insert_succ(BasicBlock* b, EdgeType type = EdgeType::NONE) { FlowEdge e{ .type = type, .from = this, .to = b }; @@ -91,5 +115,25 @@ namespace rsx::assembler pred.push_back(e); return &pred.back(); } + + bool is_of_type(EdgeType type) const + { + return pred.size() == 1 && + pred.front().type == type; + } + + bool has_sibling_of_type(EdgeType type) const + { + if (pred.size() != 1) + { + return false; + } + + auto source_node = pred.front().from; + return std::find_if( + source_node->succ.begin(), + source_node->succ.end(), + FN(x.type == type)) != source_node->succ.end(); + } }; } diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp new file mode 100644 index 0000000000..9b031f2a0e --- /dev/null +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp @@ -0,0 +1,230 @@ +#include "stdafx.h" + +#include "RegisterAnnotationPass.h" +#include "Emu/RSX/Program/Assembler/FPOpcodes.h" +#include "Emu/RSX/Program/RSXFragmentProgram.h" + 
+#include +#include + +namespace rsx::assembler::FP +{ + static constexpr u32 register_file_length = 48 * 8; // 24 F32 or 48 F16 registers + static constexpr char content_unknown = 0; + static constexpr char content_float32 = 'R'; + static constexpr char content_float16 = 'H'; + static constexpr char content_dual = 'D'; + + bool is_delay_slot(const Instruction& instruction) + { + const OPDEST dst{ .HEX = instruction.bytecode[0] }; + const SRC0 src0{ .HEX = instruction.bytecode[1] }; + const SRC1 src1{ .HEX = instruction.bytecode[2] }; + + if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV + dst.no_dest || // Must have a sink + src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg + dst.dest_reg != src0.tmp_reg_index || // Must be a write-to-self + dst.fp16 != src0.fp16 || // Must really be the same register + src0.abs || src0.neg || + dst.saturate) // Precision modifier + { + return false; + } + + switch (dst.prec) + { + case RSX_FP_PRECISION_REAL: + case RSX_FP_PRECISION_UNKNOWN: + break; + case RSX_FP_PRECISION_HALF: + if (!src0.fp16) return false; + break; + case RSX_FP_PRECISION_FIXED12: + case RSX_FP_PRECISION_FIXED9: + case RSX_FP_PRECISION_SATURATE: + return false; + } + + // Check if we have precision modifiers on the source + if (src0.abs || src0.neg || src1.scale) + { + return false; + } + + if (dst.mask_x && src0.swizzle_x != 0) return false; + if (dst.mask_y && src0.swizzle_y != 1) return false; + if (dst.mask_z && src0.swizzle_z != 2) return false; + if (dst.mask_w && src0.swizzle_w != 3) return false; + + return true; + } + + std::vector compile_register_file(const std::array& file) + { + std::vector results; + + // F16 register processing + for (int reg16 = 0; reg16 < 48; ++reg16) + { + const u32 offset = reg16 * 8; + auto word = *reinterpret_cast(&file[offset]); + + if (!word) [[ likely ]] + { + // Trivial rejection, very commonly hit. 
+ continue; + } + + RegisterRef ref{ .reg {.id = reg16, .f16 = true } }; + ref.x = (file[offset] == content_dual || file[offset] == content_float16); + ref.y = (file[offset + 2] == content_dual || file[offset + 2] == content_float16); + ref.z = (file[offset + 4] == content_dual || file[offset + 4] == content_float16); + ref.w = (file[offset + 6] == content_dual || file[offset + 6] == content_float16); + + if (ref) + { + results.push_back(std::move(ref)); + } + } + + // Helper to check a span for 32-bit access + auto match_any_32 = [](const std::span lanes) + { + return std::any_of(lanes.begin(), lanes.end(), FN(x == content_dual || x == content_float32)); + }; + + // F32 register processing + for (int reg32 = 0; reg32 < 24; ++reg32) + { + const u32 offset = reg32 * 16; + auto word0 = *reinterpret_cast(&file[offset]); + auto word1 = *reinterpret_cast(&file[offset + 8]); + + if (!word0 && !word1) [[ likely ]] + { + // Trivial rejection, very commonly hit. + continue; + } + + RegisterRef ref{ .reg {.id = reg32, .f16 = false } }; + if (word0) + { + ref.x = match_any_32({ &file[offset], 4 }); + ref.y = match_any_32({ &file[offset + 4], 4 }); + } + + if (word1) + { + ref.z = match_any_32({ &file[offset + 8], 4 }); + ref.w = match_any_32({ &file[offset + 12], 4 }); + } + + if (ref) + { + results.push_back(std::move(ref)); + } + } + + return results; + } + + // Decay instructions into register references + void annotate_instructions(BasicBlock* block, const RSXFragmentProgram& prog, bool skip_delay_slots) + { + for (auto& instruction : block->instructions) + { + if (skip_delay_slots && is_delay_slot(instruction)) + { + continue; + } + + const u32 operand_count = get_operand_count(static_cast(instruction.opcode)); + for (u32 i = 0; i < operand_count; i++) + { + RegisterRef reg = get_src_register(prog, &instruction, i); + if (!reg.mask) + { + // Likely a literal constant + continue; + } + + instruction.srcs.push_back(std::move(reg)); + } + + RegisterRef dst = 
get_dst_register(&instruction); + if (dst) + { + instruction.dsts.push_back(std::move(dst)); + } + } + } + + // Annotate each block with input and output lanes (read and clobber list) + void annotate_block_io(BasicBlock* block) + { + alignas(16) std::array output_register_file; + alignas(16) std::array input_register_file; // We'll eventually replace with a bitfield mask, but for ease of debugging, we use char for now + + std::memset(output_register_file.data(), content_unknown, register_file_length); + std::memset(input_register_file.data(), content_unknown, register_file_length); + + for (const auto& instruction : block->instructions) + { + for (const auto& src : instruction.srcs) + { + const auto read_bytes = get_register_file_range(src); + const char expected_type = src.reg.f16 ? content_float16 : content_float32; + for (const auto& index : read_bytes) + { + if (output_register_file[index] != content_unknown) + { + // Something already wrote to this lane + continue; + } + + if (input_register_file[index] == expected_type) + { + // We already know about this input + continue; + } + + if (input_register_file[index] == 0) + { + // Not known, tag as input + input_register_file[index] = expected_type; + continue; + } + + // Collision on the lane + input_register_file[index] = content_dual; + } + } + + if (!instruction.dsts.empty()) + { + const auto& dst = instruction.dsts.front(); + const auto write_bytes = get_register_file_range(dst); + const char expected_type = dst.reg.f16 ? 
content_float16 : content_float32; + + for (const auto& index : write_bytes) + { + output_register_file[index] = expected_type; + } + } + } + + // Compile the input and output refs into register references + block->clobber_list = compile_register_file(output_register_file); + block->input_list = compile_register_file(input_register_file); + } + + void RegisterAnnotationPass::run(FlowGraph& graph) + { + for (auto& block : graph.blocks) + { + annotate_instructions(&block, m_prog, m_config.skip_delay_slots); + annotate_block_io(&block); + } + } +} diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h new file mode 100644 index 0000000000..b5cab3da85 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h @@ -0,0 +1,34 @@ +#pragma once + +#include "../../CFG.h" + +struct RSXFragmentProgram; + +namespace rsx::assembler::FP +{ + struct RegisterAnnotationPassOptions + { + bool skip_delay_slots = false; // When enabled, detect delay slots and ignore annotating them. + }; + + // The annotation pass annotates each basic block with 2 pieces of information: + // 1. The "input" register list for a block. + // 2. The "output" register list for a block (clobber list). + // The information can be used by other passes to set up prologue/epilogue on each block. + // The pass also populates register reference members of each instruction, such as the input and output lanes. 
+ class RegisterAnnotationPass : public CFGPass + { + public: + RegisterAnnotationPass( + const RSXFragmentProgram& prog, + const RegisterAnnotationPassOptions& options = {}) + : m_prog(prog), m_config(options) + {} + + void run(FlowGraph& graph) override; + + private: + const RSXFragmentProgram& m_prog; + RegisterAnnotationPassOptions m_config; + }; +} diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp new file mode 100644 index 0000000000..c5b24b35bc --- /dev/null +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp @@ -0,0 +1,490 @@ +#include "stdafx.h" + +#include "RegisterDependencyPass.h" +#include "Emu/RSX/Program/Assembler/FPOpcodes.h" +#include "Emu/RSX/Program/RSXFragmentProgram.h" + +#include +#include + +namespace rsx::assembler::FP +{ + static constexpr u32 register_file_length = 48 * 8; // 24 F32 or 48 F16 registers + static constexpr char content_unknown = 0; + static constexpr char content_float32 = 'R'; + static constexpr char content_float16 = 'H'; + static constexpr char content_dual = 'D'; + + using register_file_t = std::array; + + struct DependencyPassContext + { + std::unordered_map exec_register_map; + std::unordered_map sync_register_map; + }; + + enum Register32BarrierFlags + { + NONE = 0, + OR_WORD0 = 1, + OR_WORD1 = 2, + DEFAULT = OR_WORD0 | OR_WORD1 + }; + + struct RegisterBarrier32 + { + RegisterRef ref; + u32 flags[4]; + }; + + std::vector decode_lanes16(const std::unordered_set& lanes) + { + std::vector result; + + for (u32 index = 0, file_offset = 0; index < 48; ++index, file_offset += 8) + { + // Each register has 4 16-bit lanes + u32 mask = 0; + if (lanes.contains(file_offset + 0)) mask |= (1u << 0); + if (lanes.contains(file_offset + 2)) mask |= (1u << 1); + if (lanes.contains(file_offset + 4)) mask |= (1u << 2); + if (lanes.contains(file_offset + 6)) mask |= (1u << 3); + + if (mask == 0) + { + continue; + } 
+ + RegisterRef ref{ .reg{.id = static_cast(index), .f16 = true } }; + ref.mask = mask; + result.push_back(std::move(ref)); + } + return result; + } + + std::vector decode_lanes32(const std::unordered_set& lanes) + { + std::vector result; + + for (u32 index = 0, file_offset = 0; index < 24; ++index, file_offset += 16) // 24 F32 registers of 16 bytes each fill the 48 * 8 byte register file + { + // Each register has 8 16-bit lanes + RegisterBarrier32 barrier{}; + auto& ref = barrier.ref; + + for (u32 lane = 0; lane < 16; lane += 2) + { + if (!lanes.contains(file_offset + lane)) + { + continue; + } + + const u32 ch = (lane / 4); + const u32 flags = (lane & 3) + ? Register32BarrierFlags::OR_WORD1 + : Register32BarrierFlags::OR_WORD0; + + ref.mask |= (1u << ch); + barrier.flags[ch] |= flags; + } + + if (ref.mask == 0) + { + continue; + } + + ref.reg = {.id = static_cast(index), .f16 = false }; + result.push_back(std::move(barrier)); + } + + return result; + } + + std::vector build_barrier32(const RegisterBarrier32& barrier) + { + // Upto 4 instructions are needed per 32-bit register + // R0.x = packHalf2x16(H0.xy) + // R0.y = packHalf2x16(H0.zw); + // R0.z = packHalf2x16(H1.xy); + // R0.w = packHalf2x16(H1.zw); + + std::vector result; + + for (u32 mask = barrier.ref.mask, ch = 0; mask > 0; mask >>= 1, ++ch) + { + if (!(mask & 1)) + { + continue; + } + + const auto& reg = barrier.ref.reg; + const auto reg_id = reg.id; + + Instruction instruction{}; + OPDEST dst{}; + dst.prec = RSX_FP_PRECISION_REAL; + dst.fp16 = 0; + dst.dest_reg = reg_id; + dst.write_mask = (1u << ch); + + const u32 src_reg_id = (ch / 2) + (reg_id * 2); + const bool is_word0 = !(ch & 1); // Only even + + SRC0 src0{}; + if (is_word0) + { + src0.swizzle_x = 0; + src0.swizzle_y = 1; + } + else + { + src0.swizzle_x = 2; + src0.swizzle_y = 3; + } + + src0.swizzle_z = 2; + src0.swizzle_w = 3; + src0.reg_type = RSX_FP_REGISTER_TYPE_TEMP; + src0.tmp_reg_index = src_reg_id; + src0.fp16 = 1; + + // Prepare source 1 to match the output in case we need to encode an OR + SRC1 src1{}; + 
src1.reg_type = RSX_FP_REGISTER_TYPE_TEMP; + src1.tmp_reg_index = reg_id; + src1.swizzle_x = ch; + src1.swizzle_y = ch; + src1.swizzle_z = ch; + src1.swizzle_w = ch; + + u32 opcode = 0; + switch (barrier.flags[ch]) + { + case Register32BarrierFlags::DEFAULT: + opcode = RSX_FP_OPCODE_PK2; + break; + case Register32BarrierFlags::OR_WORD0: + opcode = RSX_FP_OPCODE_OR16_LO; + // Swap inputs + std::swap(src0.HEX, src1.HEX); + break; + case Register32BarrierFlags::OR_WORD1: + opcode = RSX_FP_OPCODE_OR16_HI; + src0.swizzle_x = src0.swizzle_y; + std::swap(src0.HEX, src1.HEX); + break; + case Register32BarrierFlags::NONE: + default: + fmt::throw_exception("Unexpected lane barrier with no mask."); + } + + dst.opcode = opcode & 0x3F; + src1.opcode_hi = (opcode > 0x3F) ? 1 : 0; + src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1; + + instruction.opcode = opcode; + instruction.bytecode[0] = dst.HEX; + instruction.bytecode[1] = src0.HEX; + instruction.bytecode[2] = src1.HEX; + + Register src_reg{ .id = static_cast(src_reg_id), .f16 = true }; + instruction.srcs.push_back({ .reg = src_reg, .mask = 0xF }); + instruction.dsts.push_back({ .reg{ .id = reg_id, .f16 = false }, .mask = (1u << ch) }); + result.push_back(std::move(instruction)); + } + + return result; + } + + std::vector build_barrier16(const RegisterRef& reg) + { + // H0.xy = unpackHalf2x16(R0.x) + // H0.zw = unpackHalf2x16(R0.y) + // H1.xy = unpackHalf2x16(R0.z) + // H1.zw = unpackHalf2x16(R0.w) + + std::vector result; + + for (u32 mask = reg.mask, ch = 0; mask > 0; mask >>= 1, ++ch) + { + if (!(mask & 1)) + { + continue; + } + + Instruction instruction{}; + OPDEST dst{}; + dst.opcode = RSX_FP_OPCODE_UP2; + dst.prec = RSX_FP_PRECISION_HALF; + dst.fp16 = 1; + dst.dest_reg = reg.reg.id; + dst.write_mask = 1u << ch; + + const u32 src_reg_id = reg.reg.id / 2; + const bool is_odd_reg = !!(reg.reg.id & 1); + const bool is_odd_ch = !!(ch & 1); + const bool is_word0 = ch < 2; + + // If we're an even channel, we should also 
write the next channel (y/w) + if (!is_odd_ch && (mask & 2)) + { + mask >>= 1; + ++ch; + dst.write_mask |= (1u << ch); + } + + SRC0 src0{}; + src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1; + + if (is_word0) + { + src0.swizzle_x = is_odd_reg ? 2 : 0; + } + else + { + src0.swizzle_x = is_odd_reg ? 3 : 1; + } + + src0.swizzle_y = 1; + src0.swizzle_z = 2; + src0.swizzle_w = 3; + src0.reg_type = RSX_FP_REGISTER_TYPE_TEMP; + src0.tmp_reg_index = src_reg_id; + + instruction.opcode = dst.opcode; + instruction.bytecode[0] = dst.HEX; + instruction.bytecode[1] = src0.HEX; + + Register src_reg{ .id = static_cast(src_reg_id), .f16 = false }; // Source is the 32-bit register being unpacked (src0.fp16 is left clear) + instruction.srcs.push_back({ .reg = src_reg, .mask = 0xF }); + instruction.dsts.push_back({ .reg{.id = reg.reg.id, .f16 = true }, .mask = dst.write_mask }); // Destination is the 16-bit register (dst.fp16 = 1) + result.push_back(std::move(instruction)); + } + + return result; + } + + std::vector resolve_dependencies(const std::unordered_set& lanes, bool f16) + { + std::vector result; + + if (f16) + { + const auto regs = decode_lanes16(lanes); + for (const auto& ref : regs) + { + auto instructions = build_barrier16(ref); + result.insert(result.end(), instructions.begin(), instructions.end()); + } + + return result; + } + + const auto barriers = decode_lanes32(lanes); + for (const auto& barrier : barriers) + { + auto instructions = build_barrier32(barrier); + result.insert(result.end(), std::make_move_iterator(instructions.begin()), std::make_move_iterator(instructions.end())); + } + + return result; + } + + void insert_dependency_barriers(DependencyPassContext& ctx, BasicBlock* block) + { + register_file_t& register_file = ctx.exec_register_map[block]; + std::memset(register_file.data(), content_unknown, register_file_length); + + std::unordered_set barrier16; + std::unordered_set barrier32; + + // This subpass does not care about the prologue and epilogue and assumes each block is unique. 
+ for (auto it = block->instructions.begin(); it != block->instructions.end(); ++it) + { + auto& inst = *it; + + barrier16.clear(); + barrier32.clear(); + + for (const auto& src : inst.srcs) + { + const auto read_bytes = get_register_file_range(src); + const char expected_type = src.reg.f16 ? content_float16 : content_float32; + for (const auto& index : read_bytes) + { + if (register_file[index] == content_unknown) + { + // Skip input + continue; + } + + if (register_file[index] == expected_type || register_file[index] == content_dual) + { + // Match - nothing to do + continue; + } + + // Collision on the lane + register_file[index] = content_dual; + (src.reg.f16 ? barrier16 : barrier32).insert(index); + } + } + + for (const auto& dst : inst.dsts) + { + const auto write_bytes = get_register_file_range(dst); + const char expected_type = dst.reg.f16 ? content_float16 : content_float32; + + for (const auto& index : write_bytes) + { + register_file[index] = expected_type; + } + } + + // We need to inject some barrier instructions + if (!barrier16.empty()) + { + auto barrier16_in = decode_lanes16(barrier16); + std::vector instructions; + instructions.reserve(barrier16_in.size()); + + for (const auto& reg : barrier16_in) + { + auto barrier = build_barrier16(reg); + instructions.insert(instructions.end(), std::make_move_iterator(barrier.begin()), std::make_move_iterator(barrier.end())); + } + + it = block->instructions.insert(it, std::make_move_iterator(instructions.begin()), std::make_move_iterator(instructions.end())); + std::advance(it, instructions.size()); + } + + if (!barrier32.empty()) + { + auto barrier32_in = decode_lanes32(barrier32); + std::vector instructions; + instructions.reserve(barrier32_in.size()); + + for (const auto& reg : barrier32_in) + { + auto barrier = build_barrier32(reg); + instructions.insert(instructions.end(), std::make_move_iterator(barrier.begin()), std::make_move_iterator(barrier.end())); + } + + it = block->instructions.insert(it, 
std::make_move_iterator(instructions.begin()), std::make_move_iterator(instructions.end())); + std::advance(it, instructions.size()); + } + } + } + + void insert_block_register_dependency(DependencyPassContext& ctx, BasicBlock* block, const std::unordered_set& lanes, bool f16) + { + std::unordered_set clobbered_lanes; + std::unordered_set lanes_to_search; + + for (auto& back_edge : block->pred) + { + auto target = back_edge.from; + + // Quick check - if we've reached an IF-ELSE anchor, don't traverse upwards. + // The IF and ELSE edges are already a complete set and will bre processed before this node. + if (back_edge.type == EdgeType::ENDIF && + &back_edge == &block->pred.back() && + target->succ.size() == 3 && + target->succ[1].type == EdgeType::ELSE && + target->succ[2].type == EdgeType::ENDIF && + target->succ[2].to == block) + { + return; + } + + // Did this target even clobber our register? + ensure(ctx.exec_register_map.find(target) != ctx.exec_register_map.end(), "Block has not been pre-processed"); + + if (ctx.sync_register_map.find(target) == ctx.sync_register_map.end()) + { + auto& blob = ctx.sync_register_map[target]; + std::memset(blob.data(), content_unknown, register_file_length); + } + + auto& sync_register_file = ctx.sync_register_map[target]; + const auto& exec_register_file = ctx.exec_register_map[target]; + const auto clobber_type = f16 ? 
content_float32 : content_float16; + + lanes_to_search.clear(); + clobbered_lanes.clear(); + + for (auto& lane : lanes) + { + if (exec_register_file[lane] == clobber_type && + sync_register_file[lane] == content_unknown) + { + clobbered_lanes.insert(lane); + sync_register_file[lane] = content_dual; + continue; + } + + if (exec_register_file[lane] == content_unknown) + { + lanes_to_search.insert(lane); + } + } + + if (!clobbered_lanes.empty()) + { + auto instructions = resolve_dependencies(clobbered_lanes, f16); + target->epilogue.insert(target->epilogue.end(), std::make_move_iterator(instructions.begin()), std::make_move_iterator(instructions.end())); + } + + if (lanes_to_search.empty()) + { + continue; + } + + // We have some missing lanes. Search upwards + if (!target->pred.empty()) + { + // We only need to search the last predecessor which is the true "root" of the branch + insert_block_register_dependency(ctx, target, lanes_to_search, f16); + } + } + } + + void insert_block_dependencies(DependencyPassContext& ctx, BasicBlock* block) + { + auto range_from_ref = [](const RegisterRef& ref) + { + const auto range = get_register_file_range(ref); + + std::unordered_set result; + for (const auto& value : range) + { + result.insert(value); + } + return result; + }; + + for (auto& ref : block->input_list) + { + const auto range = range_from_ref(ref); + insert_block_register_dependency(ctx, block, range, ref.reg.f16); + } + } + + void RegisterDependencyPass::run(FlowGraph& graph) + { + DependencyPassContext ctx{}; + + // First, run intra-block dependency + for (auto& block : graph.blocks) + { + insert_dependency_barriers(ctx, &block); + } + + // Then, create prologue/epilogue instructions + // Traverse the list in reverse order to bubble up dependencies correctly. 
+ for (auto it = graph.blocks.rbegin(); it != graph.blocks.rend(); ++it) + { + insert_block_dependencies(ctx, &(*it)); + } + } +} diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.h b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.h new file mode 100644 index 0000000000..48068691e1 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.h @@ -0,0 +1,15 @@ +#pragma once + +#include "../../CFG.h" + +namespace rsx::assembler::FP +{ + // The register dependency pass identifies data hazards for each basic block and injects barrier instructions. + // Real PS3 does not have explicit barriers, but does instead often use delay slots or fence instructions to stall until a specific hardware unit clears the fence to advance. + // For decompiled shaders, we have the problem that aliasing is not real and is instead simulated. We do not have access to unions on the GPU without really nasty tricks. + class RegisterDependencyPass : public CFGPass + { + public: + void run(FlowGraph& graph) override; + }; +} diff --git a/rpcs3/Emu/RSX/Program/CgBinaryFragmentProgram.cpp b/rpcs3/Emu/RSX/Program/CgBinaryFragmentProgram.cpp index a06818de10..1dfe83e468 100644 --- a/rpcs3/Emu/RSX/Program/CgBinaryFragmentProgram.cpp +++ b/rpcs3/Emu/RSX/Program/CgBinaryFragmentProgram.cpp @@ -273,7 +273,7 @@ void CgBinaryDisasm::TaskFP() src2.HEX = GetData(data[3]); m_step = 4 * sizeof(u32); - m_opcode = dst.opcode | (src1.opcode_is_branch << 6); + m_opcode = dst.opcode | (src1.opcode_hi << 6); auto SCT = [&]() { diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index 2ebfd7d8d7..94b92ce98e 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -3,12 +3,19 @@ #include "FragmentProgramDecompiler.h" #include "ProgramStateCache.h" +#include "Assembler/Passes/FP/RegisterAnnotationPass.h" +#include 
"Assembler/Passes/FP/RegisterDependencyPass.h" + +#include "Emu/system_config.h" + #include namespace rsx { namespace fragment_program { + using namespace rsx::assembler; + static const std::string reg_table[] = { "wpos", @@ -17,10 +24,33 @@ namespace rsx "tc0", "tc1", "tc2", "tc3", "tc4", "tc5", "tc6", "tc7", "tc8", "tc9", "ssa" }; + + static const std::vector s_fp32_output_set = + { + {.reg {.id = 0, .f16 = false }, .mask = 0xf }, + {.reg {.id = 2, .f16 = false }, .mask = 0xf }, + {.reg {.id = 3, .f16 = false }, .mask = 0xf }, + {.reg {.id = 4, .f16 = false }, .mask = 0xf }, + }; + + static const std::vector s_fp16_output_set = + { + {.reg {.id = 0, .f16 = true }, .mask = 0xf }, + {.reg {.id = 4, .f16 = true }, .mask = 0xf }, + {.reg {.id = 6, .f16 = true }, .mask = 0xf }, + {.reg {.id = 8, .f16 = true }, .mask = 0xf }, + }; + + static const RegisterRef s_z_export_reg = + { + .reg {.id = 1, .f16 = false }, + .mask = (1u << 2) + }; } } using namespace rsx::fragment_program; +using namespace rsx::assembler; // SIMD vector lanes enum VectorLane : u8 @@ -31,6 +61,26 @@ enum VectorLane : u8 W = 3, }; +std::vector get_fragment_program_output_set(u32 ctrl, u32 mrt_count) +{ + std::vector result; + if (mrt_count > 0) + { + result = (ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS) + ? s_fp32_output_set + : s_fp16_output_set; + + result.resize(mrt_count); + } + + if (ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT) + { + result.push_back(s_z_export_reg); + } + + return result; +} + FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size) : m_size(size) , m_prog(prog) @@ -151,8 +201,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags) } const u32 reg_index = dst.fp16 ? 
(dst.dest_reg >> 1) : dst.dest_reg; - ensure(reg_index < temp_registers.size()); - if (dst.opcode == RSX_FP_OPCODE_MOV && src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP && src0.tmp_reg_index == reg_index) @@ -165,8 +213,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags) return; } } - - temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w); } void FragmentProgramDecompiler::AddFlowOp(const std::string& code) @@ -522,26 +568,7 @@ template std::string FragmentProgramDecompiler::GetSRC(T src) switch (src.reg_type) { case RSX_FP_REGISTER_TYPE_TEMP: - - if (!src.fp16) - { - if (dst.opcode == RSX_FP_OPCODE_UP16 || - dst.opcode == RSX_FP_OPCODE_UP2 || - dst.opcode == RSX_FP_OPCODE_UP4 || - dst.opcode == RSX_FP_OPCODE_UPB || - dst.opcode == RSX_FP_OPCODE_UPG) - { - auto ® = temp_registers[src.tmp_reg_index]; - if (reg.requires_gather(src.swizzle_x)) - { - properties.has_gather_op = true; - AddReg(src.tmp_reg_index, src.fp16); - ret = getFloatTypeName(4) + reg.gather_r(); - break; - } - } - } - else if (precision_modifier == RSX_FP_PRECISION_HALF) + if (src.fp16 && precision_modifier == RSX_FP_PRECISION_HALF) { // clamp16() is not a cheap operation when emulated; avoid at all costs precision_modifier = RSX_FP_PRECISION_REAL; @@ -762,7 +789,6 @@ std::string FragmentProgramDecompiler::BuildCode() const std::string float4_type = (fp16_out && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4); const std::string init_value = float4_type + "(0.)"; std::array output_register_names; - std::array ouput_register_indices = { 0, 2, 3, 4 }; // Holder for any "cleanup" before exiting main std::stringstream main_epilogue; @@ -772,17 +798,6 @@ std::string FragmentProgramDecompiler::BuildCode() { // Hw tests show that the depth export register is default-initialized to 0 and not wpos.z!! 
m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "r1", init_value); - - auto& r1 = temp_registers[1]; - if (r1.requires_gather(VectorLane::Z)) - { - // r1.zw was not written to - properties.has_gather_op = true; - main_epilogue << " r1.z = " << float4_type << r1.gather_r() << ".z;\n"; - - // Emit debug warning. Useful to diagnose regressions, but should be removed in future. - rsx_log.warning("ROP reads from shader depth without writing to it. Final value will be gathered."); - } } // Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z) @@ -810,33 +825,6 @@ std::string FragmentProgramDecompiler::BuildCode() continue; } - const auto block_index = ouput_register_indices[n]; - auto& r = temp_registers[block_index]; - - if (fp16_out) - { - // Check if we need a split/extract op - if (r.requires_split(0)) - { - main_epilogue << " " << reg_name << " = " << float4_type << r.split_h0() << ";\n"; - - // Emit debug warning. Useful to diagnose regressions, but should be removed in future. - rsx_log.warning("ROP reads from %s without writing to it. Final value will be extracted from the 32-bit register.", reg_name); - } - - continue; - } - - if (!r.requires_gather128()) - { - // Nothing to do - continue; - } - - // We need to gather the data from existing registers - main_epilogue << " " << reg_name << " = " << float4_type << r.gather_r() << ";\n"; - properties.has_gather_op = true; - // Emit debug warning. Useful to diagnose regressions, but should be removed in future. rsx_log.warning("ROP reads from %s without writing to it. 
Final value will be gathered.", reg_name); } @@ -1024,28 +1012,6 @@ std::string FragmentProgramDecompiler::BuildCode() OS << Format(divsq_func); } - // Declare register gather/merge if needed - if (properties.has_gather_op) - { - std::string float2 = getFloatTypeName(2); - - OS << float4 << " gather(" << float4 << " _h0, " << float4 << " _h1)\n"; - OS << "{\n"; - OS << " float x = uintBitsToFloat(packHalf2x16(_h0.xy));\n"; - OS << " float y = uintBitsToFloat(packHalf2x16(_h0.zw));\n"; - OS << " float z = uintBitsToFloat(packHalf2x16(_h1.xy));\n"; - OS << " float w = uintBitsToFloat(packHalf2x16(_h1.zw));\n"; - OS << " return " << float4 << "(x, y, z, w);\n"; - OS << "}\n\n"; - - OS << float2 << " gather(" << float4 << " _h)\n"; - OS << "{\n"; - OS << " float x = uintBitsToFloat(packHalf2x16(_h.xy));\n"; - OS << " float y = uintBitsToFloat(packHalf2x16(_h.zw));\n"; - OS << " return " << float2 << "(x, y);\n"; - OS << "}\n\n"; - } - if (properties.has_dynamic_register_load) { OS << @@ -1149,6 +1115,14 @@ bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode) return true; case RSX_FP_OPCODE_PKB: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packUnorm4x8($0)))"); return true; case RSX_FP_OPCODE_SIN: SetDst("sin($0.xxxx)"); return true; + + // Custom ISA extensions for 16-bit OR + case RSX_FP_OPCODE_OR16_HI: + SetDst("$float4(uintBitsToFloat((floatBitsToUint($0.x) & 0x0000ffff) | (packHalf2x16($1.xx) & 0xffff0000)))"); + return true; + case RSX_FP_OPCODE_OR16_LO: + SetDst("$float4(uintBitsToFloat((floatBitsToUint($0.x) & 0xffff0000) | (packHalf2x16($1.xx) & 0x0000ffff)))"); + return true; } return false; } @@ -1295,7 +1269,37 @@ bool FragmentProgramDecompiler::handle_tex_srb(u32 opcode) std::string FragmentProgramDecompiler::Decompile() { - const auto graph = rsx::assembler::deconstruct_fragment_program(m_prog); + auto graph = deconstruct_fragment_program(m_prog); + + if (!graph.blocks.empty()) + { + // The RSX CFG is missing the output block. 
We inject a fake tail block that ingests the ROP outputs. + BasicBlock* rop_block = nullptr; + BasicBlock* tail_block = &graph.blocks.back(); + if (tail_block->instructions.empty()) + { + // Merge block. Use this directly + rop_block = tail_block; + } + else + { + graph.blocks.push_back({}); + rop_block = &graph.blocks.back(); + + tail_block->insert_succ(rop_block); + rop_block->insert_pred(tail_block); + } + + const auto rop_inputs = get_fragment_program_output_set(m_prog.ctrl, m_prog.mrt_buffers_count); + rop_block->input_list.insert(rop_block->input_list.end(), rop_inputs.begin(), rop_inputs.end()); + + FP::RegisterAnnotationPass annotation_pass{ m_prog, { .skip_delay_slots = true } }; + FP::RegisterDependencyPass dependency_pass{}; + + annotation_pass.run(graph); + dependency_pass.run(graph); + } + m_size = 0; m_location = 0; m_loop_count = 0; @@ -1303,57 +1307,105 @@ std::string FragmentProgramDecompiler::Decompile() m_is_valid_ucode = true; m_constant_offsets.clear(); - enum + // For GLSL scope wind/unwind. We store the min scope depth and loop count for each block and "unwind" to it. + // This should recover information lost when multiple nodes converge on a single merge node or even skip a merge node as is the case with "ELSE" nodes. 
+ std::unordered_map> block_data; + + auto push_block_info = [&](const BasicBlock* block) { - FORCE_NONE, - FORCE_SCT, - FORCE_SCB, + u32 loop = m_loop_count; + int level = m_code_level; + + auto found = block_data.find(block); + if (found != block_data.end()) + { + level = std::min(level, found->second.first); + loop = std::min(loop, found->second.second); + } + + block_data[block] = { level, loop }; }; - int forced_unit = FORCE_NONE; + auto emit_block = [&](const std::vector& instructions) + { + for (auto& inst : instructions) + { + m_instruction = &inst; + dst.HEX = inst.bytecode[0]; + src0.HEX = inst.bytecode[1]; + src1.HEX = inst.bytecode[2]; + src2.HEX = inst.bytecode[3]; + + ensure(handle_tex_srb(inst.opcode) || handle_sct_scb(inst.opcode), "Unsupported operation"); + } + }; for (const auto &block : graph.blocks) { - // TODO: Handle block prologue if any + auto found = block_data.find(&block); + if (found != block_data.end()) + { + const auto [level, loop] = found->second; + for (int i = m_code_level; i > level; i--) + { + m_code_level--; + AddCode("}"); + } + + m_loop_count = loop; + } + if (!block.pred.empty()) { - // CFG guarantees predecessors are sorted, closest one first - for (const auto& pred : block.pred) + // Predecessors are always sorted closest last. + // This gives some adjacency info and tells us how the previous block connects to this one. 
+ const auto& pred = block.pred.back(); + switch (pred.type) { - switch (pred.type) - { - case rsx::assembler::EdgeType::ENDLOOP: - m_loop_count--; - [[ fallthrough ]]; - case rsx::assembler::EdgeType::ENDIF: - m_code_level--; - AddCode("}"); - break; - case rsx::assembler::EdgeType::LOOP: - m_loop_count++; - [[ fallthrough ]]; - case rsx::assembler::EdgeType::IF: - // Instruction will be inserted by the SIP decoder - AddCode("{"); - m_code_level++; - break; - case rsx::assembler::EdgeType::ELSE: - // This one needs more testing - m_code_level--; - AddCode("}"); - AddCode("else"); - AddCode("{"); - m_code_level++; - break; - default: - // Start a new block anyway - fmt::throw_exception("Unexpected block found"); - } + case EdgeType::LOOP: + m_loop_count++; + [[ fallthrough ]]; + case EdgeType::IF: + AddCode("{"); + m_code_level++; + break; + case EdgeType::ELSE: + AddCode("else"); + AddCode("{"); + m_code_level++; + break; + case EdgeType::ENDIF: + case EdgeType::ENDLOOP: + // Pure merge block? 
+ break; + case EdgeType::NONE: + ensure(block.instructions.empty()); + break; + default: + fmt::throw_exception("Unhandled edge type %d", static_cast(pred.type)); + break; } } + if (!block.prologue.empty()) + { + AddCode("// Prologue"); + emit_block(block.prologue); + } + + const bool early_epilogue = + !block.epilogue.empty() && + !block.succ.empty() && + (block.succ.front().type == EdgeType::IF || block.succ.front().type == EdgeType::LOOP); + for (const auto& inst : block.instructions) { + if (early_epilogue && &inst == &block.instructions.back()) + { + AddCode("// Epilogue"); + emit_block(block.epilogue); + } + m_instruction = &inst; dst.HEX = inst.bytecode[0]; @@ -1363,11 +1415,9 @@ std::string FragmentProgramDecompiler::Decompile() opflags = 0; - const u32 opcode = dst.opcode | (src1.opcode_is_branch << 6); - auto SIP = [&]() { - switch (opcode) + switch (m_instruction->opcode) { case RSX_FP_OPCODE_BRK: if (m_loop_count) AddFlowOp("break"); @@ -1377,12 +1427,10 @@ std::string FragmentProgramDecompiler::Decompile() rsx_log.error("Unimplemented SIP instruction: CAL"); break; case RSX_FP_OPCODE_FENCT: - AddCode("//FENCT"); - forced_unit = FORCE_SCT; + AddCode("// FENCT"); break; case RSX_FP_OPCODE_FENCB: - AddCode("//FENCB"); - forced_unit = FORCE_SCB; + AddCode("// FENCB"); break; case RSX_FP_OPCODE_IFE: AddCode("if($cond)"); @@ -1406,7 +1454,7 @@ std::string FragmentProgramDecompiler::Decompile() return true; }; - switch (opcode) + switch (m_instruction->opcode) { case RSX_FP_OPCODE_NOP: break; @@ -1415,19 +1463,10 @@ std::string FragmentProgramDecompiler::Decompile() AddFlowOp("_kill()"); break; default: - int prev_force_unit = forced_unit; - - // Some instructions do not respect forced unit - // Tested with Tales of Vesperia if (SIP()) break; - if (handle_tex_srb(opcode)) break; - - // FENCT/FENCB do not actually reject instructions if they dont match the forced unit - // Looks like they are optimization hints and not hard-coded forced paths - if 
(handle_sct_scb(opcode)) break; - forced_unit = FORCE_NONE; - - rsx_log.error("Unknown/illegal instruction: 0x%x (forced unit %d)", opcode, prev_force_unit); + if (handle_tex_srb(m_instruction->opcode)) break; + if (handle_sct_scb(m_instruction->opcode)) break; + rsx_log.error("Unknown/illegal instruction: 0x%x", m_instruction->opcode); break; } @@ -1435,16 +1474,28 @@ std::string FragmentProgramDecompiler::Decompile() if (dst.end) break; } - // TODO: Handle block epilogue if needed + if (!early_epilogue && !block.epilogue.empty()) + { + AddCode("// Epilogue"); + emit_block(block.epilogue); + } + + for (auto& succ : block.succ) + { + switch (succ.type) + { + case EdgeType::ENDIF: + case EdgeType::ENDLOOP: + case EdgeType::ELSE: + push_block_info(succ.to); + break; + default: + break; + } + } } - while (m_code_level > 1) - { - rsx_log.error("Hanging block found at end of shader. Malformed shader?"); - - m_code_level--; - AddCode("}"); - } + ensure(m_code_level == 1); // flush m_code_level m_code_level = 1; diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h index b68750bdfc..09a02804c3 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h @@ -1,6 +1,5 @@ #pragma once #include "ShaderParam.h" -#include "FragmentProgramRegister.h" #include "RSXFragmentProgram.h" #include "Assembler/CFG.h" @@ -53,8 +52,6 @@ class FragmentProgramDecompiler int m_code_level; std::unordered_map m_constant_offsets; - std::array temp_registers; - std::string GetMask() const; void SetDst(std::string code, u32 flags = 0); @@ -175,7 +172,6 @@ public: // Decoded properties (out) bool has_lit_op = false; - bool has_gather_op = false; bool has_no_output = false; bool has_discard_op = false; bool has_tex_op = false; diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp deleted file mode 100644 index 
a14b142df6..0000000000 --- a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp +++ /dev/null @@ -1,196 +0,0 @@ -#include "stdafx.h" -#include "FragmentProgramRegister.h" - -namespace rsx -{ - MixedPrecisionRegister::MixedPrecisionRegister() - { - std::fill(content_mask.begin(), content_mask.end(), data_type_bits::undefined); - } - - void MixedPrecisionRegister::tag_h0(bool x, bool y, bool z, bool w) - { - if (x) content_mask[0] = data_type_bits::f16; - if (y) content_mask[1] = data_type_bits::f16; - if (z) content_mask[2] = data_type_bits::f16; - if (w) content_mask[3] = data_type_bits::f16; - } - - void MixedPrecisionRegister::tag_h1(bool x, bool y, bool z, bool w) - { - if (x) content_mask[4] = data_type_bits::f16; - if (y) content_mask[5] = data_type_bits::f16; - if (z) content_mask[6] = data_type_bits::f16; - if (w) content_mask[7] = data_type_bits::f16; - } - - void MixedPrecisionRegister::tag_r(bool x, bool y, bool z, bool w) - { - if (x) content_mask[0] = content_mask[1] = data_type_bits::f32; - if (y) content_mask[2] = content_mask[3] = data_type_bits::f32; - if (z) content_mask[4] = content_mask[5] = data_type_bits::f32; - if (w) content_mask[6] = content_mask[7] = data_type_bits::f32; - } - - void MixedPrecisionRegister::tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w) - { - if (file_index == umax) - { - // First-time use. Initialize... - const u32 real_index = is_fp16 ? 
(index >> 1) : index; - file_index = real_index; - } - - if (is_fp16) - { - ensure((index / 2) == file_index); - - if (index & 1) - { - tag_h1(x, y, z, w); - return; - } - - tag_h0(x, y, z, w); - return; - } - - tag_r(x, y, z, w); - } - - std::string MixedPrecisionRegister::gather_r() const - { - const auto half_index = file_index << 1; - const std::string reg = "r" + std::to_string(file_index); - const std::string gather_half_regs[] = { - "gather(h" + std::to_string(half_index) + ")", - "gather(h" + std::to_string(half_index + 1) + ")" - }; - - std::string outputs[4]; - for (int ch = 0; ch < 4; ++ch) - { - // FIXME: This approach ignores mixed register bits. Not ideal!!!! - const auto channel0 = content_mask[ch * 2]; - const auto is_fp16_ch = channel0 == content_mask[ch * 2 + 1] && channel0 == data_type_bits::f16; - outputs[ch] = is_fp16_ch ? gather_half_regs[ch / 2] : reg; - } - - // Grouping. Only replace relevant bits... - if (outputs[0] == outputs[1]) outputs[0] = ""; - if (outputs[2] == outputs[3]) outputs[2] = ""; - - // Assemble - bool group = false; - std::string result = ""; - constexpr std::string_view swz_mask = "xyzw"; - - for (int ch = 0; ch < 4; ++ch) - { - if (outputs[ch].empty()) - { - group = true; - continue; - } - - if (!result.empty()) - { - result += ", "; - } - - if (group) - { - ensure(ch > 0); - group = false; - - if (outputs[ch] == reg) - { - result += reg + "." + swz_mask[ch - 1] + swz_mask[ch]; - continue; - } - - result += outputs[ch]; - continue; - } - - const int subch = outputs[ch] == reg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles - result += outputs[ch] + "." 
+ swz_mask[subch]; - } - - // Optimize dual-gather (128-bit gather) to use special function - const std::string double_gather = gather_half_regs[0] + ", " + gather_half_regs[1]; - if (result == double_gather) - { - result = "gather(h" + std::to_string(half_index) + ", h" + std::to_string(half_index + 1) + ")"; - } - - return "(" + result + ")"; - } - - std::string MixedPrecisionRegister::fetch_halfreg(u32 word_index) const - { - // Reads half-word 0 (H16x4) from a full real (R32x4) register - constexpr std::string_view swz_mask = "xyzw"; - const std::string reg = "r" + std::to_string(file_index); - const std::string hreg = "h" + std::to_string(file_index * 2 + word_index); - - const std::string word0_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2] + ")"; - const std::string word1_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2 + 1] + ")"; - const std::string words[] = { - "unpackHalf2x16(" + word0_bits + ")", - "unpackHalf2x16(" + word1_bits + ")" - }; - - // Assemble - std::string outputs[4]; - - ensure(word_index <= 1); - const int word_offset = word_index * 4; - for (int ch = 0; ch < 4; ++ch) - { - outputs[ch] = content_mask[ch + word_offset] == data_type_bits::f32 - ? words[ch / 2] - : hreg; - } - - // Grouping. Only replace relevant bits... - if (outputs[0] == outputs[1]) outputs[0] = ""; - if (outputs[2] == outputs[3]) outputs[2] = ""; - - // Assemble - bool group = false; - std::string result = ""; - - for (int ch = 0; ch < 4; ++ch) - { - if (outputs[ch].empty()) - { - group = true; - continue; - } - - if (!result.empty()) - { - result += ", "; - } - - if (group) - { - ensure(ch > 0); - group = false; - result += outputs[ch]; - - if (outputs[ch] == hreg) - { - result += std::string(".") + swz_mask[ch - 1] + swz_mask[ch]; - } - continue; - } - - const int subch = outputs[ch] == hreg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles - result += outputs[ch] + "." 
+ swz_mask[subch]; - } - - return "(" + result + ")"; - } -} diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h deleted file mode 100644 index 6cfc8e76c3..0000000000 --- a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h +++ /dev/null @@ -1,111 +0,0 @@ -#pragma once - -#include - -namespace rsx -{ - class MixedPrecisionRegister - { - enum data_type_bits - { - undefined = 0, - f16 = 1, - f32 = 2 - }; - - std::array content_mask; // Content details for each half-word - u32 file_index = umax; - - void tag_h0(bool x, bool y, bool z, bool w); - - void tag_h1(bool x, bool y, bool z, bool w); - - void tag_r(bool x, bool y, bool z, bool w); - - std::string fetch_halfreg(u32 word_index) const; - - public: - MixedPrecisionRegister(); - - void tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w); - - std::string gather_r() const; - - std::string split_h0() const - { - return fetch_halfreg(0); - } - - std::string split_h1() const - { - return fetch_halfreg(1); - } - - // Getters - - // Return true if all values are unwritten to (undefined) - bool floating() const - { - return file_index == umax; - } - - // Return true if the first half register is all undefined - bool floating_h0() const - { - return content_mask[0] == content_mask[1] && - content_mask[1] == content_mask[2] && - content_mask[2] == content_mask[3] && - content_mask[3] == data_type_bits::undefined; - } - - // Return true if the second half register is all undefined - bool floating_h1() const - { - return content_mask[4] == content_mask[5] && - content_mask[5] == content_mask[6] && - content_mask[6] == content_mask[7] && - content_mask[7] == data_type_bits::undefined; - } - - // Return true if any of the half-words are 16-bit - bool requires_gather(u8 channel) const - { - // Data fetched from the single precision register requires merging of the two half registers - const auto channel_offset = channel * 2; - ensure(channel_offset <= 6); - - return 
(content_mask[channel_offset] == data_type_bits::f16 || content_mask[channel_offset + 1] == data_type_bits::f16); - } - - // Return true if the entire 128-bit register is filled with 2xfp16x4 data words - bool requires_gather128() const - { - // Full 128-bit check - for (const auto& ch : content_mask) - { - if (ch == data_type_bits::f16) - { - return true; - } - } - - return false; - } - - // Return true if the half-register is polluted with fp32 data - bool requires_split(u32 word_index) const - { - const u32 content_offset = word_index * 4; - for (u32 i = 0; i < 4; ++i) - { - if (content_mask[content_offset + i] == data_type_bits::f32) - { - return true; - } - } - - return false; - } - }; -} - diff --git a/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h b/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h index f834b7c7f5..d93ec760e6 100644 --- a/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h +++ b/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h @@ -1,6 +1,7 @@ #pragma once #include "program_util.h" +#include "Assembler/FPOpcodes.h" #include #include @@ -23,76 +24,7 @@ enum register_precision RSX_FP_PRECISION_UNKNOWN = 5 // Unknown what this actually does; seems to do nothing on hwtests but then why would their compiler emit it? 
}; -enum fp_opcode -{ - RSX_FP_OPCODE_NOP = 0x00, // No-Operation - RSX_FP_OPCODE_MOV = 0x01, // Move - RSX_FP_OPCODE_MUL = 0x02, // Multiply - RSX_FP_OPCODE_ADD = 0x03, // Add - RSX_FP_OPCODE_MAD = 0x04, // Multiply-Add - RSX_FP_OPCODE_DP3 = 0x05, // 3-component Dot Product - RSX_FP_OPCODE_DP4 = 0x06, // 4-component Dot Product - RSX_FP_OPCODE_DST = 0x07, // Distance - RSX_FP_OPCODE_MIN = 0x08, // Minimum - RSX_FP_OPCODE_MAX = 0x09, // Maximum - RSX_FP_OPCODE_SLT = 0x0A, // Set-If-LessThan - RSX_FP_OPCODE_SGE = 0x0B, // Set-If-GreaterEqual - RSX_FP_OPCODE_SLE = 0x0C, // Set-If-LessEqual - RSX_FP_OPCODE_SGT = 0x0D, // Set-If-GreaterThan - RSX_FP_OPCODE_SNE = 0x0E, // Set-If-NotEqual - RSX_FP_OPCODE_SEQ = 0x0F, // Set-If-Equal - RSX_FP_OPCODE_FRC = 0x10, // Fraction (fract) - RSX_FP_OPCODE_FLR = 0x11, // Floor - RSX_FP_OPCODE_KIL = 0x12, // Kill fragment - RSX_FP_OPCODE_PK4 = 0x13, // Pack four signed 8-bit values - RSX_FP_OPCODE_UP4 = 0x14, // Unpack four signed 8-bit values - RSX_FP_OPCODE_DDX = 0x15, // Partial-derivative in x (Screen space derivative w.r.t. x) - RSX_FP_OPCODE_DDY = 0x16, // Partial-derivative in y (Screen space derivative w.r.t. 
y) - RSX_FP_OPCODE_TEX = 0x17, // Texture lookup - RSX_FP_OPCODE_TXP = 0x18, // Texture sample with projection (Projective texture lookup) - RSX_FP_OPCODE_TXD = 0x19, // Texture sample with partial differentiation (Texture lookup with derivatives) - RSX_FP_OPCODE_RCP = 0x1A, // Reciprocal - RSX_FP_OPCODE_RSQ = 0x1B, // Reciprocal Square Root - RSX_FP_OPCODE_EX2 = 0x1C, // Exponentiation base 2 - RSX_FP_OPCODE_LG2 = 0x1D, // Log base 2 - RSX_FP_OPCODE_LIT = 0x1E, // Lighting coefficients - RSX_FP_OPCODE_LRP = 0x1F, // Linear Interpolation - RSX_FP_OPCODE_STR = 0x20, // Set-If-True - RSX_FP_OPCODE_SFL = 0x21, // Set-If-False - RSX_FP_OPCODE_COS = 0x22, // Cosine - RSX_FP_OPCODE_SIN = 0x23, // Sine - RSX_FP_OPCODE_PK2 = 0x24, // Pack two 16-bit floats - RSX_FP_OPCODE_UP2 = 0x25, // Unpack two 16-bit floats - RSX_FP_OPCODE_POW = 0x26, // Power - RSX_FP_OPCODE_PKB = 0x27, // Pack bytes - RSX_FP_OPCODE_UPB = 0x28, // Unpack bytes - RSX_FP_OPCODE_PK16 = 0x29, // Pack 16 bits - RSX_FP_OPCODE_UP16 = 0x2A, // Unpack 16 - RSX_FP_OPCODE_BEM = 0x2B, // Bump-environment map (a.k.a. 2D coordinate transform) - RSX_FP_OPCODE_PKG = 0x2C, // Pack with sRGB transformation - RSX_FP_OPCODE_UPG = 0x2D, // Unpack gamma - RSX_FP_OPCODE_DP2A = 0x2E, // 2-component dot product with scalar addition - RSX_FP_OPCODE_TXL = 0x2F, // Texture sample with explicit LOD - RSX_FP_OPCODE_TXB = 0x31, // Texture sample with bias - RSX_FP_OPCODE_TEXBEM = 0x33, - RSX_FP_OPCODE_TXPBEM = 0x34, - RSX_FP_OPCODE_BEMLUM = 0x35, - RSX_FP_OPCODE_REFL = 0x36, // Reflection vector - RSX_FP_OPCODE_TIMESWTEX = 0x37, - RSX_FP_OPCODE_DP2 = 0x38, // 2-component dot product - RSX_FP_OPCODE_NRM = 0x39, // Normalize - RSX_FP_OPCODE_DIV = 0x3A, // Division - RSX_FP_OPCODE_DIVSQ = 0x3B, // Divide by Square Root - RSX_FP_OPCODE_LIF = 0x3C, // Final part of LIT - RSX_FP_OPCODE_FENCT = 0x3D, // Fence T? - RSX_FP_OPCODE_FENCB = 0x3E, // Fence B? 
- RSX_FP_OPCODE_BRK = 0x40, // Break - RSX_FP_OPCODE_CAL = 0x41, // Subroutine call - RSX_FP_OPCODE_IFE = 0x42, // If - RSX_FP_OPCODE_LOOP = 0x43, // Loop - RSX_FP_OPCODE_REP = 0x44, // Repeat - RSX_FP_OPCODE_RET = 0x45 // Return -}; +using enum rsx::assembler::FP_opcode; union OPDEST { @@ -116,6 +48,12 @@ union OPDEST u32 no_dest : 1; u32 saturate : 1; // _sat }; + + struct + { + u32 : 9; + u32 write_mask : 4; + }; }; union SRC0 @@ -164,7 +102,7 @@ union SRC1 u32 src1_prec_mod : 3; // Precision modifier for src1 (CoD:MW series) u32 src2_prec_mod : 3; // Precision modifier for src2 (unproven, should affect MAD instruction) u32 scale : 3; - u32 opcode_is_branch : 1; + u32 opcode_hi : 1; // Opcode high bit }; struct @@ -207,6 +145,23 @@ union SRC2 }; }; +union SRC_Common +{ + u32 HEX; + + struct + { + u32 reg_type : 2; + u32 tmp_reg_index : 6; + u32 fp16 : 1; + u32 swizzle_x : 2; + u32 swizzle_y : 2; + u32 swizzle_z : 2; + u32 swizzle_w : 2; + u32 neg : 1; + }; +}; + constexpr const char* rsx_fp_input_attr_regs[] = { "WPOS", "COL0", "COL1", "FOGC", "TEX0", diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index db969c4e60..88578b2e93 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -157,8 +157,11 @@ + + - + + @@ -703,8 +706,11 @@ + + - + + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index 23b7ef174d..6b15f662e5 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -136,6 +136,12 @@ {d99df916-8a99-428b-869a-9f14ac0ab411} + + {d13db076-47e4-45b9-bb8a-6b711ea40622} + + + {7fb59544-9761-4b4a-bb04-07deb43cf3c2} + @@ -1354,9 +1360,6 @@ Emu\Cell - - Emu\GPU\RSX\Program - Utilities @@ -1378,6 +1381,18 @@ Emu\GPU\RSX\Program\Assembler + + Emu\GPU\RSX\Program\Assembler\Passes\FP + + + Emu\GPU\RSX\Program\Assembler\Passes\FP + + + Emu\GPU\RSX\Program\Assembler + + + Emu\GPU\RSX\Program\Assembler + @@ -2746,9 +2761,6 @@ Emu\Audio - - Emu\GPU\RSX\Program - Utilities @@ -2776,6 +2788,18 @@ 
Emu\GPU\RSX\Program\Assembler + + Emu\GPU\RSX\Program\Assembler + + + Emu\GPU\RSX\Program\Assembler\Passes\FP + + + Emu\GPU\RSX\Program\Assembler\Passes\FP + + + Emu\GPU\RSX\Program\Assembler + diff --git a/rpcs3/tests/rpcs3_test.vcxproj b/rpcs3/tests/rpcs3_test.vcxproj index 22992e6a07..2851f2faa6 100644 --- a/rpcs3/tests/rpcs3_test.vcxproj +++ b/rpcs3/tests/rpcs3_test.vcxproj @@ -89,6 +89,7 @@ + diff --git a/rpcs3/tests/test_rsx_cfg.cpp b/rpcs3/tests/test_rsx_cfg.cpp index 1708774d76..5e22311ac3 100644 --- a/rpcs3/tests/test_rsx_cfg.cpp +++ b/rpcs3/tests/test_rsx_cfg.cpp @@ -2,89 +2,28 @@ #include "Emu/RSX/Common/simple_array.hpp" #include "Emu/RSX/Program/Assembler/CFG.h" +#include "Emu/RSX/Program/Assembler/FPASM.h" #include "Emu/RSX/Program/RSXFragmentProgram.h" #include namespace rsx::assembler { - auto swap_bytes16 = [](u32 dword) -> u32 + static const BasicBlock* get_graph_block_by_id(const FlowGraph& graph, u32 id) { - // Lazy encode, but good enough for what we need here. - union v32 - { - u32 HEX; - u8 _v[4]; - }; - - u8* src_bytes = reinterpret_cast(&dword); - v32 dst_bytes; - - dst_bytes._v[0] = src_bytes[1]; - dst_bytes._v[1] = src_bytes[0]; - dst_bytes._v[2] = src_bytes[3]; - dst_bytes._v[3] = src_bytes[2]; - - return dst_bytes.HEX; - }; - - // Instruction mocks because we don't have a working assember (yet) - auto encode_instruction = [](u32 opcode, bool end = false) -> v128 - { - OPDEST dst{}; - dst.opcode = opcode; - - if (end) - { - dst.end = 1; - } - - return v128::from32(swap_bytes16(dst.HEX), 0, 0, 0); - }; - - auto create_if(u32 end, u32 _else = 0) - { - OPDEST dst{}; - dst.opcode = RSX_FP_OPCODE_IFE & 0x3Fu; - - SRC1 src1{}; - src1.else_offset = (_else ? 
_else : end) << 2; - src1.opcode_is_branch = 1; - - SRC2 src2{}; - src2.end_offset = end << 2; - - return v128::from32(swap_bytes16(dst.HEX), 0, swap_bytes16(src1.HEX), swap_bytes16(src2.HEX)); - }; - - TEST(CFG, FpToCFG_Basic) - { - rsx::simple_array buffer = { - encode_instruction(RSX_FP_OPCODE_ADD), - encode_instruction(RSX_FP_OPCODE_MOV, true) - }; - - RSXFragmentProgram program{}; - program.data = buffer.data(); - - FlowGraph graph = deconstruct_fragment_program(program); - - EXPECT_EQ(graph.blocks.size(), 1); - EXPECT_EQ(graph.blocks.front().instructions.size(), 2); - EXPECT_EQ(graph.blocks.front().instructions.front().length, 4); - EXPECT_EQ(graph.blocks.front().instructions[0].addr, 0); - EXPECT_EQ(graph.blocks.front().instructions[1].addr, 16); + auto found = std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == id)); + return &(*found); } - TEST(CFG, FpToCFG_IF) { - rsx::simple_array buffer = { - encode_instruction(RSX_FP_OPCODE_ADD), // 0 - encode_instruction(RSX_FP_OPCODE_MOV), // 1 - create_if(4), // 2 (BR, 4) - encode_instruction(RSX_FP_OPCODE_ADD), // 3 - encode_instruction(RSX_FP_OPCODE_MOV, true), // 4 (Merge block) - }; + auto ir = FPIR::from_source(R"( + ADD R0, R0, R0; + MOV R1, R0; + IF.LT; + ADD R1, R1, R0; + ENDIF; + MOV R0, R1; + )"); const std::pair expected_block_data[3] = { { 0, 3 }, // Head @@ -93,7 +32,8 @@ namespace rsx::assembler }; RSXFragmentProgram program{}; - program.data = buffer.data(); + auto bytecode = ir.compile(); + program.data = bytecode.data(); FlowGraph graph = deconstruct_fragment_program(program); @@ -108,24 +48,26 @@ namespace rsx::assembler } // Check edges - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 3))->pred[0].type, EdgeType::IF); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[0].type, EdgeType::IF); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 4))->pred[0].type, EdgeType::ENDIF); + 
EXPECT_EQ(get_graph_block_by_id(graph, 3)->pred[0].type, EdgeType::IF); + EXPECT_EQ(get_graph_block_by_id(graph, 0)->succ[0].type, EdgeType::IF); + EXPECT_EQ(get_graph_block_by_id(graph, 4)->pred[0].type, EdgeType::ENDIF); } TEST(CFG, FpToCFG_NestedIF) { - rsx::simple_array buffer = { - encode_instruction(RSX_FP_OPCODE_ADD), // 0 - encode_instruction(RSX_FP_OPCODE_MOV), // 1 - create_if(8), // 2 (BR, 8) - encode_instruction(RSX_FP_OPCODE_ADD), // 3 - create_if(6), // 4 (BR, 6) - encode_instruction(RSX_FP_OPCODE_MOV), // 5 - encode_instruction(RSX_FP_OPCODE_MOV), // 6 (merge block 1) - encode_instruction(RSX_FP_OPCODE_ADD), // 7 - encode_instruction(RSX_FP_OPCODE_MOV, true) // 8 (merge block 2 - }; + auto ir = FPIR::from_source( + "ADD R0, R0, R0;" // 0 + "MOV R1, R0;" // 1 + "IF.LT;" // 2 (BR, 8) + " ADD R1, R1, R0;" // 3 + " IF.GT;" // 4 (BR, 6) + " MOV R3, R0;" // 5 + " ENDIF;" + " MOV R2, R3;" // 6 (merge block 1) + " ADD R1, R2, R1;" // 7 + "ENDIF;" + "MOV R0, R1;" // 8 (merge block 2 + ); const std::pair expected_block_data[5] = { { 0, 3 }, // Head @@ -136,7 +78,8 @@ namespace rsx::assembler }; RSXFragmentProgram program{}; - program.data = buffer.data(); + auto bytecode = ir.compile(); + program.data = bytecode.data(); FlowGraph graph = deconstruct_fragment_program(program); @@ -153,17 +96,19 @@ namespace rsx::assembler TEST(CFG, FpToCFG_NestedIF_MultiplePred) { - rsx::simple_array buffer = { - encode_instruction(RSX_FP_OPCODE_ADD), // 0 - encode_instruction(RSX_FP_OPCODE_MOV), // 1 - create_if(6), // 2 (BR, 6) - encode_instruction(RSX_FP_OPCODE_ADD), // 3 - create_if(6), // 4 (BR, 6) - encode_instruction(RSX_FP_OPCODE_MOV), // 5 - encode_instruction(RSX_FP_OPCODE_MOV), // 6 (merge block) - encode_instruction(RSX_FP_OPCODE_ADD), // 7 - encode_instruction(RSX_FP_OPCODE_MOV, true) // 8 - }; + auto ir = FPIR::from_source( + "ADD R0, R0, R0;" // 0 + "MOV R1, R0;" // 1 + "IF.LT;" // 2 (BR, 6) + " ADD R1, R1, R0;" // 3 + " IF.GT;" // 4 (BR, 6) + " MOV R3, R0;" // 5 
+ " ENDIF;" // ENDIF (4) + "ENDIF;" // ENDIF (2) + "MOV R2, R3;" // 6 (merge block, unified) + "ADD R1, R2, R1;" // 7 + "MOV R0, R1;" // 8 + ); const std::pair expected_block_data[4] = { { 0, 3 }, // Head @@ -173,7 +118,8 @@ namespace rsx::assembler }; RSXFragmentProgram program{}; - program.data = buffer.data(); + auto bytecode = ir.compile(); + program.data = bytecode.data(); FlowGraph graph = deconstruct_fragment_program(program); @@ -187,32 +133,40 @@ namespace rsx::assembler EXPECT_EQ(it->instructions.size(), expected.second); } + const BasicBlock + *bb0 = get_graph_block_by_id(graph, 0), + *bb6 = get_graph_block_by_id(graph, 6); + // Predecessors must be ordered, closest first - ASSERT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred.size(), 2); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred[0].type, EdgeType::ENDIF); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred[0].from->id, 3); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred[1].type, EdgeType::ENDIF); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred[1].from->id, 0); + ASSERT_EQ(bb6->pred.size(), 3); + EXPECT_EQ(bb6->pred[0].type, EdgeType::ENDIF); + EXPECT_EQ(bb6->pred[0].from->id, 5); + EXPECT_EQ(bb6->pred[1].type, EdgeType::ENDIF); + EXPECT_EQ(bb6->pred[1].from->id, 3); + EXPECT_EQ(bb6->pred[2].type, EdgeType::ENDIF); + EXPECT_EQ(bb6->pred[2].from->id, 0); // Successors must also be ordered, closest first - ASSERT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ.size(), 2); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[0].type, EdgeType::IF); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[0].to->id, 3); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[1].type, 
EdgeType::ENDIF); - EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[1].to->id, 6); + ASSERT_EQ(bb0->succ.size(), 2); + EXPECT_EQ(bb0->succ[0].type, EdgeType::IF); + EXPECT_EQ(bb0->succ[0].to->id, 3); + EXPECT_EQ(bb0->succ[1].type, EdgeType::ENDIF); + EXPECT_EQ(bb0->succ[1].to->id, 6); } TEST(CFG, FpToCFG_IF_ELSE) { - rsx::simple_array buffer = { - encode_instruction(RSX_FP_OPCODE_ADD), // 0 - encode_instruction(RSX_FP_OPCODE_MOV), // 1 - create_if(6, 4), // 2 (BR, 6) - encode_instruction(RSX_FP_OPCODE_ADD), // 3 - encode_instruction(RSX_FP_OPCODE_MOV), // 4 (Else) - encode_instruction(RSX_FP_OPCODE_ADD), // 5 - encode_instruction(RSX_FP_OPCODE_MOV, true), // 6 (Merge) - }; + auto ir = FPIR::from_source( + "ADD R0, R0, R0;" // 0 + "MOV R1, R0;" // 1 + "IF.LT;" // 2 (BR, 6) + " ADD R1, R1, R0;" // 3 + "ELSE;" // ELSE (2) + " MOV R2, R3;" // 4 + " ADD R1, R2, R1;" // 5 + "ENDIF;" // ENDIF (2) + "MOV R0, R1;" // 6 (merge) + ); const std::pair expected_block_data[4] = { { 0, 3 }, // Head @@ -222,7 +176,8 @@ namespace rsx::assembler }; RSXFragmentProgram program{}; - program.data = buffer.data(); + auto bytecode = ir.compile(); + program.data = bytecode.data(); FlowGraph graph = deconstruct_fragment_program(program); @@ -235,5 +190,24 @@ namespace rsx::assembler EXPECT_EQ(it->id, expected.first); EXPECT_EQ(it->instructions.size(), expected.second); } + + // The IF and ELSE branches don't link to each other directly. Their predecessor should point to both and they both point to the merge. 
+ const BasicBlock + *bb0 = get_graph_block_by_id(graph, 0), + *bb3 = get_graph_block_by_id(graph, 3), + *bb4 = get_graph_block_by_id(graph, 4), + *bb6 = get_graph_block_by_id(graph, 6); + + EXPECT_EQ(bb0->succ.size(), 3); + EXPECT_EQ(bb3->succ.size(), 1); + EXPECT_EQ(bb4->succ.size(), 1); + + EXPECT_EQ(bb3->succ.front().to, bb6); + EXPECT_EQ(bb4->succ.front().to, bb6); + + EXPECT_EQ(bb6->pred.size(), 3); + EXPECT_EQ(bb6->pred[0].from, bb4); + EXPECT_EQ(bb6->pred[1].from, bb3); + EXPECT_EQ(bb6->pred[2].from, bb0); } } diff --git a/rpcs3/tests/test_rsx_fp_asm.cpp b/rpcs3/tests/test_rsx_fp_asm.cpp new file mode 100644 index 0000000000..704df5a23d --- /dev/null +++ b/rpcs3/tests/test_rsx_fp_asm.cpp @@ -0,0 +1,761 @@ +#include + +#include "Emu/RSX/Common/simple_array.hpp" +#include "Emu/RSX/Program/Assembler/FPASM.h" +#include "Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h" +#include "Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.h" +#include "Emu/RSX/Program/RSXFragmentProgram.h" + +namespace rsx::assembler +{ +#define DECLARE_REG32(num)\ + Register R##num{ .id = num, .f16 = false } + +#define DECLARE_REG16(num)\ + Register H##num{ .id = num, .f16 = true } + + DECLARE_REG32(0); + DECLARE_REG32(1); + DECLARE_REG32(2); + DECLARE_REG32(3); + DECLARE_REG32(4); + DECLARE_REG32(5); + DECLARE_REG32(6); + DECLARE_REG32(7); + DECLARE_REG32(8); + + DECLARE_REG16(0); + DECLARE_REG16(1); + DECLARE_REG16(2); + DECLARE_REG16(3); + DECLARE_REG16(4); + DECLARE_REG16(5); + DECLARE_REG16(6); + DECLARE_REG16(7); + DECLARE_REG16(8); + +#undef DECLARE_REG32 +#undef DECLARE_REG16 + + static const BasicBlock* get_graph_block(const FlowGraph& graph, u32 index) + { + ensure(index < graph.blocks.size()); + for (auto it = graph.blocks.begin(); it != graph.blocks.end(); ++it) + { + if (!index) + { + return &(*it); + } + index--; + } + return nullptr; + }; + + static FlowGraph CFG_from_source(const std::string& asm_) + { + auto ir = FPIR::from_source(asm_); + + 
FlowGraph graph{}; + graph.blocks.push_back({}); + + auto& bb = graph.blocks.back(); + bb.instructions = ir.instructions(); + return graph; + } + + TEST(TestFPIR, FromSource) + { + auto ir = FPIR::from_source(R"( + MOV R0, #{ 0.125 }; + ADD R1, R0, R0; + )"); + + const auto instructions = ir.instructions(); + + ASSERT_EQ(instructions.size(), 2); + + EXPECT_EQ(OPDEST{ .HEX = instructions[0].bytecode[0] }.end, 0); + EXPECT_EQ(OPDEST{ .HEX = instructions[0].bytecode[0] }.opcode, RSX_FP_OPCODE_MOV); + EXPECT_EQ(SRC0{ .HEX = instructions[0].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_CONSTANT); + EXPECT_EQ(OPDEST{ .HEX = instructions[0].bytecode[0] }.opcode, RSX_FP_OPCODE_MOV); + EXPECT_EQ(instructions[0].length, 8); + + EXPECT_EQ(OPDEST{ .HEX = instructions[1].bytecode[0] }.end, 1); + EXPECT_EQ(OPDEST{ .HEX = instructions[1].bytecode[0] }.opcode, RSX_FP_OPCODE_ADD); + EXPECT_EQ(OPDEST{ .HEX = instructions[1].bytecode[0] }.dest_reg, 1); + EXPECT_EQ(OPDEST{ .HEX = instructions[1].bytecode[0] }.fp16, 0); + EXPECT_EQ(SRC0{ .HEX = instructions[1].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(instructions[1].length, 4); + } + + TEST(TestFPIR, RegisterAnnotationPass) + { + // Code snippet reads from R0, R1 and H4, clobbers R1, H0 + auto graph = CFG_from_source(R"( + ADD R1, R0, R1; + MOV H0, H4; + )"); + + ASSERT_EQ(graph.blocks.size(), 1); + ASSERT_EQ(graph.blocks.front().instructions.size(), 2); + + auto& block = graph.blocks.front(); + RSXFragmentProgram prog{}; + FP::RegisterAnnotationPass annotation_pass{ prog }; + + annotation_pass.run(graph); + + ASSERT_EQ(block.clobber_list.size(), 2); + ASSERT_EQ(block.input_list.size(), 3); + + EXPECT_EQ(block.clobber_list[0].reg, H0); + EXPECT_EQ(block.clobber_list[1].reg, R1); + + EXPECT_EQ(block.input_list[0].reg, H4); + EXPECT_EQ(block.input_list[1].reg, R0); + EXPECT_EQ(block.input_list[2].reg, R1); + } + + TEST(TestFPIR, RegisterAnnotationPass_MixedIO) + { + // Code snippet reads from R0, R1, clobbers R0, 
R1, H0. + // The H2 read does not count because R1 is clobbered. + auto graph = CFG_from_source(R"( + ADD R1, R0, R1; + PK8U R0, R1; + MOV H0, H2; + )"); + + ASSERT_EQ(graph.blocks.size(), 1); + ASSERT_EQ(graph.blocks.front().instructions.size(), 3); + + auto& block = graph.blocks.front(); + RSXFragmentProgram prog{}; + FP::RegisterAnnotationPass annotation_pass{ prog }; + + annotation_pass.run(graph); + + ASSERT_EQ(block.clobber_list.size(), 3); + ASSERT_EQ(block.input_list.size(), 2); + + EXPECT_EQ(block.clobber_list[0].reg, H0); + EXPECT_EQ(block.clobber_list[1].reg, R0); + EXPECT_EQ(block.clobber_list[2].reg, R1); + + EXPECT_EQ(block.input_list[0].reg, R0); + EXPECT_EQ(block.input_list[1].reg, R1); + } + + TEST(TestFPIR, RegisterDependencyPass_Simple16) + { + // Instruction 2 clobers R0 which in turn clobbers H0. + // Instruction 3 reads from H0 so a barrier16 is needed between them. + auto graph = CFG_from_source(R"( + ADD R1, R0, R1; + PK8U R0, R1; + MOV H2, H0; + )"); + + ASSERT_EQ(graph.blocks.size(), 1); + ASSERT_EQ(graph.blocks.front().instructions.size(), 3); + + auto& block = graph.blocks.front(); + RSXFragmentProgram prog{}; + + FP::RegisterAnnotationPass annotation_pass{ prog }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + ASSERT_EQ(block.instructions.size(), 5); + + // H0.xy = unpackHalf2(r0.x); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.opcode, RSX_FP_OPCODE_UP2); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.fp16, 1); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_x, true); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_y, true); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_z, false); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_w, false); + EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(SRC0{ .HEX = 
block.instructions[2].bytecode[1] }.tmp_reg_index, 0); + EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.fp16, 0); + EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.swizzle_x, 0); + + // H0.zw = unpackHalf2(r0.y); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.opcode, RSX_FP_OPCODE_UP2); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_x, false); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_y, false); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_z, true); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_w, true); + EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.tmp_reg_index, 0); + EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.fp16, 0); + EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.swizzle_x, 1); + } + + TEST(TestFPIR, RegisterDependencyPass_Simple32) + { + // Instruction 2 clobers H1 which in turn clobbers R0. + // Instruction 3 reads from R0 so a barrier32 is needed between them. 
+ auto graph = CFG_from_source(R"( + ADD R1, R0, R1; + MOV H1, R1 + MOV R2, R0; + )"); + + ASSERT_EQ(graph.blocks.size(), 1); + ASSERT_EQ(graph.blocks.front().instructions.size(), 3); + + auto& block = graph.blocks.front(); + RSXFragmentProgram prog{}; + + FP::RegisterAnnotationPass annotation_pass{ prog }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + ASSERT_EQ(block.instructions.size(), 5); + + // R0.z = packHalf2(H1.xy); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.fp16, 0); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.dest_reg, 0); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_x, false); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_y, false); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_z, true); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_w, false); + EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.tmp_reg_index, 1); + EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.fp16, 1); + EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.swizzle_x, 0); + EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.swizzle_y, 1); + + // R0.w = packHalf2(H1.zw); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.fp16, 0); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.dest_reg, 0); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_x, false); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_y, false); + EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_z, false); + EXPECT_EQ(OPDEST{ .HEX = 
block.instructions[3].bytecode[0] }.mask_w, true); + EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.tmp_reg_index, 1); + EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.fp16, 1); + EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.swizzle_x, 2); + EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.swizzle_y, 3); + } + + TEST(TestFPIR, RegisterDependencyPass_Complex_IF_BothPredecessorsClobber) + { + // Multi-level but only single IF + // Mockup of a simple lighting function, R0 = Light vector, R1 = Decompressed normal. DP4 used for simplicity. + // Data hazards sprinkled in for testing. R3 is clobbered in the ancestor and the IF branch. + // Barrier should go in the IF branch here. + auto ir = FPIR::from_source(R"( + DP4 R2, R0, R1 + SFL R3 + SGT R3, R2, R0 + IF.GE + ADD R0, R0, R2 + MOV H6, #{ 0.25 } + ENDIF + ADD R0, R0, R3 + MOV R1, R0 + )"); + + auto bytecode = ir.compile(); + + RSXFragmentProgram prog{}; + prog.data = bytecode.data(); + + auto graph = deconstruct_fragment_program(prog); + auto bb0 = get_graph_block(graph, 0); + auto bb1 = get_graph_block(graph, 1); + auto bb2 = get_graph_block(graph, 2); + + FP::RegisterAnnotationPass annotation_pass{ prog }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + ASSERT_EQ(bb0->instructions.size(), 4); + ASSERT_EQ(bb1->instructions.size(), 2); + ASSERT_EQ(bb2->instructions.size(), 2); + + // bb1 has a epilogue + ASSERT_EQ(bb1->epilogue.size(), 2); + + // bb1 epilogue updates R3.xy + + // R3.x = packHalf2(H6.xy) + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.fp16, 0); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.dest_reg, 3); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_x, true); + EXPECT_EQ(OPDEST{ .HEX 
= bb1->epilogue[0].bytecode[0] }.mask_y, false); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_z, false); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_w, false); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.tmp_reg_index, 6); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.fp16, 1); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_x, 0); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_y, 1); + + // R3.y = packHalf2(H6.zw) + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.fp16, 0); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.dest_reg, 3); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_x, false); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_y, true); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_z, false); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_w, false); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.tmp_reg_index, 6); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.fp16, 1); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_x, 2); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_y, 3); + } + + TEST(TestFPIR, RegisterDependencyPass_Complex_IF_ELSE_OneBranchClobbers) + { + // Single IF-ELSE, if clobbers, ELSE does not + auto ir = FPIR::from_source(R"( + DP4 R2, R0, R1 + SFL R3 + SGT R3, R2, R0 + IF.GE + ADD R0, R0, R2 + MOV H6, #{ 0.25 } + ELSE + ADD R0, R0, R1 + ENDIF + ADD R0, R0, R3 + MOV R1, R0 + )"); + + auto bytecode = ir.compile(); + + RSXFragmentProgram prog{}; + prog.data = bytecode.data(); + auto graph = deconstruct_fragment_program(prog); + + 
ASSERT_EQ(graph.blocks.size(), 4); + + FP::RegisterAnnotationPass annotation_pass{ prog }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + const BasicBlock + *bb0 = get_graph_block(graph, 0), + *bb1 = get_graph_block(graph, 1), + *bb2 = get_graph_block(graph, 2), + *bb3 = get_graph_block(graph, 3); + + ASSERT_EQ(bb0->instructions.size(), 4); + ASSERT_EQ(bb1->instructions.size(), 2); + ASSERT_EQ(bb2->instructions.size(), 1); + ASSERT_EQ(bb3->instructions.size(), 2); + + // bb1 has a epilogue + ASSERT_EQ(bb0->epilogue.size(), 0); + ASSERT_EQ(bb1->epilogue.size(), 2); + ASSERT_EQ(bb2->epilogue.size(), 0); + + // bb1 epilogue updates R3.xy + + // R3.x = packHalf2(H6.xy) + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.fp16, 0); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.dest_reg, 3); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_x, true); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_y, false); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_z, false); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_w, false); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.tmp_reg_index, 6); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.fp16, 1); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_x, 0); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_y, 1); + + // R3.y = packHalf2(H6.zw) + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.fp16, 0); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.dest_reg, 3); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_x, false); + EXPECT_EQ(OPDEST{ .HEX = 
bb1->epilogue[1].bytecode[0] }.mask_y, true); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_z, false); + EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_w, false); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.tmp_reg_index, 6); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.fp16, 1); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_x, 2); + EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_y, 3); + } + + + TEST(TestFPIR, RegisterDependencyPass_Complex_IF_ELSE_Simpsons) + { + // Complex IF-ELSE nest observed in Simpson's game. Rewritten for simplicity. + // There is no tail block. No epilogues should be injected in this scenario since H4 (the trigger) is defined on all branches. + // R2 is indeed clobbered but the outer ELSE branch should not be able to see the inner IF-ELSE blocks as predecessors. + auto ir = FPIR::from_source(R"( + MOV R2, #{ 0.25 }; + IF.GT; + SLT R4, H2, #{ 0.125 }; + IF.GT; + ADD H2, H0, H3; + FMA H4, R2, H2, H3; + ELSE; + MOV H2, #{ 0.125 }; + ADD H0, H0, H2; + FMA H4, R2, H2, H3; + ENDIF; + ELSE; + FMA H4, R2, H2, H3; + MOV H0, H4; + ENDIF; + )"); + + auto bytecode = ir.compile(); + + RSXFragmentProgram prog{}; + prog.data = bytecode.data(); + auto graph = deconstruct_fragment_program(prog); + + ASSERT_EQ(graph.blocks.size(), 6); + + FP::RegisterAnnotationPass annotation_pass{ prog }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + const BasicBlock + *bb0 = get_graph_block(graph, 0), + *bb1 = get_graph_block(graph, 1), + *bb2 = get_graph_block(graph, 2), + *bb3 = get_graph_block(graph, 3), + *bb4 = get_graph_block(graph, 4), + *bb5 = get_graph_block(graph, 5); + + // Sanity + EXPECT_EQ(bb0->instructions.size(), 2); + EXPECT_EQ(bb1->instructions.size(), 2); + EXPECT_EQ(bb2->instructions.size(), 2); + 
EXPECT_EQ(bb3->instructions.size(), 3); + EXPECT_EQ(bb4->instructions.size(), 2); + EXPECT_EQ(bb5->instructions.size(), 0); // Phi/Merge only. + + // Nested children must recursively fall out to the closest ENDIF + ASSERT_EQ(bb4->pred.size(), 1); + EXPECT_EQ(bb4->pred.front().type, EdgeType::ELSE); + EXPECT_EQ(bb5->pred.size(), 4); // 2 IF and 2 ELSE paths exist + + // Check that we get no epilogues + EXPECT_EQ(bb0->epilogue.size(), 0); + EXPECT_EQ(bb1->epilogue.size(), 0); + EXPECT_EQ(bb2->epilogue.size(), 0); + EXPECT_EQ(bb3->epilogue.size(), 0); + EXPECT_EQ(bb4->epilogue.size(), 0); + EXPECT_EQ(bb5->epilogue.size(), 0); + } + + TEST(TestFPIR, RegisterDependencyPass_Partial32_0) + { + // Instruction 2 partially clobers H1 which in turn clobbers R0. + // Instruction 3 reads from R0 so a partial barrier32 is needed between them. + auto graph = CFG_from_source(R"( + ADD R1, R0, R1; + MOV H1.x, R1.x; + MOV R2, R0; + )"); + + ASSERT_EQ(graph.blocks.size(), 1); + ASSERT_EQ(graph.blocks.front().instructions.size(), 3); + + auto& block = graph.blocks.front(); + RSXFragmentProgram prog{}; + + FP::RegisterAnnotationPass annotation_pass{ prog }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + ASSERT_EQ(block.instructions.size(), 4); + + OPDEST dst{ .HEX = block.instructions[2].bytecode[0] }; + SRC0 src0{ .HEX = block.instructions[2].bytecode[1] }; + SRC1 src1{ .HEX = block.instructions[2].bytecode[2] }; + + const u32 opcode = dst.opcode | (src1.opcode_hi << 6); + + // R0.z = packHalf2(H1.xy); + EXPECT_EQ(opcode, RSX_FP_OPCODE_OR16_LO); + EXPECT_EQ(dst.fp16, 0); + EXPECT_EQ(dst.dest_reg, 0); + EXPECT_EQ(dst.mask_x, false); + EXPECT_EQ(dst.mask_y, false); + EXPECT_EQ(dst.mask_z, true); + EXPECT_EQ(dst.mask_w, false); + EXPECT_EQ(src0.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(src0.tmp_reg_index, 0); + EXPECT_EQ(src0.fp16, 0); + EXPECT_EQ(src0.swizzle_x, 2); + EXPECT_EQ(src1.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + 
TEST(TestFPIR, RegisterDependencyPass_Partial32_1)
{
	// Instruction 2 partially clobbers H1 (only the .y lane), which in turn clobbers R0.
	// Instruction 3 reads from R0, so a partial barrier32 is needed between them.
	auto graph = CFG_from_source(R"(
		ADD R1, R0, R1;
		MOV H1.y, R1.y;
		MOV R2, R0;
	)");

	ASSERT_EQ(graph.blocks.size(), 1);
	ASSERT_EQ(graph.blocks.front().instructions.size(), 3);

	auto& block = graph.blocks.front();
	RSXFragmentProgram prog{};

	FP::RegisterAnnotationPass annotation_pass{ prog };
	FP::RegisterDependencyPass deps_pass{};

	annotation_pass.run(graph);
	deps_pass.run(graph);

	// Exactly one barrier instruction is expected, at index 2 (3 -> 4 instructions)
	ASSERT_EQ(block.instructions.size(), 4);

	const auto& barrier = block.instructions[2];
	OPDEST barrier_dst{ .HEX = barrier.bytecode[0] };
	SRC0 barrier_src0{ .HEX = barrier.bytecode[1] };
	SRC1 barrier_src1{ .HEX = barrier.bytecode[2] };

	// The opcode is split: low bits live in the OPDEST word, the high part in SRC1
	const u32 barrier_opcode = barrier_dst.opcode | (barrier_src1.opcode_hi << 6);

	// Expected barrier: OR16_HI writing R0.z, with R0.z as src0 and H1.y as src1
	EXPECT_EQ(barrier_opcode, RSX_FP_OPCODE_OR16_HI);
	EXPECT_EQ(barrier_dst.fp16, 0);
	EXPECT_EQ(barrier_dst.dest_reg, 0);
	EXPECT_EQ(barrier_dst.mask_x, false);
	EXPECT_EQ(barrier_dst.mask_y, false);
	EXPECT_EQ(barrier_dst.mask_z, true);
	EXPECT_EQ(barrier_dst.mask_w, false);
	EXPECT_EQ(barrier_src0.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
	EXPECT_EQ(barrier_src0.tmp_reg_index, 0);
	EXPECT_EQ(barrier_src0.fp16, 0);
	EXPECT_EQ(barrier_src0.swizzle_x, 2);
	EXPECT_EQ(barrier_src1.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
	EXPECT_EQ(barrier_src1.tmp_reg_index, 1);
	EXPECT_EQ(barrier_src1.fp16, 1);
	EXPECT_EQ(barrier_src1.swizzle_x, 1);
}
+ auto graph = CFG_from_source(R"( + ADD R1, R0, R1; + MOV H1, R1 + MOV R0, R0; + )"); + + ASSERT_EQ(graph.blocks.size(), 1); + ASSERT_EQ(graph.blocks.front().instructions.size(), 3); + + auto& block = graph.blocks.front(); + RSXFragmentProgram prog{}; + + FP::RegisterAnnotationPass annotation_pass{ prog, { .skip_delay_slots = true } }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + // Delay slot detection will cause no dependency injection + ASSERT_EQ(block.instructions.size(), 3); + } + + TEST(TestFPIR, RegisterDependencyPass_Skip_IF_ELSE_Ancestors) + { + // R4/H8 is clobbered but an IF-ELSE chain follows it. + // Merge block reads H8, but since both IF-ELSE legs resolve the dependency, we do not need a barrier for H8. + // H6 is included as a control. + auto ir = FPIR::from_source(R"( + MOV R4, #{ 0.25 } + MOV H6.x, #{ 0.125 } + IF.LT + MOV H8, #{ 0.0 } + ELSE + MOV H8, #{ 0.25 } + ENDIF + ADD R0, R3, H8 + )"); + + auto bytecode = ir.compile(); + RSXFragmentProgram prog{}; + prog.data = bytecode.data(); + auto graph = deconstruct_fragment_program(prog); + + // Verify state before + ASSERT_EQ(graph.blocks.size(), 4); + EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 3); + EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 3)->instructions.size(), 1); + + FP::RegisterAnnotationPass annotation_pass{ prog, {.skip_delay_slots = true } }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + // We get one barrier on R3 (H6) but nont for R4 (H8) + EXPECT_EQ(get_graph_block(graph, 0)->epilogue.size(), 1); + + // No intra-block barriers + EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 3); + EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 1); + 
TEST(TestFPIR, RegisterDependencyPass_Process_IF_Ancestors)
{
	// H8.x is clobbered but only an IF sequence follows, with no ELSE.
	// The merge block reads R4. Unlike the IF-ELSE case, a barrier is still expected in the
	// ancestor (block 0) here - presumably because the path that skips the IF body never rewrites R4.x.
	auto ir = FPIR::from_source(R"(
		MOV H8.x, #{ 0.25 }
		IF.LT
			MOV R4.x, #{ 0.0 }
		ENDIF
		MOV R0, R4
	)");

	auto bytecode = ir.compile();
	RSXFragmentProgram prog{};
	prog.data = bytecode.data();
	auto graph = deconstruct_fragment_program(prog);

	constexpr u32 block_count = 3;
	constexpr u32 expected_instruction_count[block_count] = { 2, 1, 1 };
	constexpr u32 expected_epilogue_size[block_count] = { 1, 0, 0 };

	// Verify state before
	ASSERT_EQ(graph.blocks.size(), block_count);
	for (u32 i = 0; i < block_count; ++i)
	{
		EXPECT_EQ(get_graph_block(graph, i)->instructions.size(), expected_instruction_count[i]) << "block " << i;
	}

	FP::RegisterAnnotationPass annotation_pass{ prog, { .skip_delay_slots = true } };
	FP::RegisterDependencyPass deps_pass{};

	annotation_pass.run(graph);
	deps_pass.run(graph);

	// The instruction streams stay as they were...
	for (u32 i = 0; i < block_count; ++i)
	{
		EXPECT_EQ(get_graph_block(graph, i)->instructions.size(), expected_instruction_count[i]) << "block " << i;
	}

	// ...and a single barrier lands in the block 0 epilogue only
	for (u32 i = 0; i < block_count; ++i)
	{
		EXPECT_EQ(get_graph_block(graph, i)->epilogue.size(), expected_epilogue_size[i]) << "block " << i;
	}
}
+ // 1 full barrier is needed for R4 (4 instructions) + auto ir = FPIR::from_source(R"( + MOV R4, #{ 0.0 } + IF.LT + MOV H9, #{ 0.25 } + ENDIF + MOV H8, #{ 0.25 } + IF.LT + IF.GT + ADD R0, R0, R0 + ELSE + ADD R0, R1, R0 + ENDIF + ENDIF + ADD R0, R0, R4 + )"); + + auto bytecode = ir.compile(); + RSXFragmentProgram prog{}; + prog.data = bytecode.data(); + auto graph = deconstruct_fragment_program(prog); + + // Verify state before + ASSERT_EQ(graph.blocks.size(), 7); + EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 2); + EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 2); + EXPECT_EQ(get_graph_block(graph, 3)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 4)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 5)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 6)->instructions.size(), 1); + + FP::RegisterAnnotationPass annotation_pass{ prog, {.skip_delay_slots = true } }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + // Full-lane barrier on writing blocks + EXPECT_EQ(get_graph_block(graph, 1)->epilogue.size(), 2); + EXPECT_EQ(get_graph_block(graph, 2)->epilogue.size(), 2); + + EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 2); + EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 2); + EXPECT_EQ(get_graph_block(graph, 3)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 4)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 5)->instructions.size(), 1); + EXPECT_EQ(get_graph_block(graph, 6)->instructions.size(), 1); + } + + TEST(TestFPIR, RegisterDependencyPass_SplinterCell_DelaySlot) + { + // Real shader pattern found in splinter cell blacklist. + // TEX instructions replaced with MOV for simplicity. + // There are no dependent reads here, no barriers are expected. 
+ // In the game, instruction 4 was misclassified as a delay slot, causing a skipped clobber. + auto ir = FPIR::from_source(R"( + MOV R0.w, #{ 0.25 } + MOV H0, H8 + MUL R0.w, H0.w, R0.w + MOV R0.xyz, H0.xyz + MOV R1, #{ 0.25 } + FMA H0, R0, #{ 0.125 }, R1 + )"); + + auto bytecode = ir.compile(); + RSXFragmentProgram prog{}; + prog.data = bytecode.data(); + auto graph = deconstruct_fragment_program(prog); + + // Verify state before + ASSERT_EQ(graph.blocks.size(), 1); + EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 6); + + FP::RegisterAnnotationPass annotation_pass{ prog, {.skip_delay_slots = true } }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + // Verify state after + EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 6); + EXPECT_EQ(get_graph_block(graph, 0)->epilogue.size(), 0); + } +} diff --git a/rpcs3/tests/test_simple_array.cpp b/rpcs3/tests/test_simple_array.cpp index 8d64599b96..ebedff861d 100644 --- a/rpcs3/tests/test_simple_array.cpp +++ b/rpcs3/tests/test_simple_array.cpp @@ -324,6 +324,40 @@ namespace rsx EXPECT_EQ(arr.find_if(FN(x == 99)), nullptr); } + TEST(SimpleArray, InsertArray) + { + rsx::simple_array arr{ + 0, 1, 2, 6, 7, 8, 9 + }; + + const std::vector tail{ + 10, 11, 12 + }; + + const std::vector mid{ + 3, 4, 5 + }; + + // Insert end + arr.insert(arr.end(), tail); + EXPECT_EQ(arr.size(), 10); + + // Insert mid + auto it = arr.begin(); + std::advance(it, 3); + it = arr.insert(it, mid); + + EXPECT_EQ(arr.size(), 13); + EXPECT_EQ(std::distance(arr.begin(), it), 3); + EXPECT_EQ(*it, 3); + + // Verify + for (unsigned i = 0; i < arr.size(); ++i) + { + EXPECT_EQ(arr[i], static_cast(i)); + } + } + TEST(AlignedAllocator, Alloc) { auto ptr = rsx::aligned_allocator::malloc<256>(16);