mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-12-16 04:09:07 +00:00
Merge branch 'feat/clans' of github.com:zephyrcodesstuff/rpcs3 into feat/clans
Signed-off-by: zeph <zephyrzefa15@gmail.com>
This commit is contained in:
commit
6afa8085be
@ -193,6 +193,7 @@ if(BUILD_RPCS3_TESTS)
|
||||
tests/test_simple_array.cpp
|
||||
tests/test_address_range.cpp
|
||||
tests/test_rsx_cfg.cpp
|
||||
tests/test_rsx_fp_asm.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(rpcs3_test
|
||||
|
||||
@ -518,12 +518,15 @@ target_sources(rpcs3_emu PRIVATE
|
||||
RSX/Overlays/overlay_video.cpp
|
||||
RSX/Overlays/Shaders/shader_loading_dialog.cpp
|
||||
RSX/Overlays/Shaders/shader_loading_dialog_native.cpp
|
||||
RSX/Program/Assembler/FPASM.cpp
|
||||
RSX/Program/Assembler/FPOpcodes.cpp
|
||||
RSX/Program/Assembler/FPToCFG.cpp
|
||||
RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp
|
||||
RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp
|
||||
RSX/Program/CgBinaryProgram.cpp
|
||||
RSX/Program/CgBinaryFragmentProgram.cpp
|
||||
RSX/Program/CgBinaryVertexProgram.cpp
|
||||
RSX/Program/FragmentProgramDecompiler.cpp
|
||||
RSX/Program/FragmentProgramRegister.cpp
|
||||
RSX/Program/GLSLCommon.cpp
|
||||
RSX/Program/ProgramStateCache.cpp
|
||||
RSX/Program/program_util.cpp
|
||||
|
||||
@ -337,7 +337,7 @@ namespace rsx
|
||||
AUDIT(_loc < _size);
|
||||
|
||||
const auto remaining = (_size - _loc);
|
||||
memmove(pos + 1, pos, remaining * sizeof(Ty));
|
||||
std::memmove(pos + 1, pos, remaining * sizeof(Ty));
|
||||
|
||||
*pos = val;
|
||||
_size++;
|
||||
@ -365,7 +365,7 @@ namespace rsx
|
||||
AUDIT(_loc < _size);
|
||||
|
||||
const u32 remaining = (_size - _loc);
|
||||
memmove(pos + 1, pos, remaining * sizeof(Ty));
|
||||
std::memmove(pos + 1, pos, remaining * sizeof(Ty));
|
||||
|
||||
*pos = val;
|
||||
_size++;
|
||||
@ -373,6 +373,31 @@ namespace rsx
|
||||
return pos;
|
||||
}
|
||||
|
||||
// Inserts a copy of every element of `values` in front of the element `where` points at.
// A position at or beyond the current end appends instead.
// Returns an iterator to the first inserted element (or to the insertion point when
// `values` is empty).
// Elements are relocated with memmove/memcpy, i.e. they are treated as plain bytes.
// NOTE(review): `values` presumably must not view this container's own storage, because
// reserve() is called before the copy - confirm against callers.
iterator insert(iterator where, span_like<Ty> auto const& values)
{
	ensure(where >= _data);
	const auto _loc = offset(where);
	const auto in_size = static_cast<u32>(values.size());
	const auto in_size_bytes = in_size * sizeof(Ty);

	if (!in_size)
	{
		// Nothing to copy. Leave before passing potentially null pointers to memcpy/memmove,
		// which is undefined even for a zero byte count.
		return (_loc >= _size) ? (_data + _size) : (_data + _loc);
	}

	reserve(_size + in_size);

	if (_loc >= _size)
	{
		// Append
		where = _data + _size;
		std::memcpy(where, values.data(), in_size_bytes);
		_size += in_size;
		return where;
	}

	// Keep the byte count in size_t so that it cannot be truncated
	const auto remaining_bytes = (_size - _loc) * sizeof(Ty);
	where = _data + _loc;

	// Open a gap of in_size elements, then fill it
	std::memmove(where + in_size, where, remaining_bytes);
	std::memmove(where, values.data(), in_size_bytes);
	_size += in_size;
	return where;
}
|
||||
|
||||
void operator += (const rsx::simple_array<Ty>& that)
|
||||
{
|
||||
const auto old_size = _size;
|
||||
|
||||
@ -34,6 +34,11 @@ namespace rsx::assembler
|
||||
}
|
||||
};
|
||||
|
||||
struct CFGPass
|
||||
{
|
||||
virtual void run(FlowGraph& graph) = 0;
|
||||
};
|
||||
|
||||
FlowGraph deconstruct_fragment_program(const RSXFragmentProgram& prog);
|
||||
}
|
||||
|
||||
|
||||
455
rpcs3/Emu/RSX/Program/Assembler/FPASM.cpp
Normal file
455
rpcs3/Emu/RSX/Program/Assembler/FPASM.cpp
Normal file
@ -0,0 +1,455 @@
|
||||
#include "stdafx.h"
|
||||
#include "FPASM.h"
|
||||
#include "Emu/RSX/Program/RSXFragmentProgram.h"
|
||||
|
||||
#include <stack>
|
||||
|
||||
#ifndef _WIN32
|
||||
#define sscanf_s sscanf
|
||||
#endif
|
||||
|
||||
namespace rsx::assembler
|
||||
{
|
||||
struct FP_opcode_encoding_t
|
||||
{
|
||||
FP_opcode op;
|
||||
bool exec_if_lt;
|
||||
bool exec_if_eq;
|
||||
bool exec_if_gt;
|
||||
bool set_cond;
|
||||
};
|
||||
|
||||
static std::unordered_map<std::string_view, FP_opcode_encoding_t> s_opcode_lookup
|
||||
{
|
||||
// Arithmetic
|
||||
{ "NOP", { .op = RSX_FP_OPCODE_NOP, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "MOV", { .op = RSX_FP_OPCODE_MOV, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "MUL", { .op = RSX_FP_OPCODE_MUL, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "ADD", { .op = RSX_FP_OPCODE_ADD, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "MAD", { .op = RSX_FP_OPCODE_MAD, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "FMA", { .op = RSX_FP_OPCODE_MAD, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "DP3", { .op = RSX_FP_OPCODE_DP3, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "DP4", { .op = RSX_FP_OPCODE_DP4, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
|
||||
// Constant load
|
||||
{ "SFL", {.op = RSX_FP_OPCODE_SFL, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "STR", {.op = RSX_FP_OPCODE_STR, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
|
||||
// Pack-unpack operations are great for testing dependencies
|
||||
{ "PKH", { .op = RSX_FP_OPCODE_PK2, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "UPH", { .op = RSX_FP_OPCODE_UP2, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "PK16U", { .op = RSX_FP_OPCODE_PK16, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "UP16U", { .op = RSX_FP_OPCODE_UP16, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "PK8U", { .op = RSX_FP_OPCODE_PKB, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "UP8U", { .op = RSX_FP_OPCODE_UPB, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "PK8G", { .op = RSX_FP_OPCODE_PKG, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "UP8G", { .op = RSX_FP_OPCODE_UPG, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "PK8S", { .op = RSX_FP_OPCODE_PK4, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "UP8S", { .op = RSX_FP_OPCODE_UP4, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
|
||||
// Basic conditionals
|
||||
{ "IF.LT", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = true, .exec_if_eq = false, .exec_if_gt = false, .set_cond = false } },
|
||||
{ "IF.LE", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = true, .exec_if_eq = true, .exec_if_gt = false, .set_cond = false } },
|
||||
{ "IF.EQ", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = false, .exec_if_eq = true, .exec_if_gt = false, .set_cond = false } },
|
||||
{ "IF.GE", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = false, .exec_if_eq = true, .exec_if_gt = true, .set_cond = false } },
|
||||
{ "IF.GT", { .op = RSX_FP_OPCODE_IFE, .exec_if_lt = false, .exec_if_eq = false, .exec_if_gt = true, .set_cond = false } },
|
||||
|
||||
{ "SLT", { .op = RSX_FP_OPCODE_SLT, .exec_if_lt = false, .exec_if_eq = false, .exec_if_gt = false, .set_cond = true } },
|
||||
{ "SEQ", { .op = RSX_FP_OPCODE_SEQ, .exec_if_lt = false, .exec_if_eq = false, .exec_if_gt = false, .set_cond = true } },
|
||||
{ "SGT", { .op = RSX_FP_OPCODE_SGT, .exec_if_lt = false, .exec_if_eq = false, .exec_if_gt = false, .set_cond = true } },
|
||||
|
||||
// TODO: Add more
|
||||
|
||||
};
|
||||
|
||||
// Encodes `ref` as a temp-register read in source slot `operand` (bytecode word operand + 1).
// When `prev` is null a new blank instruction is appended and used instead.
// Returns the instruction that was patched. The pointer refers to an element of
// m_instructions, so it is only valid until that vector grows again.
Instruction* FPIR::load(const RegisterRef& ref, int operand, Instruction* prev)
{
	if (!prev)
	{
		m_instructions.push_back({});
		prev = &m_instructions.back();
	}

	auto& word = prev->bytecode[operand + 1];

	SRC_Common src{ .HEX = word };
	src.reg_type = RSX_FP_REGISTER_TYPE_TEMP;
	src.fp16 = ref.reg.f16 ? 1 : 0;
	src.tmp_reg_index = static_cast<u32>(ref.reg.id);

	// Identity swizzle (xyzw)
	src.swizzle_x = 0;
	src.swizzle_y = 1;
	src.swizzle_z = 2;
	src.swizzle_w = 3;

	word = src.HEX;
	return prev;
}
|
||||
|
||||
// Encodes an inline literal as source slot `operand` (bytecode word operand + 1) and stores
// the four constant values in bytecode words 4..7, growing the instruction length to 8.
// When `prev` is null a new blank instruction is appended and used instead.
// Only one literal per instruction is supported; this is enforced via the length check.
// Returns the instruction that was patched (an element of m_instructions).
Instruction* FPIR::load(const std::array<f32, 4>& constants, int operand, Instruction* prev)
{
	Instruction* target = prev;
	if (!target)
	{
		m_instructions.push_back({});
		target = &m_instructions.back();
	}

	// Unsupported for now
	ensure(target->length == 4, "FPIR cannot encode more than one constant load per instruction");

	SRC_Common src{ .HEX = target->bytecode[operand + 1] };
	src.reg_type = RSX_FP_REGISTER_TYPE_CONSTANT;

	// Identity swizzle (xyzw). This has to happen before the word is written back,
	// otherwise the swizzle bits never reach the bytecode.
	src.swizzle_x = 0;
	src.swizzle_y = 1;
	src.swizzle_z = 2;
	src.swizzle_w = 3;

	target->bytecode[operand + 1] = src.HEX;

	// Embed literal constant
	std::memcpy(&target->bytecode[4], constants.data(), 4 * sizeof(u32));
	target->length = 8;
	return target;
}
|
||||
|
||||
// Encodes `ref` as the destination register of an instruction (bytecode word 0):
// register index, fp16 flag, write mask and precision.
// When `prev` is null a new blank instruction is appended and used instead.
// Returns the instruction that was patched (an element of m_instructions).
Instruction* FPIR::store(const RegisterRef& ref, Instruction* prev)
{
	if (!prev)
	{
		m_instructions.push_back({});
		prev = &m_instructions.back();
	}

	const bool is_half = ref.reg.f16 ? true : false;

	OPDEST dst{ .HEX = prev->bytecode[0] };
	dst.dest_reg = static_cast<u32>(ref.reg.id);
	dst.fp16 = is_half ? 1 : 0;
	dst.write_mask = ref.mask;
	dst.prec = is_half ? RSX_FP_PRECISION_HALF : RSX_FP_PRECISION_REAL;

	prev->bytecode[0] = dst.HEX;
	return prev;
}
|
||||
|
||||
void FPIR::mov(const RegisterRef& dst, f32 constant)
|
||||
{
|
||||
Instruction* inst = store(dst);
|
||||
inst = load(std::array<f32, 4>{ constant, constant, constant, constant }, 0);
|
||||
inst->opcode = RSX_FP_OPCODE_MOV;
|
||||
}
|
||||
|
||||
void FPIR::mov(const RegisterRef& dst, const RegisterRef& src)
|
||||
{
|
||||
Instruction* inst = store(dst);
|
||||
inst = load(src, 0);
|
||||
inst->opcode = RSX_FP_OPCODE_MOV;
|
||||
}
|
||||
|
||||
void FPIR::add(const RegisterRef& dst, const std::array<f32, 4>& constants)
|
||||
{
|
||||
Instruction* inst = store(dst);
|
||||
inst = load(constants, 0);
|
||||
inst->opcode = RSX_FP_OPCODE_ADD;
|
||||
}
|
||||
|
||||
void FPIR::add(const RegisterRef& dst, const RegisterRef& src)
|
||||
{
|
||||
Instruction* inst = store(dst);
|
||||
inst = load(src, 0);
|
||||
inst->opcode = RSX_FP_OPCODE_ADD;
|
||||
}
|
||||
|
||||
const std::vector<Instruction>& FPIR::instructions() const
|
||||
{
|
||||
return m_instructions;
|
||||
}
|
||||
|
||||
std::vector<u32> FPIR::compile() const
|
||||
{
|
||||
std::vector<u32> result;
|
||||
result.reserve(m_instructions.size() * 4);
|
||||
|
||||
for (const auto& inst : m_instructions)
|
||||
{
|
||||
const auto src = reinterpret_cast<const be_t<u16>*>(inst.bytecode);
|
||||
for (u32 j = 0; j < inst.length; ++j)
|
||||
{
|
||||
const u16 low = src[j * 2];
|
||||
const u16 hi = src[j * 2 + 1];
|
||||
const u32 word = static_cast<u16>(low) | (static_cast<u32>(hi) << 16u);
|
||||
result.push_back(word);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Assembles a textual fragment program into IR.
//
// Statements are separated by newlines or ';'. Each statement is "OP [dst [src ...]]" with
// tokens separated by spaces and/or commas. Registers are written as a one-character prefix
// followed by an index and an optional ".xyzw" lane mask (an 'H'/'h' prefix selects fp16).
// Literals are written as "#{v}" or "#{x, y, z, w}".
// "IF.xx" / "ELSE" / "ENDIF" are tracked on a stack and the else/end offsets are patched into
// the opening IF instruction. The last emitted instruction gets the END flag.
//
// Raises through ensure()/fmt::throw_exception on unknown mnemonics, malformed literals,
// operand count mismatches and ELSE/ENDIF/ENDLOOP without a matching opener.
FPIR FPIR::from_source(std::string_view asm_)
{
	std::vector<std::string> instructions = fmt::split(asm_, { "\n", ";" });
	if (instructions.empty())
	{
		return {};
	}

	// Normalizes one statement: token separators become ',' and the commas inside a
	// "{...}" literal become '|' so that the literal survives the later split on ','.
	auto transform_inst = [](std::string_view s)
	{
		std::string result;
		result.reserve(s.size());

		bool literal = false;
		for (const auto& c : s)
		{
			if (c == ' ')
			{
				if (!literal && !result.empty() && result.back() != ',')
				{
					result += ','; // Replace token separator space with comma
				}
				continue;
			}

			// std::isspace is undefined for negative values, so go through unsigned char
			if (std::isspace(static_cast<unsigned char>(c)))
			{
				continue;
			}

			if (!literal && c == '{')
			{
				literal = true;
			}

			if (literal && c == '}')
			{
				literal = false;
			}

			if (c == ',')
			{
				result += (literal ? '|' : ',');
				continue;
			}

			result += c;
		}
		return result;
	};

	// Splits a statement into mnemonic, destination and sources. Outputs are left untouched
	// when the statement is blank.
	auto decode_instruction = [&](std::string_view inst, std::string& op, std::string& dst, std::vector<std::string>& sources)
	{
		const auto i = transform_inst(inst);
		if (i.empty())
		{
			return;
		}

		const auto tokens = fmt::split(i, { "," });
		ensure(!tokens.empty(), "Invalid input");

		op = tokens.front();

		if (tokens.size() > 1)
		{
			dst = tokens[1];
		}

		for (size_t n = 2; n < tokens.size(); ++n)
		{
			sources.push_back(tokens[n]);
		}
	};

	// Parses "<prefix><index>[.lanes]" into a RegisterRef
	auto get_ref = [](std::string_view reg)
	{
		ensure(reg.length() > 1, "Invalid register specifier");

		const auto parts = fmt::split(reg, { "." });
		ensure(parts.size() > 0 && parts.size() <= 2);

		const auto index = std::stoi(parts[0].substr(1));
		RegisterRef ref
		{
			.reg { .id = index, .f16 = false },
			.mask = 0x0F
		};

		if (parts.size() > 1 && parts[1].length() > 0)
		{
			// FIXME: No swizzles for now, just lane masking
			ref.mask = 0;
			if (parts[1].find("x") != std::string::npos) ref.mask |= (1u << 0);
			if (parts[1].find("y") != std::string::npos) ref.mask |= (1u << 1);
			if (parts[1].find("z") != std::string::npos) ref.mask |= (1u << 2);
			if (parts[1].find("w") != std::string::npos) ref.mask |= (1u << 3);
		}

		if (reg[0] == 'H' || reg[0] == 'h')
		{
			ref.reg.f16 = true;
		}

		return ref;
	};

	// Parses a normalized literal: "#{x|y|z|w}" or "#{v}" (replicated to all lanes)
	auto get_constants = [](std::string_view reg) -> std::array<f32, 4>
	{
		// sscanf requires a NUL-terminated buffer, which a string_view does not guarantee
		const std::string text(reg);

		float x = 0.f, y = 0.f, z = 0.f, w = 0.f;
		if (sscanf_s(text.c_str(), "#{%f|%f|%f|%f}", &x, &y, &z, &w) == 4)
		{
			return { x, y, z, w };
		}

		if (sscanf_s(text.c_str(), "#{%f}", &x) == 1)
		{
			return { x, x, x, x };
		}

		fmt::throw_exception("Invalid constant literal");
	};

	auto encode_branch_else = [](Instruction* inst, u32 end)
	{
		SRC1 src1{ .HEX = inst->bytecode[2] };
		src1.else_offset = static_cast<u32>(end);
		inst->bytecode[2] = src1.HEX;
	};

	auto encode_branch_end = [](Instruction* inst, u32 end)
	{
		SRC2 src2{ .HEX = inst->bytecode[3] };
		src2.end_offset = static_cast<u32>(end);
		inst->bytecode[3] = src2.HEX;

		// Without an ELSE the else offset is made to coincide with the end offset
		SRC1 src1{ .HEX = inst->bytecode[2] };
		if (!src1.else_offset)
		{
			src1.else_offset = static_cast<u32>(end);
			inst->bytecode[2] = src1.HEX;
		}
	};

	// Writes the opcode and the execution/condition flags of a mnemonic into the instruction
	auto encode_opcode = [](std::string_view op, Instruction* inst)
	{
		OPDEST d0{ .HEX = inst->bytecode[0] };
		SRC0 s0{ .HEX = inst->bytecode[1] };
		SRC1 s1{ .HEX = inst->bytecode[2] };

		const auto found = s_opcode_lookup.find(op);
		if (found == s_opcode_lookup.end())
		{
			fmt::throw_exception("Unhandled instruction '%s'", op);
		}
		const auto& encoding = found->second;

		inst->opcode = encoding.op;
		d0.opcode = encoding.op & 0x3F;
		s1.opcode_hi = (encoding.op > 0x3F) ? 1 : 0;
		s0.exec_if_eq = encoding.exec_if_eq ? 1 : 0;
		s0.exec_if_gr = encoding.exec_if_gt ? 1 : 0;
		s0.exec_if_lt = encoding.exec_if_lt ? 1 : 0;
		d0.set_cond = encoding.set_cond ? 1 : 0;
		inst->bytecode[0] = d0.HEX;
		inst->bytecode[1] = s0.HEX;
		inst->bytecode[2] = s1.HEX;
	};

	std::string op, dst;
	std::vector<std::string> sources;

	// Indices (into ir.m_instructions) of the currently open IF / LOOP instructions
	std::stack<size_t> if_ops;
	std::stack<size_t> loop_ops;

	// Running offset of the next instruction: +4 per instruction, +4 more when it carries a literal
	u32 pc = 0;

	FPIR ir{};

	for (const auto& instruction : instructions)
	{
		op.clear();
		dst.clear();
		sources.clear();
		decode_instruction(instruction, op, dst, sources);

		if (op.empty())
		{
			continue;
		}

		if (op.starts_with("IF."))
		{
			if_ops.push(ir.m_instructions.size());
		}
		else if (op == "LOOP")
		{
			loop_ops.push(ir.m_instructions.size());
		}
		else if (op == "ELSE")
		{
			ensure(!if_ops.empty());
			encode_branch_else(&ir.m_instructions[if_ops.top()], pc);
			continue;
		}
		else if (op == "ENDIF")
		{
			ensure(!if_ops.empty());
			encode_branch_end(&ir.m_instructions[if_ops.top()], pc);
			if_ops.pop();
			continue;
		}
		else if (op == "ENDLOOP")
		{
			ensure(!loop_ops.empty());
			encode_branch_end(&ir.m_instructions[loop_ops.top()], pc);
			loop_ops.pop();
			continue;
		}

		ir.m_instructions.push_back({});
		Instruction* target = &ir.m_instructions.back();
		pc += 4;

		encode_opcode(op, target);
		ensure(sources.size() == FP::get_operand_count(static_cast<FP_opcode>(target->opcode)), "Invalid operand count for opcode");

		if (dst.empty())
		{
			// No destination token: flag the instruction as having no sink
			OPDEST d0{ .HEX = target->bytecode[0] };
			d0.no_dest = 1;
			target->bytecode[0] = d0.HEX;
		}
		else
		{
			ir.store(get_ref(dst), target);
		}

		int operand = 0;
		bool has_literal = false;
		for (const auto& source : sources)
		{
			if (source.front() == '#')
			{
				const auto literal = get_constants(source);
				ir.load(literal, operand++, target);
				has_literal = true;
				continue;
			}

			ir.load(get_ref(source), operand++, target);
		}

		if (has_literal)
		{
			pc += 4;
		}
	}

	if (!ir.m_instructions.empty())
	{
		// Mark the final instruction as the end of the program
		OPDEST d0{ .HEX = ir.m_instructions.back().bytecode[0] };
		d0.end = 1;

		ir.m_instructions.back().bytecode[0] = d0.HEX;
	}

	return ir;
}
|
||||
}
|
||||
29
rpcs3/Emu/RSX/Program/Assembler/FPASM.h
Normal file
29
rpcs3/Emu/RSX/Program/Assembler/FPASM.h
Normal file
@ -0,0 +1,29 @@
|
||||
#pragma once
|
||||
|
||||
#include "IR.h"
|
||||
|
||||
namespace rsx::assembler
|
||||
{
|
||||
class FPIR
|
||||
{
|
||||
public:
|
||||
void mov(const RegisterRef& dst, f32 constant);
|
||||
void mov(const RegisterRef& dst, const RegisterRef& src);
|
||||
|
||||
void add(const RegisterRef& dst, const std::array<f32, 4>& constants);
|
||||
void add(const RegisterRef& dst, const RegisterRef& src);
|
||||
|
||||
const std::vector<Instruction>& instructions() const;
|
||||
std::vector<u32> compile() const;
|
||||
|
||||
static FPIR from_source(std::string_view asm_);
|
||||
|
||||
private:
|
||||
Instruction* load(const RegisterRef& reg, int operand, Instruction* target = nullptr);
|
||||
Instruction* load(const std::array<f32, 4>& constants, int operand, Instruction* target = nullptr);
|
||||
Instruction* store(const RegisterRef& reg, Instruction* target = nullptr);
|
||||
|
||||
std::vector<Instruction> m_instructions;
|
||||
};
|
||||
}
|
||||
|
||||
428
rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.cpp
Normal file
428
rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.cpp
Normal file
@ -0,0 +1,428 @@
|
||||
#include "stdafx.h"
|
||||
#include "FPOpcodes.h"
|
||||
|
||||
#include "Emu/RSX/Common/simple_array.hpp"
|
||||
#include "Emu/RSX/Program/RSXFragmentProgram.h"
|
||||
|
||||
#include <unordered_set>
|
||||
|
||||
namespace rsx::assembler::FP
|
||||
{
|
||||
// Number of source operands taken by `opcode`.
// Flow-control opcodes and unknown values report 0. Opcodes flagged as unimplemented throw.
u8 get_operand_count(FP_opcode opcode)
{
	switch (opcode)
	{
	// No source operands
	case RSX_FP_OPCODE_NOP:
	case RSX_FP_OPCODE_KIL:
	case RSX_FP_OPCODE_STR:
	case RSX_FP_OPCODE_SFL:
		return 0;

	// Flow control. Special registers are provided for these outside the common file
	case RSX_FP_OPCODE_FENCT:
	case RSX_FP_OPCODE_FENCB:
	case RSX_FP_OPCODE_BRK:
	case RSX_FP_OPCODE_CAL:
	case RSX_FP_OPCODE_IFE:
	case RSX_FP_OPCODE_LOOP:
	case RSX_FP_OPCODE_REP:
	case RSX_FP_OPCODE_RET:
		return 0;

	// Unary
	case RSX_FP_OPCODE_MOV:
	case RSX_FP_OPCODE_FRC:
	case RSX_FP_OPCODE_FLR:
	case RSX_FP_OPCODE_PK4:
	case RSX_FP_OPCODE_UP4:
	case RSX_FP_OPCODE_DDX:
	case RSX_FP_OPCODE_DDY:
	case RSX_FP_OPCODE_TEX:
	case RSX_FP_OPCODE_TXD:
	case RSX_FP_OPCODE_TXP:
	case RSX_FP_OPCODE_RCP:
	case RSX_FP_OPCODE_RSQ:
	case RSX_FP_OPCODE_EX2:
	case RSX_FP_OPCODE_LG2:
	case RSX_FP_OPCODE_LIT:
	case RSX_FP_OPCODE_COS:
	case RSX_FP_OPCODE_SIN:
	case RSX_FP_OPCODE_PK2:
	case RSX_FP_OPCODE_UP2:
	case RSX_FP_OPCODE_PKB:
	case RSX_FP_OPCODE_UPB:
	case RSX_FP_OPCODE_PK16:
	case RSX_FP_OPCODE_UP16:
	case RSX_FP_OPCODE_PKG:
	case RSX_FP_OPCODE_UPG:
	case RSX_FP_OPCODE_NRM:
	case RSX_FP_OPCODE_LIF:
		return 1;

	// Binary
	case RSX_FP_OPCODE_MUL:
	case RSX_FP_OPCODE_ADD:
	case RSX_FP_OPCODE_DP3:
	case RSX_FP_OPCODE_DP4:
	case RSX_FP_OPCODE_DST:
	case RSX_FP_OPCODE_MIN:
	case RSX_FP_OPCODE_MAX:
	case RSX_FP_OPCODE_SLT:
	case RSX_FP_OPCODE_SGE:
	case RSX_FP_OPCODE_SLE:
	case RSX_FP_OPCODE_SGT:
	case RSX_FP_OPCODE_SNE:
	case RSX_FP_OPCODE_SEQ:
	case RSX_FP_OPCODE_TXL:
	case RSX_FP_OPCODE_TXB:
	case RSX_FP_OPCODE_DP2:
	case RSX_FP_OPCODE_DIV:
	case RSX_FP_OPCODE_DIVSQ:
	case RSX_FP_OPCODE_REFL:
		return 2;

	// Ternary
	case RSX_FP_OPCODE_MAD:
	case RSX_FP_OPCODE_LRP:
	case RSX_FP_OPCODE_DP2A:
		return 3;

	// The rest are unimplemented and not encountered in real software.
	// TODO: Probe these on real PS3 and figure out what they actually do.
	case RSX_FP_OPCODE_POW:
		fmt::throw_exception("Unimplemented POW instruction."); // Unused
	case RSX_FP_OPCODE_BEM:
	case RSX_FP_OPCODE_TEXBEM:
	case RSX_FP_OPCODE_TXPBEM:
	case RSX_FP_OPCODE_BEMLUM:
		fmt::throw_exception("Unimplemented BEM class instruction"); // Unused
	case RSX_FP_OPCODE_TIMESWTEX:
		fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused

	default:
		return 0;
	}
}
|
||||
|
||||
// Returns a lane mask for the given operand.
|
||||
// The lane mask is the fixed function hardware lane so swizzles need to be applied on top to resolve the real data channel.
|
||||
u32 get_src_vector_lane_mask(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand)
|
||||
{
|
||||
constexpr u32 x = 0b0001;
|
||||
constexpr u32 y = 0b0010;
|
||||
constexpr u32 z = 0b0100;
|
||||
constexpr u32 w = 0b1000;
|
||||
constexpr u32 xy = 0b0011;
|
||||
constexpr u32 xyz = 0b0111;
|
||||
constexpr u32 xyzw = 0b1111;
|
||||
|
||||
const auto decode = [&](const rsx::simple_array<u32>& masks) -> u32
|
||||
{
|
||||
return operand < masks.size()
|
||||
? masks[operand]
|
||||
: 0u;
|
||||
};
|
||||
|
||||
auto opcode = static_cast<FP_opcode>(instruction->opcode);
|
||||
if (operand >= get_operand_count(opcode))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
OPDEST d0 { .HEX = instruction->bytecode[0] };
|
||||
const u32 dst_write_mask = d0.no_dest ? 0 : d0.write_mask;
|
||||
|
||||
switch (opcode)
|
||||
{
|
||||
case RSX_FP_OPCODE_NOP:
|
||||
return 0;
|
||||
case RSX_FP_OPCODE_MOV:
|
||||
case RSX_FP_OPCODE_MUL:
|
||||
case RSX_FP_OPCODE_ADD:
|
||||
case RSX_FP_OPCODE_MAD:
|
||||
return xyzw & dst_write_mask;
|
||||
case RSX_FP_OPCODE_DP3:
|
||||
return xyz;
|
||||
case RSX_FP_OPCODE_DP4:
|
||||
return xyzw;
|
||||
case RSX_FP_OPCODE_DST:
|
||||
return decode({ y | z, y | w });
|
||||
case RSX_FP_OPCODE_MIN:
|
||||
case RSX_FP_OPCODE_MAX:
|
||||
return xyzw & dst_write_mask;
|
||||
case RSX_FP_OPCODE_SLT:
|
||||
case RSX_FP_OPCODE_SGE:
|
||||
case RSX_FP_OPCODE_SLE:
|
||||
case RSX_FP_OPCODE_SGT:
|
||||
case RSX_FP_OPCODE_SNE:
|
||||
case RSX_FP_OPCODE_SEQ:
|
||||
return xyzw & dst_write_mask;
|
||||
case RSX_FP_OPCODE_FRC:
|
||||
case RSX_FP_OPCODE_FLR:
|
||||
return xyzw & dst_write_mask;
|
||||
case RSX_FP_OPCODE_KIL:
|
||||
return 0;
|
||||
case RSX_FP_OPCODE_PK4:
|
||||
return xyzw;
|
||||
case RSX_FP_OPCODE_UP4:
|
||||
return x;
|
||||
case RSX_FP_OPCODE_DDX:
|
||||
case RSX_FP_OPCODE_DDY:
|
||||
return xyzw & dst_write_mask;
|
||||
case RSX_FP_OPCODE_TEX:
|
||||
case RSX_FP_OPCODE_TXD:
|
||||
switch (prog.get_texture_dimension(d0.tex_num))
|
||||
{
|
||||
case rsx::texture_dimension_extended::texture_dimension_1d:
|
||||
return x;
|
||||
case rsx::texture_dimension_extended::texture_dimension_2d:
|
||||
return xy;
|
||||
case rsx::texture_dimension_extended::texture_dimension_3d:
|
||||
case rsx::texture_dimension_extended::texture_dimension_cubemap:
|
||||
return xyz;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
case RSX_FP_OPCODE_TXP:
|
||||
switch (prog.get_texture_dimension(d0.tex_num))
|
||||
{
|
||||
case rsx::texture_dimension_extended::texture_dimension_1d:
|
||||
return xy;
|
||||
case rsx::texture_dimension_extended::texture_dimension_2d:
|
||||
return xyz;
|
||||
case rsx::texture_dimension_extended::texture_dimension_3d:
|
||||
case rsx::texture_dimension_extended::texture_dimension_cubemap:
|
||||
return xyzw;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
case RSX_FP_OPCODE_RCP:
|
||||
case RSX_FP_OPCODE_RSQ:
|
||||
case RSX_FP_OPCODE_EX2:
|
||||
case RSX_FP_OPCODE_LG2:
|
||||
return x;
|
||||
case RSX_FP_OPCODE_LIT:
|
||||
return xyzw;
|
||||
case RSX_FP_OPCODE_LRP:
|
||||
return xyzw & dst_write_mask;
|
||||
case RSX_FP_OPCODE_STR:
|
||||
case RSX_FP_OPCODE_SFL:
|
||||
return xyzw & dst_write_mask;
|
||||
case RSX_FP_OPCODE_COS:
|
||||
case RSX_FP_OPCODE_SIN:
|
||||
return x;
|
||||
case RSX_FP_OPCODE_PK2:
|
||||
return xy;
|
||||
case RSX_FP_OPCODE_UP2:
|
||||
return x;
|
||||
case RSX_FP_OPCODE_PKB:
|
||||
return xyzw;
|
||||
case RSX_FP_OPCODE_UPB:
|
||||
return x;
|
||||
case RSX_FP_OPCODE_PK16:
|
||||
return xy;
|
||||
case RSX_FP_OPCODE_UP16:
|
||||
return x;
|
||||
case RSX_FP_OPCODE_PKG:
|
||||
return xyzw;
|
||||
case RSX_FP_OPCODE_UPG:
|
||||
return x;
|
||||
case RSX_FP_OPCODE_DP2A:
|
||||
return decode({ xy, xy, x });
|
||||
case RSX_FP_OPCODE_TXL:
|
||||
case RSX_FP_OPCODE_TXB:
|
||||
return decode({ xy, x });
|
||||
case RSX_FP_OPCODE_REFL:
|
||||
return xyzw;
|
||||
case RSX_FP_OPCODE_DP2:
|
||||
return xy;
|
||||
case RSX_FP_OPCODE_NRM:
|
||||
return xyz;
|
||||
case RSX_FP_OPCODE_DIV:
|
||||
case RSX_FP_OPCODE_DIVSQ:
|
||||
return decode({ xyzw, x }) & dst_write_mask;
|
||||
case RSX_FP_OPCODE_LIF:
|
||||
return decode({ y | w });
|
||||
case RSX_FP_OPCODE_FENCT:
|
||||
case RSX_FP_OPCODE_FENCB:
|
||||
case RSX_FP_OPCODE_BRK:
|
||||
case RSX_FP_OPCODE_CAL:
|
||||
case RSX_FP_OPCODE_IFE:
|
||||
case RSX_FP_OPCODE_LOOP:
|
||||
case RSX_FP_OPCODE_REP:
|
||||
case RSX_FP_OPCODE_RET:
|
||||
// Flow control. Special registers are provided for these outside the common file
|
||||
return 0;
|
||||
|
||||
case RSX_FP_OPCODE_POW:
|
||||
fmt::throw_exception("Unimplemented POW instruction."); // Unused ??
|
||||
case RSX_FP_OPCODE_BEM:
|
||||
case RSX_FP_OPCODE_TEXBEM:
|
||||
case RSX_FP_OPCODE_TXPBEM:
|
||||
case RSX_FP_OPCODE_BEMLUM:
|
||||
fmt::throw_exception("Unimplemented BEM class instruction"); // Unused
|
||||
case RSX_FP_OPCODE_TIMESWTEX:
|
||||
fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Resolved vector lane mask with swizzles applied.
|
||||
u32 get_src_vector_lane_mask_shuffled(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand)
|
||||
{
|
||||
// Brute-force this. There's only 16 permutations.
|
||||
constexpr u32 x = 0b0001;
|
||||
constexpr u32 y = 0b0010;
|
||||
constexpr u32 z = 0b0100;
|
||||
constexpr u32 w = 0b1000;
|
||||
|
||||
const u32 lane_mask = get_src_vector_lane_mask(prog, instruction, operand);
|
||||
if (!lane_mask)
|
||||
{
|
||||
return lane_mask;
|
||||
}
|
||||
|
||||
// Now we resolve matching lanes.
|
||||
// This sequence can be drastically sped up using lookup tables but that will come later.
|
||||
std::unordered_set<u32> inputs;
|
||||
SRC_Common src { .HEX = instruction->bytecode[operand + 1] };
|
||||
|
||||
if (src.reg_type != RSX_FP_REGISTER_TYPE_TEMP)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (lane_mask & x) inputs.insert(src.swizzle_x);
|
||||
if (lane_mask & y) inputs.insert(src.swizzle_y);
|
||||
if (lane_mask & z) inputs.insert(src.swizzle_z);
|
||||
if (lane_mask & w) inputs.insert(src.swizzle_w);
|
||||
|
||||
u32 result = 0;
|
||||
if (inputs.contains(0)) result |= x;
|
||||
if (inputs.contains(1)) result |= y;
|
||||
if (inputs.contains(2)) result |= z;
|
||||
if (inputs.contains(3)) result |= w;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
bool is_delay_slot(const Instruction* instruction)
|
||||
{
|
||||
OPDEST dst { .HEX = instruction->bytecode[0] };
|
||||
SRC0 src0 { .HEX = instruction->bytecode[1] };
|
||||
SRC1 src1{ .HEX = instruction->bytecode[2] };
|
||||
|
||||
if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV
|
||||
dst.no_dest || // Must have a sink
|
||||
src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg
|
||||
dst.dest_reg != src0.tmp_reg_index || // Must be a write-to-self
|
||||
dst.fp16 || // Always full lane. We need to collect more data on this but it won't matter
|
||||
dst.saturate || // Precision modifier
|
||||
(dst.prec != RSX_FP_PRECISION_REAL &&
|
||||
dst.prec != RSX_FP_PRECISION_UNKNOWN)) // Cannot have precision modifiers
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if we have precision modifiers on the source
|
||||
if (src0.abs || src0.neg || src1.scale)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst.mask_x && src0.swizzle_x != 0) return false;
|
||||
if (dst.mask_y && src0.swizzle_y != 1) return false;
|
||||
if (dst.mask_z && src0.swizzle_z != 2) return false;
|
||||
if (dst.mask_w && src0.swizzle_w != 3) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Describes the temp register read by source operand `operand`: register index, fp16 flag
// and the components consumed after swizzling.
// Returns a default-constructed RegisterRef when the operand is not a temp register or
// when no component is read.
RegisterRef get_src_register(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand)
{
	const SRC_Common src{ .HEX = instruction->bytecode[operand + 1] };

	const bool is_temp = (src.reg_type == RSX_FP_REGISTER_TYPE_TEMP);
	const u32 read_lanes = is_temp
		? get_src_vector_lane_mask_shuffled(prog, instruction, operand)
		: 0u;

	if (!read_lanes)
	{
		return {};
	}

	RegisterRef ref{ .mask = read_lanes };
	ref.reg.f16 = !!src.fp16;
	ref.reg.id = src.tmp_reg_index;
	return ref;
}
|
||||
|
||||
// Describes the register written by `instruction`: index, fp16 flag and write mask.
// Returns a default-constructed RegisterRef when the instruction has no destination.
RegisterRef get_dst_register(const Instruction* instruction)
{
	const OPDEST dst { .HEX = instruction->bytecode[0] };

	if (!dst.no_dest)
	{
		RegisterRef ref{ .mask = dst.write_mask };
		ref.reg.f16 = dst.fp16;
		ref.reg.id = dst.dest_reg;
		return ref;
	}

	return {};
}
|
||||
|
||||
// Convert vector mask to file range
|
||||
rsx::simple_array<u32> get_register_file_range(const RegisterRef& reg)
|
||||
{
|
||||
if (!reg.mask)
|
||||
{
|
||||
return {};
|
||||
}
|
||||
|
||||
constexpr u32 register_file_max_len = 48 * 8; // H0 - H47, R0 - R23
|
||||
|
||||
const u32 lane_width = reg.reg.f16 ? 2 : 4;
|
||||
const u32 file_offset = reg.reg.id * lane_width * 4;
|
||||
|
||||
ensure(file_offset < register_file_max_len, "Invalid register index");
|
||||
|
||||
rsx::simple_array<u32> result{};
|
||||
auto insert_lane = [&](u32 word_offset)
|
||||
{
|
||||
for (u32 i = 0; i < lane_width; ++i)
|
||||
{
|
||||
result.push_back(file_offset + (word_offset * lane_width) + i);
|
||||
}
|
||||
};
|
||||
|
||||
if (reg.x) insert_lane(0);
|
||||
if (reg.y) insert_lane(1);
|
||||
if (reg.z) insert_lane(2);
|
||||
if (reg.w) insert_lane(3);
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
111
rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h
Normal file
111
rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h
Normal file
@ -0,0 +1,111 @@
|
||||
#pragma once
|
||||
|
||||
#include "IR.h"
|
||||
#include "Emu/RSX/Common/simple_array.hpp"
|
||||
|
||||
struct RSXFragmentProgram;
|
||||
|
||||
namespace rsx::assembler
|
||||
{
|
||||
// Fragment program opcodes. Values 0x00 - 0x45 mirror the instruction encoding;
// 0x46 and above are custom opcodes injected by assembler passes.
enum FP_opcode
{
	// Arithmetic and comparison
	RSX_FP_OPCODE_NOP = 0x00,       // No operation
	RSX_FP_OPCODE_MOV = 0x01,       // Move
	RSX_FP_OPCODE_MUL = 0x02,       // Multiply
	RSX_FP_OPCODE_ADD = 0x03,       // Add
	RSX_FP_OPCODE_MAD = 0x04,       // Multiply-add
	RSX_FP_OPCODE_DP3 = 0x05,       // Dot product, 3 components
	RSX_FP_OPCODE_DP4 = 0x06,       // Dot product, 4 components
	RSX_FP_OPCODE_DST = 0x07,       // Distance
	RSX_FP_OPCODE_MIN = 0x08,       // Minimum
	RSX_FP_OPCODE_MAX = 0x09,       // Maximum
	RSX_FP_OPCODE_SLT = 0x0A,       // Set if less-than
	RSX_FP_OPCODE_SGE = 0x0B,       // Set if greater-or-equal
	RSX_FP_OPCODE_SLE = 0x0C,       // Set if less-or-equal
	RSX_FP_OPCODE_SGT = 0x0D,       // Set if greater-than
	RSX_FP_OPCODE_SNE = 0x0E,       // Set if not-equal
	RSX_FP_OPCODE_SEQ = 0x0F,       // Set if equal
	RSX_FP_OPCODE_FRC = 0x10,       // Fraction (fract)
	RSX_FP_OPCODE_FLR = 0x11,       // Floor
	RSX_FP_OPCODE_KIL = 0x12,       // Kill fragment
	RSX_FP_OPCODE_PK4 = 0x13,       // Pack four signed 8-bit values
	RSX_FP_OPCODE_UP4 = 0x14,       // Unpack four signed 8-bit values
	RSX_FP_OPCODE_DDX = 0x15,       // Screen-space partial derivative w.r.t. x
	RSX_FP_OPCODE_DDY = 0x16,       // Screen-space partial derivative w.r.t. y
	RSX_FP_OPCODE_TEX = 0x17,       // Texture lookup
	RSX_FP_OPCODE_TXP = 0x18,       // Projective texture lookup
	RSX_FP_OPCODE_TXD = 0x19,       // Texture lookup with explicit derivatives
	RSX_FP_OPCODE_RCP = 0x1A,       // Reciprocal
	RSX_FP_OPCODE_RSQ = 0x1B,       // Reciprocal square root
	RSX_FP_OPCODE_EX2 = 0x1C,       // Exponentiation base 2
	RSX_FP_OPCODE_LG2 = 0x1D,       // Log base 2
	RSX_FP_OPCODE_LIT = 0x1E,       // Lighting coefficients
	RSX_FP_OPCODE_LRP = 0x1F,       // Linear interpolation
	RSX_FP_OPCODE_STR = 0x20,       // Set if true
	RSX_FP_OPCODE_SFL = 0x21,       // Set if false
	RSX_FP_OPCODE_COS = 0x22,       // Cosine
	RSX_FP_OPCODE_SIN = 0x23,       // Sine
	RSX_FP_OPCODE_PK2 = 0x24,       // Pack two 16-bit floats
	RSX_FP_OPCODE_UP2 = 0x25,       // Unpack two 16-bit floats
	RSX_FP_OPCODE_POW = 0x26,       // Power
	RSX_FP_OPCODE_PKB = 0x27,       // Pack bytes
	RSX_FP_OPCODE_UPB = 0x28,       // Unpack bytes
	RSX_FP_OPCODE_PK16 = 0x29,      // Pack 16 bits
	RSX_FP_OPCODE_UP16 = 0x2A,      // Unpack 16
	RSX_FP_OPCODE_BEM = 0x2B,       // Bump-environment map (a.k.a. 2D coordinate transform)
	RSX_FP_OPCODE_PKG = 0x2C,       // Pack with sRGB transformation
	RSX_FP_OPCODE_UPG = 0x2D,       // Unpack gamma
	RSX_FP_OPCODE_DP2A = 0x2E,      // Dot product, 2 components, plus scalar addition
	RSX_FP_OPCODE_TXL = 0x2F,       // Texture sample with explicit LOD
	RSX_FP_OPCODE_TXB = 0x31,       // Texture sample with bias
	RSX_FP_OPCODE_TEXBEM = 0x33,
	RSX_FP_OPCODE_TXPBEM = 0x34,
	RSX_FP_OPCODE_BEMLUM = 0x35,
	RSX_FP_OPCODE_REFL = 0x36,      // Reflection vector
	RSX_FP_OPCODE_TIMESWTEX = 0x37,
	RSX_FP_OPCODE_DP2 = 0x38,       // Dot product, 2 components
	RSX_FP_OPCODE_NRM = 0x39,       // Normalize
	RSX_FP_OPCODE_DIV = 0x3A,       // Division
	RSX_FP_OPCODE_DIVSQ = 0x3B,     // Divide by square root
	RSX_FP_OPCODE_LIF = 0x3C,       // Final part of LIT
	RSX_FP_OPCODE_FENCT = 0x3D,     // Fence T?
	RSX_FP_OPCODE_FENCB = 0x3E,     // Fence B?

	// Flow control
	RSX_FP_OPCODE_BRK = 0x40,       // Break
	RSX_FP_OPCODE_CAL = 0x41,       // Subroutine call
	RSX_FP_OPCODE_IFE = 0x42,       // If
	RSX_FP_OPCODE_LOOP = 0x43,      // Loop
	RSX_FP_OPCODE_REP = 0x44,       // Repeat
	RSX_FP_OPCODE_RET = 0x45,       // Return

	// Custom opcodes for dependency injection
	RSX_FP_OPCODE_OR16_LO = 0x46,   // 16-bit OR: takes one register channel as input and overwrites the low 16 bits of the output
	RSX_FP_OPCODE_OR16_HI = 0x47,   // Same as the LO variant, but overwrites the high 16-bit block
};
|
||||
|
||||
namespace FP
|
||||
{
|
||||
// Returns number of operands consumed by an instruction
|
||||
u8 get_operand_count(FP_opcode opcode);
|
||||
|
||||
// Returns a lane mask for the given operand.
|
||||
// The lane mask is the fixed function hardware lane so swizzles need to be applied on top to resolve the real data channel.
|
||||
u32 get_src_vector_lane_mask(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand);
|
||||
|
||||
// Resolved vector lane mask with swizzles applied.
|
||||
u32 get_src_vector_lane_mask_shuffled(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand);
|
||||
|
||||
// Returns true on delay slot instructions.
|
||||
bool is_delay_slot(const Instruction* instruction);
|
||||
|
||||
// Generate register references
|
||||
RegisterRef get_src_register(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand);
|
||||
RegisterRef get_dst_register(const Instruction* instruction);
|
||||
|
||||
// Convert vector mask to file ranges
|
||||
rsx::simple_array<u32> get_register_file_range(const RegisterRef& reg);
|
||||
|
||||
// Compile a register file annotated blob to register references
|
||||
std::vector<RegisterRef> compile_register_file(const std::array<char, 48 * 8>& file);
|
||||
}
|
||||
}
|
||||
@ -1,5 +1,4 @@
|
||||
#include "stdafx.h"
|
||||
|
||||
#include "CFG.h"
|
||||
|
||||
#include "Emu/RSX/Common/simple_array.hpp"
|
||||
@ -75,8 +74,19 @@ namespace rsx::assembler
|
||||
{
|
||||
if (auto found = find_block_for_pc(id))
|
||||
{
|
||||
parent->insert_succ(found, edge_type);
|
||||
found->insert_pred(parent, edge_type);
|
||||
auto succ = found;
|
||||
if (found->is_of_type(EdgeType::ELSE) &&
|
||||
(edge_type == EdgeType::ENDIF || edge_type == EdgeType::ENDLOOP))
|
||||
{
|
||||
// If we landed on an "ELSE" node, link to its "ENDIF" counterpart
|
||||
auto if_parent = found->pred.front().from;
|
||||
auto endif_edge = std::find_if(if_parent->succ.begin(), if_parent->succ.end(), FN(x.type == EdgeType::ENDIF));
|
||||
ensure(endif_edge != if_parent->succ.end(), "CFG: Invalid ELSE node");
|
||||
succ = endif_edge->to;
|
||||
}
|
||||
|
||||
parent->insert_succ(succ, edge_type);
|
||||
succ->insert_pred(parent, edge_type);
|
||||
return found;
|
||||
}
|
||||
|
||||
@ -101,6 +111,43 @@ namespace rsx::assembler
|
||||
|
||||
if (found)
|
||||
{
|
||||
auto front_edge = std::find_if(bb->pred.begin(), bb->pred.end(), FN(x.type != EdgeType::ENDIF && x.type != EdgeType::ENDLOOP));
|
||||
if (front_edge != bb->pred.end())
|
||||
{
|
||||
auto parent = ensure(front_edge->from);
|
||||
switch (front_edge->type)
|
||||
{
|
||||
case EdgeType::IF:
|
||||
case EdgeType::ELSE:
|
||||
{
|
||||
// Find the merge node from the parent.
|
||||
auto succ = std::find_if(parent->succ.begin(), parent->succ.end(), FN(x.type == EdgeType::ENDIF));
|
||||
ensure(succ != parent->succ.end(), "CFG: Broken IF linkage. Please report to developers.");
|
||||
bb->insert_succ(succ->to, EdgeType::ENDIF);
|
||||
succ->to->insert_pred(bb, EdgeType::ENDIF);
|
||||
break;
|
||||
}
|
||||
case EdgeType::LOOP:
|
||||
{
|
||||
// Find the merge node from the parent
|
||||
auto succ = std::find_if(parent->succ.begin(), parent->succ.end(), FN(x.type == EdgeType::ENDLOOP));
|
||||
ensure(succ != parent->succ.end(), "CFG: Broken LOOP linkage. Please report to developers.");
|
||||
bb->insert_succ(succ->to, EdgeType::ENDLOOP);
|
||||
succ->to->insert_pred(bb, EdgeType::ENDLOOP);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
// Missing an edge type?
|
||||
rsx_log.error("CFG: Unexpected block exit. Report to developers.");
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (bb->pred.empty())
|
||||
{
|
||||
// Impossible situation.
|
||||
rsx_log.error("CFG: Child block has no parent but has successor! Report to developers.");
|
||||
}
|
||||
|
||||
bb = *found;
|
||||
}
|
||||
|
||||
@ -113,7 +160,7 @@ namespace rsx::assembler
|
||||
src2.HEX = decoded._u32[3];
|
||||
|
||||
end = !!dst.end;
|
||||
const u32 opcode = dst.opcode | (src1.opcode_is_branch << 6);
|
||||
const u32 opcode = dst.opcode | (src1.opcode_hi << 6);
|
||||
|
||||
if (opcode == RSX_FP_OPCODE_NOP)
|
||||
{
|
||||
@ -126,6 +173,7 @@ namespace rsx::assembler
|
||||
std::memcpy(ir_inst.bytecode, &decoded._u32[0], 16);
|
||||
ir_inst.length = 4;
|
||||
ir_inst.addr = pc * 16;
|
||||
ir_inst.opcode = opcode;
|
||||
|
||||
switch (opcode)
|
||||
{
|
||||
@ -174,6 +222,7 @@ namespace rsx::assembler
|
||||
ir_inst.length += 4;
|
||||
pc++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
pc++;
|
||||
|
||||
@ -10,6 +10,16 @@ namespace rsx::assembler
|
||||
{
|
||||
int id = 0;
|
||||
bool f16 = false;
|
||||
|
||||
bool operator == (const Register& other) const
|
||||
{
|
||||
return id == other.id && f16 == other.f16;
|
||||
}
|
||||
|
||||
std::string to_string() const
|
||||
{
|
||||
return std::string(f16 ? "H" : "R") + std::to_string(id);
|
||||
}
|
||||
};
|
||||
|
||||
struct RegisterRef
|
||||
@ -19,7 +29,7 @@ namespace rsx::assembler
|
||||
// Vector information
|
||||
union
|
||||
{
|
||||
u32 mask;
|
||||
u32 mask = 0;
|
||||
|
||||
struct
|
||||
{
|
||||
@ -29,6 +39,16 @@ namespace rsx::assembler
|
||||
bool w : 1;
|
||||
};
|
||||
};
|
||||
|
||||
operator bool() const
|
||||
{
|
||||
return !!mask;
|
||||
}
|
||||
|
||||
bool operator == (const RegisterRef& other) const
|
||||
{
|
||||
return reg == other.reg && mask == other.mask;
|
||||
}
|
||||
};
|
||||
|
||||
struct Instruction
|
||||
@ -71,6 +91,7 @@ namespace rsx::assembler
|
||||
struct BasicBlock
|
||||
{
|
||||
u32 id = 0;
|
||||
|
||||
std::vector<Instruction> instructions; // Program instructions for the RSX processor
|
||||
std::vector<FlowEdge> succ; // Forward edges. Sorted closest first.
|
||||
std::vector<FlowEdge> pred; // Back edges. Sorted closest first.
|
||||
@ -78,6 +99,9 @@ namespace rsx::assembler
|
||||
std::vector<Instruction> prologue; // Prologue, created by passes
|
||||
std::vector<Instruction> epilogue; // Epilogue, created by passes
|
||||
|
||||
std::vector<RegisterRef> input_list; // Register inputs.
|
||||
std::vector<RegisterRef> clobber_list; // Clobbered outputs
|
||||
|
||||
FlowEdge* insert_succ(BasicBlock* b, EdgeType type = EdgeType::NONE)
|
||||
{
|
||||
FlowEdge e{ .type = type, .from = this, .to = b };
|
||||
@ -91,5 +115,25 @@ namespace rsx::assembler
|
||||
pred.push_back(e);
|
||||
return &pred.back();
|
||||
}
|
||||
|
||||
// True when this block has exactly one incoming edge and that edge has the given type.
bool is_of_type(EdgeType type) const
{
	if (pred.size() != 1)
	{
		return false;
	}

	return pred.front().type == type;
}
|
||||
|
||||
// True when this block has exactly one predecessor and that predecessor owns
// an outgoing edge of the given type.
bool has_sibling_of_type(EdgeType type) const
{
	if (pred.size() != 1)
	{
		return false;
	}

	const auto& outgoing = pred.front().from->succ;
	for (const auto& edge : outgoing)
	{
		if (edge.type == type)
		{
			return true;
		}
	}

	return false;
}
|
||||
};
|
||||
}
|
||||
|
||||
@ -0,0 +1,230 @@
|
||||
#include "stdafx.h"
|
||||
|
||||
#include "RegisterAnnotationPass.h"
|
||||
#include "Emu/RSX/Program/Assembler/FPOpcodes.h"
|
||||
#include "Emu/RSX/Program/RSXFragmentProgram.h"
|
||||
|
||||
#include <span>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace rsx::assembler::FP
|
||||
{
|
||||
static constexpr u32 register_file_length = 48 * 8; // 24 F32 or 48 F16 registers
|
||||
static constexpr char content_unknown = 0;
|
||||
static constexpr char content_float32 = 'R';
|
||||
static constexpr char content_float16 = 'H';
|
||||
static constexpr char content_dual = 'D';
|
||||
|
||||
bool is_delay_slot(const Instruction& instruction)
|
||||
{
|
||||
const OPDEST dst{ .HEX = instruction.bytecode[0] };
|
||||
const SRC0 src0{ .HEX = instruction.bytecode[1] };
|
||||
const SRC1 src1{ .HEX = instruction.bytecode[2] };
|
||||
|
||||
if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV
|
||||
dst.no_dest || // Must have a sink
|
||||
src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg
|
||||
dst.dest_reg != src0.tmp_reg_index || // Must be a write-to-self
|
||||
dst.fp16 != src0.fp16 || // Must really be the same register
|
||||
src0.abs || src0.neg ||
|
||||
dst.saturate) // Precision modifier
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (dst.prec)
|
||||
{
|
||||
case RSX_FP_PRECISION_REAL:
|
||||
case RSX_FP_PRECISION_UNKNOWN:
|
||||
break;
|
||||
case RSX_FP_PRECISION_HALF:
|
||||
if (!src0.fp16) return false;
|
||||
break;
|
||||
case RSX_FP_PRECISION_FIXED12:
|
||||
case RSX_FP_PRECISION_FIXED9:
|
||||
case RSX_FP_PRECISION_SATURATE:
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if we have precision modifiers on the source
|
||||
if (src0.abs || src0.neg || src1.scale)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst.mask_x && src0.swizzle_x != 0) return false;
|
||||
if (dst.mask_y && src0.swizzle_y != 1) return false;
|
||||
if (dst.mask_z && src0.swizzle_z != 2) return false;
|
||||
if (dst.mask_w && src0.swizzle_w != 3) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<RegisterRef> compile_register_file(const std::array<char, 48 * 8>& file)
|
||||
{
|
||||
std::vector<RegisterRef> results;
|
||||
|
||||
// F16 register processing
|
||||
for (int reg16 = 0; reg16 < 48; ++reg16)
|
||||
{
|
||||
const u32 offset = reg16 * 8;
|
||||
auto word = *reinterpret_cast<const u64*>(&file[offset]);
|
||||
|
||||
if (!word) [[ likely ]]
|
||||
{
|
||||
// Trivial rejection, very commonly hit.
|
||||
continue;
|
||||
}
|
||||
|
||||
RegisterRef ref{ .reg {.id = reg16, .f16 = true } };
|
||||
ref.x = (file[offset] == content_dual || file[offset] == content_float16);
|
||||
ref.y = (file[offset + 2] == content_dual || file[offset + 2] == content_float16);
|
||||
ref.z = (file[offset + 4] == content_dual || file[offset + 4] == content_float16);
|
||||
ref.w = (file[offset + 6] == content_dual || file[offset + 6] == content_float16);
|
||||
|
||||
if (ref)
|
||||
{
|
||||
results.push_back(std::move(ref));
|
||||
}
|
||||
}
|
||||
|
||||
// Helper to check a span for 32-bit access
|
||||
auto match_any_32 = [](const std::span<const char> lanes)
|
||||
{
|
||||
return std::any_of(lanes.begin(), lanes.end(), FN(x == content_dual || x == content_float32));
|
||||
};
|
||||
|
||||
// F32 register processing
|
||||
for (int reg32 = 0; reg32 < 24; ++reg32)
|
||||
{
|
||||
const u32 offset = reg32 * 16;
|
||||
auto word0 = *reinterpret_cast<const u64*>(&file[offset]);
|
||||
auto word1 = *reinterpret_cast<const u64*>(&file[offset + 8]);
|
||||
|
||||
if (!word0 && !word1) [[ likely ]]
|
||||
{
|
||||
// Trivial rejection, very commonly hit.
|
||||
continue;
|
||||
}
|
||||
|
||||
RegisterRef ref{ .reg {.id = reg32, .f16 = false } };
|
||||
if (word0)
|
||||
{
|
||||
ref.x = match_any_32({ &file[offset], 4 });
|
||||
ref.y = match_any_32({ &file[offset + 4], 4 });
|
||||
}
|
||||
|
||||
if (word1)
|
||||
{
|
||||
ref.z = match_any_32({ &file[offset + 8], 4 });
|
||||
ref.w = match_any_32({ &file[offset + 12], 4 });
|
||||
}
|
||||
|
||||
if (ref)
|
||||
{
|
||||
results.push_back(std::move(ref));
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
// Decay instructions into register references
|
||||
void annotate_instructions(BasicBlock* block, const RSXFragmentProgram& prog, bool skip_delay_slots)
|
||||
{
|
||||
for (auto& instruction : block->instructions)
|
||||
{
|
||||
if (skip_delay_slots && is_delay_slot(instruction))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
const u32 operand_count = get_operand_count(static_cast<FP_opcode>(instruction.opcode));
|
||||
for (u32 i = 0; i < operand_count; i++)
|
||||
{
|
||||
RegisterRef reg = get_src_register(prog, &instruction, i);
|
||||
if (!reg.mask)
|
||||
{
|
||||
// Likely a literal constant
|
||||
continue;
|
||||
}
|
||||
|
||||
instruction.srcs.push_back(std::move(reg));
|
||||
}
|
||||
|
||||
RegisterRef dst = get_dst_register(&instruction);
|
||||
if (dst)
|
||||
{
|
||||
instruction.dsts.push_back(std::move(dst));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Annotate each block with input and output lanes (read and clobber list)
|
||||
void annotate_block_io(BasicBlock* block)
|
||||
{
|
||||
alignas(16) std::array<char, register_file_length> output_register_file;
|
||||
alignas(16) std::array<char, register_file_length> input_register_file; // We'll eventually replace with a bitfield mask, but for ease of debugging, we use char for now
|
||||
|
||||
std::memset(output_register_file.data(), content_unknown, register_file_length);
|
||||
std::memset(input_register_file.data(), content_unknown, register_file_length);
|
||||
|
||||
for (const auto& instruction : block->instructions)
|
||||
{
|
||||
for (const auto& src : instruction.srcs)
|
||||
{
|
||||
const auto read_bytes = get_register_file_range(src);
|
||||
const char expected_type = src.reg.f16 ? content_float16 : content_float32;
|
||||
for (const auto& index : read_bytes)
|
||||
{
|
||||
if (output_register_file[index] != content_unknown)
|
||||
{
|
||||
// Something already wrote to this lane
|
||||
continue;
|
||||
}
|
||||
|
||||
if (input_register_file[index] == expected_type)
|
||||
{
|
||||
// We already know about this input
|
||||
continue;
|
||||
}
|
||||
|
||||
if (input_register_file[index] == 0)
|
||||
{
|
||||
// Not known, tag as input
|
||||
input_register_file[index] = expected_type;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Collision on the lane
|
||||
input_register_file[index] = content_dual;
|
||||
}
|
||||
}
|
||||
|
||||
if (!instruction.dsts.empty())
|
||||
{
|
||||
const auto& dst = instruction.dsts.front();
|
||||
const auto write_bytes = get_register_file_range(dst);
|
||||
const char expected_type = dst.reg.f16 ? content_float16 : content_float32;
|
||||
|
||||
for (const auto& index : write_bytes)
|
||||
{
|
||||
output_register_file[index] = expected_type;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compile the input and output refs into register references
|
||||
block->clobber_list = compile_register_file(output_register_file);
|
||||
block->input_list = compile_register_file(input_register_file);
|
||||
}
|
||||
|
||||
// Runs the pass over every block: instructions are annotated first because the
// block-level input/clobber lists are derived from those annotations.
void RegisterAnnotationPass::run(FlowGraph& graph)
{
	const bool skip_delay_slots = m_config.skip_delay_slots;

	for (auto& bb : graph.blocks)
	{
		BasicBlock* current = &bb;
		annotate_instructions(current, m_prog, skip_delay_slots);
		annotate_block_io(current);
	}
}
|
||||
}
|
||||
@ -0,0 +1,34 @@
|
||||
#pragma once
|
||||
|
||||
#include "../../CFG.h"
|
||||
|
||||
struct RSXFragmentProgram;
|
||||
|
||||
namespace rsx::assembler::FP
|
||||
{
|
||||
// Tunables for RegisterAnnotationPass.
struct RegisterAnnotationPassOptions
{
	// When enabled, delay slots are detected and left without annotations.
	bool skip_delay_slots = false;
};
|
||||
|
||||
// The annotation pass annotates each basic block with 2 pieces of information:
|
||||
// 1. The "input" register list for a block.
|
||||
// 2. The "output" register list for a block (clobber list).
|
||||
// The information can be used by other passes to set up prologue/epilogue on each block.
|
||||
// The pass also populates register reference members of each instruction, such as the input and output lanes.
|
||||
class RegisterAnnotationPass : public CFGPass
|
||||
{
|
||||
public:
|
||||
RegisterAnnotationPass(
|
||||
const RSXFragmentProgram& prog,
|
||||
const RegisterAnnotationPassOptions& options = {})
|
||||
: m_prog(prog), m_config(options)
|
||||
{}
|
||||
|
||||
void run(FlowGraph& graph) override;
|
||||
|
||||
private:
|
||||
const RSXFragmentProgram& m_prog;
|
||||
RegisterAnnotationPassOptions m_config;
|
||||
};
|
||||
}
|
||||
@ -0,0 +1,490 @@
|
||||
#include "stdafx.h"
|
||||
|
||||
#include "RegisterDependencyPass.h"
|
||||
#include "Emu/RSX/Program/Assembler/FPOpcodes.h"
|
||||
#include "Emu/RSX/Program/RSXFragmentProgram.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace rsx::assembler::FP
|
||||
{
|
||||
static constexpr u32 register_file_length = 48 * 8; // 24 F32 or 48 F16 registers
|
||||
static constexpr char content_unknown = 0;
|
||||
static constexpr char content_float32 = 'R';
|
||||
static constexpr char content_float16 = 'H';
|
||||
static constexpr char content_dual = 'D';
|
||||
|
||||
using register_file_t = std::array<char, register_file_length>;
|
||||
|
||||
struct DependencyPassContext
|
||||
{
|
||||
std::unordered_map<BasicBlock*, register_file_t> exec_register_map;
|
||||
std::unordered_map<BasicBlock*, register_file_t> sync_register_map;
|
||||
};
|
||||
|
||||
// Selects which 16-bit half (or both) of a 32-bit channel a barrier has to refresh.
enum Register32BarrierFlags
{
	NONE = 0,
	OR_WORD0 = 1,                     // Low 16-bit word only
	OR_WORD1 = 2,                     // High 16-bit word only
	DEFAULT = OR_WORD0 | OR_WORD1     // Both words
};
|
||||
|
||||
struct RegisterBarrier32
|
||||
{
|
||||
RegisterRef ref;
|
||||
u32 flags[4];
|
||||
};
|
||||
|
||||
std::vector<RegisterRef> decode_lanes16(const std::unordered_set<u32>& lanes)
|
||||
{
|
||||
std::vector<RegisterRef> result;
|
||||
|
||||
for (u32 index = 0, file_offset = 0; index < 48; ++index, file_offset += 8)
|
||||
{
|
||||
// Each register has 4 16-bit lanes
|
||||
u32 mask = 0;
|
||||
if (lanes.contains(file_offset + 0)) mask |= (1u << 0);
|
||||
if (lanes.contains(file_offset + 2)) mask |= (1u << 1);
|
||||
if (lanes.contains(file_offset + 4)) mask |= (1u << 2);
|
||||
if (lanes.contains(file_offset + 6)) mask |= (1u << 3);
|
||||
|
||||
if (mask == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
RegisterRef ref{ .reg{.id = static_cast<int>(index), .f16 = true } };
|
||||
ref.mask = mask;
|
||||
result.push_back(std::move(ref));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<RegisterBarrier32> decode_lanes32(const std::unordered_set<u32>& lanes)
|
||||
{
|
||||
std::vector<RegisterBarrier32> result;
|
||||
|
||||
for (u32 index = 0, file_offset = 0; index < 48; ++index, file_offset += 16)
|
||||
{
|
||||
// Each register has 8 16-bit lanes
|
||||
RegisterBarrier32 barrier{};
|
||||
auto& ref = barrier.ref;
|
||||
|
||||
for (u32 lane = 0; lane < 16; lane += 2)
|
||||
{
|
||||
if (!lanes.contains(file_offset + lane))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
const u32 ch = (lane / 4);
|
||||
const u32 flags = (lane & 3)
|
||||
? Register32BarrierFlags::OR_WORD1
|
||||
: Register32BarrierFlags::OR_WORD0;
|
||||
|
||||
ref.mask |= (1u << ch);
|
||||
barrier.flags[ch] |= flags;
|
||||
}
|
||||
|
||||
if (ref.mask == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
ref.reg = {.id = static_cast<int>(index), .f16 = false };
|
||||
result.push_back(std::move(barrier));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<Instruction> build_barrier32(const RegisterBarrier32& barrier)
|
||||
{
|
||||
// Upto 4 instructions are needed per 32-bit register
|
||||
// R0.x = packHalf2x16(H0.xy)
|
||||
// R0.y = packHalf2x16(H0.zw);
|
||||
// R0.z = packHalf2x16(H1.xy);
|
||||
// R0.w = packHalf2x16(H1.zw);
|
||||
|
||||
std::vector<Instruction> result;
|
||||
|
||||
for (u32 mask = barrier.ref.mask, ch = 0; mask > 0; mask >>= 1, ++ch)
|
||||
{
|
||||
if (!(mask & 1))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto& reg = barrier.ref.reg;
|
||||
const auto reg_id = reg.id;
|
||||
|
||||
Instruction instruction{};
|
||||
OPDEST dst{};
|
||||
dst.prec = RSX_FP_PRECISION_REAL;
|
||||
dst.fp16 = 0;
|
||||
dst.dest_reg = reg_id;
|
||||
dst.write_mask = (1u << ch);
|
||||
|
||||
const u32 src_reg_id = (ch / 2) + (reg_id * 2);
|
||||
const bool is_word0 = !(ch & 1); // Only even
|
||||
|
||||
SRC0 src0{};
|
||||
if (is_word0)
|
||||
{
|
||||
src0.swizzle_x = 0;
|
||||
src0.swizzle_y = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
src0.swizzle_x = 2;
|
||||
src0.swizzle_y = 3;
|
||||
}
|
||||
|
||||
src0.swizzle_z = 2;
|
||||
src0.swizzle_w = 3;
|
||||
src0.reg_type = RSX_FP_REGISTER_TYPE_TEMP;
|
||||
src0.tmp_reg_index = src_reg_id;
|
||||
src0.fp16 = 1;
|
||||
|
||||
// Prepare source 1 to match the output in case we need to encode an OR
|
||||
SRC1 src1{};
|
||||
src1.reg_type = RSX_FP_REGISTER_TYPE_TEMP;
|
||||
src1.tmp_reg_index = reg_id;
|
||||
src1.swizzle_x = ch;
|
||||
src1.swizzle_y = ch;
|
||||
src1.swizzle_z = ch;
|
||||
src1.swizzle_w = ch;
|
||||
|
||||
u32 opcode = 0;
|
||||
switch (barrier.flags[ch])
|
||||
{
|
||||
case Register32BarrierFlags::DEFAULT:
|
||||
opcode = RSX_FP_OPCODE_PK2;
|
||||
break;
|
||||
case Register32BarrierFlags::OR_WORD0:
|
||||
opcode = RSX_FP_OPCODE_OR16_LO;
|
||||
// Swap inputs
|
||||
std::swap(src0.HEX, src1.HEX);
|
||||
break;
|
||||
case Register32BarrierFlags::OR_WORD1:
|
||||
opcode = RSX_FP_OPCODE_OR16_HI;
|
||||
src0.swizzle_x = src0.swizzle_y;
|
||||
std::swap(src0.HEX, src1.HEX);
|
||||
break;
|
||||
case Register32BarrierFlags::NONE:
|
||||
default:
|
||||
fmt::throw_exception("Unexpected lane barrier with no mask.");
|
||||
}
|
||||
|
||||
dst.opcode = opcode & 0x3F;
|
||||
src1.opcode_hi = (opcode > 0x3F) ? 1 : 0;
|
||||
src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1;
|
||||
|
||||
instruction.opcode = opcode;
|
||||
instruction.bytecode[0] = dst.HEX;
|
||||
instruction.bytecode[1] = src0.HEX;
|
||||
instruction.bytecode[2] = src1.HEX;
|
||||
|
||||
Register src_reg{ .id = static_cast<int>(src_reg_id), .f16 = true };
|
||||
instruction.srcs.push_back({ .reg = src_reg, .mask = 0xF });
|
||||
instruction.dsts.push_back({ .reg{ .id = reg_id, .f16 = false }, .mask = (1u << ch) });
|
||||
result.push_back(std::move(instruction));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<Instruction> build_barrier16(const RegisterRef& reg)
|
||||
{
|
||||
// H0.xy = unpackHalf2x16(R0.x)
|
||||
// H0.zw = unpackHalf2x16(R0.y)
|
||||
// H1.xy = unpackHalf2x16(R0.z)
|
||||
// H1.zw = unpackHalf2x16(R0.w)
|
||||
|
||||
std::vector<Instruction> result;
|
||||
|
||||
for (u32 mask = reg.mask, ch = 0; mask > 0; mask >>= 1, ++ch)
|
||||
{
|
||||
if (!(mask & 1))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
Instruction instruction{};
|
||||
OPDEST dst{};
|
||||
dst.opcode = RSX_FP_OPCODE_UP2;
|
||||
dst.prec = RSX_FP_PRECISION_HALF;
|
||||
dst.fp16 = 1;
|
||||
dst.dest_reg = reg.reg.id;
|
||||
dst.write_mask = 1u << ch;
|
||||
|
||||
const u32 src_reg_id = reg.reg.id / 2;
|
||||
const bool is_odd_reg = !!(reg.reg.id & 1);
|
||||
const bool is_odd_ch = !!(ch & 1);
|
||||
const bool is_word0 = ch < 2;
|
||||
|
||||
// If we're an even channel, we should also write the next channel (y/w)
|
||||
if (!is_odd_ch && (mask & 2))
|
||||
{
|
||||
mask >>= 1;
|
||||
++ch;
|
||||
dst.write_mask |= (1u << ch);
|
||||
}
|
||||
|
||||
SRC0 src0{};
|
||||
src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1;
|
||||
|
||||
if (is_word0)
|
||||
{
|
||||
src0.swizzle_x = is_odd_reg ? 2 : 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
src0.swizzle_x = is_odd_reg ? 3 : 1;
|
||||
}
|
||||
|
||||
src0.swizzle_y = 1;
|
||||
src0.swizzle_z = 2;
|
||||
src0.swizzle_w = 3;
|
||||
src0.reg_type = RSX_FP_REGISTER_TYPE_TEMP;
|
||||
src0.tmp_reg_index = src_reg_id;
|
||||
|
||||
instruction.opcode = dst.opcode;
|
||||
instruction.bytecode[0] = dst.HEX;
|
||||
instruction.bytecode[1] = src0.HEX;
|
||||
|
||||
Register src_reg{ .id = static_cast<int>(src_reg_id), .f16 = true };
|
||||
instruction.srcs.push_back({ .reg = src_reg, .mask = 0xF });
|
||||
instruction.dsts.push_back({ .reg{.id = reg.reg.id, .f16 = false }, .mask = dst.write_mask });
|
||||
result.push_back(std::move(instruction));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<Instruction> resolve_dependencies(const std::unordered_set<u32>& lanes, bool f16)
|
||||
{
|
||||
std::vector<Instruction> result;
|
||||
|
||||
if (f16)
|
||||
{
|
||||
const auto regs = decode_lanes16(lanes);
|
||||
for (const auto& ref : regs)
|
||||
{
|
||||
auto instructions = build_barrier16(ref);
|
||||
result.insert(result.end(), instructions.begin(), instructions.end());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const auto barriers = decode_lanes32(lanes);
|
||||
for (const auto& barrier : barriers)
|
||||
{
|
||||
auto instructions = build_barrier32(barrier);
|
||||
result.insert(result.end(), std::make_move_iterator(instructions.begin()), std::make_move_iterator(instructions.end()));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Walks the block's instructions while tracking which width last wrote each byte
// of the register file. When an instruction reads a lane that was last written
// with the other width, barrier instructions are inserted right before it.
// The resulting register file is stored in ctx.exec_register_map[block].
void insert_dependency_barriers(DependencyPassContext& ctx, BasicBlock* block)
{
	register_file_t& register_file = ctx.exec_register_map[block];
	std::memset(register_file.data(), content_unknown, register_file_length);

	std::unordered_set<u32> barrier16;
	std::unordered_set<u32> barrier32;

	// Inserts the barriers for `lanes` in front of `position` and returns an
	// iterator that points at the instruction `position` referred to.
	const auto inject_before = [block](auto position, const std::unordered_set<u32>& lanes, bool f16)
	{
		if (lanes.empty())
		{
			return position;
		}

		auto barriers = resolve_dependencies(lanes, f16);
		const auto count = barriers.size();

		// insert() invalidates iterators; continue from the one it returns
		position = block->instructions.insert(
			position,
			std::make_move_iterator(barriers.begin()),
			std::make_move_iterator(barriers.end()));

		std::advance(position, count);
		return position;
	};

	// This subpass does not care about the prologue and epilogue and assumes each block is unique.
	for (auto it = block->instructions.begin(); it != block->instructions.end(); ++it)
	{
		auto& inst = *it;

		barrier16.clear();
		barrier32.clear();

		for (const auto& src : inst.srcs)
		{
			const auto read_bytes = get_register_file_range(src);
			const char expected_type = src.reg.f16 ? content_float16 : content_float32;

			for (const auto& index : read_bytes)
			{
				if (register_file[index] == content_unknown)
				{
					// Not written inside this block, it is a block input
					continue;
				}

				if (register_file[index] == expected_type || register_file[index] == content_dual)
				{
					// Match - nothing to do
					continue;
				}

				// Collision on the lane. Once the barrier runs, both views are valid.
				register_file[index] = content_dual;
				(src.reg.f16 ? barrier16 : barrier32).insert(index);
			}
		}

		for (const auto& dst : inst.dsts)
		{
			const auto write_bytes = get_register_file_range(dst);
			const char expected_type = dst.reg.f16 ? content_float16 : content_float32;

			for (const auto& index : write_bytes)
			{
				register_file[index] = expected_type;
			}
		}

		// Inject barrier instructions where needed; `inst` must not be used past this point
		it = inject_before(it, barrier16, true);
		it = inject_before(it, barrier32, false);
	}
}
|
||||
|
||||
// Resolves cross-block precision hazards for the register file lanes in 'lanes'.
//
// ctx   - Pass state. Reads exec_register_map (must already contain every visited predecessor)
//         and updates sync_register_map to remember which lanes were already synchronized.
// block - The consumer block. Its predecessors are inspected and recursed into when needed.
// lanes - Register file lane indices that 'block' consumes.
// f16   - Precision the consumer reads the lanes with (true = half precision).
//
// For every predecessor, lanes whose last recorded content has the opposite precision get
// instructions from resolve_dependencies() appended to that predecessor's epilogue. Lanes that the
// predecessor's exec map marks as unknown are searched for further up the graph.
void insert_block_register_dependency(DependencyPassContext& ctx, BasicBlock* block, const std::unordered_set<u32>& lanes, bool f16)
{
	// Content type that conflicts with the way the consumer reads the lanes
	const auto clobber_type = f16 ? content_float32 : content_float16;

	for (auto& back_edge : block->pred)
	{
		auto target = back_edge.from;

		// If we have reached an IF-ELSE anchor, stop traversing upwards altogether.
		// The IF and ELSE edges already form a complete set and will be processed before this node.
		if (back_edge.type == EdgeType::ENDIF && &back_edge == &block->pred.back())
		{
			const auto& out_edges = target->succ;
			if (out_edges.size() == 3 &&
				out_edges[1].type == EdgeType::ELSE &&
				out_edges[2].type == EdgeType::ENDIF &&
				out_edges[2].to == block)
			{
				return;
			}
		}

		// The intra-block pass must have recorded the final register state of this predecessor
		ensure(ctx.exec_register_map.find(target) != ctx.exec_register_map.end(), "Block has not been pre-processed");

		// Lazily create the sync record for this predecessor; a new record starts out as all-unknown
		const bool first_visit = ctx.sync_register_map.find(target) == ctx.sync_register_map.end();
		auto& sync_register_file = ctx.sync_register_map[target];
		if (first_visit)
		{
			std::memset(sync_register_file.data(), content_unknown, register_file_length);
		}

		const auto& exec_register_file = ctx.exec_register_map[target];

		// Classify the requested lanes for this predecessor
		std::unordered_set<u32> conflicting;  // Written with the other precision and not yet synchronized
		std::unordered_set<u32> unresolved;   // No recorded content here, keep looking upstream

		for (const auto& lane : lanes)
		{
			const auto produced = exec_register_file[lane];

			if (produced == clobber_type && sync_register_file[lane] == content_unknown)
			{
				conflicting.insert(lane);

				// Remember that this lane has been handled so it is not synchronized twice
				sync_register_file[lane] = content_dual;
			}
			else if (produced == content_unknown)
			{
				unresolved.insert(lane);
			}
		}

		if (!conflicting.empty())
		{
			// Append the resolving instructions to the end of the producing block
			auto fixups = resolve_dependencies(conflicting, f16);
			target->epilogue.insert(target->epilogue.end(), std::make_move_iterator(fixups.begin()), std::make_move_iterator(fixups.end()));
		}

		// Some lanes are still missing. Search upwards if there is anywhere left to go.
		if (unresolved.empty() || target->pred.empty())
		{
			continue;
		}

		insert_block_register_dependency(ctx, target, unresolved, f16);
	}
}
|
||||
|
||||
// Runs the cross-block dependency search for every register reference in the block's input list.
// Each reference is expanded to the set of register file lanes it covers and handed to
// insert_block_register_dependency() together with the precision of the reference.
void insert_block_dependencies(DependencyPassContext& ctx, BasicBlock* block)
{
	for (auto& input : block->input_list)
	{
		const RegisterRef& ref = input;

		// Collect the lanes touched by this reference into a set
		std::unordered_set<u32> lanes;
		const auto covered = get_register_file_range(ref);
		for (const auto& lane : covered)
		{
			lanes.insert(lane);
		}

		insert_block_register_dependency(ctx, block, lanes, input.reg.f16);
	}
}
|
||||
|
||||
// Entry point of the pass. Works in two phases that share one context:
//  1. Intra-block: every block is scanned in graph order by insert_dependency_barriers().
//  2. Inter-block: blocks are visited last-to-first by insert_block_dependencies(), so that
//     dependencies bubble up towards earlier blocks (prologue/epilogue instructions).
void RegisterDependencyPass::run(FlowGraph& graph)
{
	DependencyPassContext ctx{};

	// Phase 1 - forward walk
	for (auto it = graph.blocks.begin(); it != graph.blocks.end(); ++it)
	{
		insert_dependency_barriers(ctx, &(*it));
	}

	// Phase 2 - reverse walk
	auto cursor = graph.blocks.rbegin();
	const auto last = graph.blocks.rend();
	while (cursor != last)
	{
		insert_block_dependencies(ctx, &(*cursor));
		++cursor;
	}
}
|
||||
}
|
||||
@ -0,0 +1,15 @@
|
||||
#pragma once
|
||||
|
||||
#include "../../CFG.h"
|
||||
|
||||
namespace rsx::assembler::FP
|
||||
{
|
||||
// The register dependency pass identifies data hazards for each basic block and injects barrier instructions.
|
||||
// Real PS3 does not have explicit barriers, but does instead often use delay slots or fence instructions to stall until a specific hardware unit clears the fence to advance.
|
||||
// For decompiled shaders, we have the problem that aliasing is not real and is instead simulated. We do not have access to unions on the GPU without really nasty tricks.
|
||||
class RegisterDependencyPass : public CFGPass
|
||||
{
|
||||
public:
|
||||
void run(FlowGraph& graph) override;
|
||||
};
|
||||
}
|
||||
@ -273,7 +273,7 @@ void CgBinaryDisasm::TaskFP()
|
||||
src2.HEX = GetData(data[3]);
|
||||
|
||||
m_step = 4 * sizeof(u32);
|
||||
m_opcode = dst.opcode | (src1.opcode_is_branch << 6);
|
||||
m_opcode = dst.opcode | (src1.opcode_hi << 6);
|
||||
|
||||
auto SCT = [&]()
|
||||
{
|
||||
|
||||
@ -3,12 +3,19 @@
|
||||
#include "FragmentProgramDecompiler.h"
|
||||
#include "ProgramStateCache.h"
|
||||
|
||||
#include "Assembler/Passes/FP/RegisterAnnotationPass.h"
|
||||
#include "Assembler/Passes/FP/RegisterDependencyPass.h"
|
||||
|
||||
#include "Emu/system_config.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace rsx
|
||||
{
|
||||
namespace fragment_program
|
||||
{
|
||||
using namespace rsx::assembler;
|
||||
|
||||
static const std::string reg_table[] =
|
||||
{
|
||||
"wpos",
|
||||
@ -17,10 +24,33 @@ namespace rsx
|
||||
"tc0", "tc1", "tc2", "tc3", "tc4", "tc5", "tc6", "tc7", "tc8", "tc9",
|
||||
"ssa"
|
||||
};
|
||||
|
||||
static const std::vector<RegisterRef> s_fp32_output_set =
|
||||
{
|
||||
{.reg {.id = 0, .f16 = false }, .mask = 0xf },
|
||||
{.reg {.id = 2, .f16 = false }, .mask = 0xf },
|
||||
{.reg {.id = 3, .f16 = false }, .mask = 0xf },
|
||||
{.reg {.id = 4, .f16 = false }, .mask = 0xf },
|
||||
};
|
||||
|
||||
static const std::vector<RegisterRef> s_fp16_output_set =
|
||||
{
|
||||
{.reg {.id = 0, .f16 = true }, .mask = 0xf },
|
||||
{.reg {.id = 4, .f16 = true }, .mask = 0xf },
|
||||
{.reg {.id = 6, .f16 = true }, .mask = 0xf },
|
||||
{.reg {.id = 8, .f16 = true }, .mask = 0xf },
|
||||
};
|
||||
|
||||
static const RegisterRef s_z_export_reg =
|
||||
{
|
||||
.reg {.id = 1, .f16 = false },
|
||||
.mask = (1u << 2)
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
using namespace rsx::fragment_program;
|
||||
using namespace rsx::assembler;
|
||||
|
||||
// SIMD vector lanes
|
||||
enum VectorLane : u8
|
||||
@ -31,6 +61,26 @@ enum VectorLane : u8
|
||||
W = 3,
|
||||
};
|
||||
|
||||
// Builds the list of registers that are read as shader outputs.
//
// ctrl      - Shader control bits. CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS selects the 32-bit
//             color output table over the 16-bit one; CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT
//             appends the depth export register.
// mrt_count - Number of color outputs in use. Values larger than the output table are clamped.
//
// Returns the first 'mrt_count' color outputs (if any), followed by the depth register when exported.
std::vector<RegisterRef> get_fragment_program_output_set(u32 ctrl, u32 mrt_count)
{
	std::vector<RegisterRef> result;
	if (mrt_count > 0)
	{
		result = (ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS)
			? s_fp32_output_set
			: s_fp16_output_set;

		// Only ever shrink. resize() with a count above size() would append value-initialized
		// RegisterRef entries, which do not describe any real output register.
		result.resize(std::min<std::size_t>(mrt_count, result.size()));
	}

	if (ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
	{
		result.push_back(s_z_export_reg);
	}

	return result;
}
|
||||
|
||||
FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size)
|
||||
: m_size(size)
|
||||
, m_prog(prog)
|
||||
@ -151,8 +201,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
|
||||
}
|
||||
|
||||
const u32 reg_index = dst.fp16 ? (dst.dest_reg >> 1) : dst.dest_reg;
|
||||
ensure(reg_index < temp_registers.size());
|
||||
|
||||
if (dst.opcode == RSX_FP_OPCODE_MOV &&
|
||||
src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP &&
|
||||
src0.tmp_reg_index == reg_index)
|
||||
@ -165,8 +213,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w);
|
||||
}
|
||||
|
||||
void FragmentProgramDecompiler::AddFlowOp(const std::string& code)
|
||||
@ -522,26 +568,7 @@ template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src)
|
||||
switch (src.reg_type)
|
||||
{
|
||||
case RSX_FP_REGISTER_TYPE_TEMP:
|
||||
|
||||
if (!src.fp16)
|
||||
{
|
||||
if (dst.opcode == RSX_FP_OPCODE_UP16 ||
|
||||
dst.opcode == RSX_FP_OPCODE_UP2 ||
|
||||
dst.opcode == RSX_FP_OPCODE_UP4 ||
|
||||
dst.opcode == RSX_FP_OPCODE_UPB ||
|
||||
dst.opcode == RSX_FP_OPCODE_UPG)
|
||||
{
|
||||
auto ® = temp_registers[src.tmp_reg_index];
|
||||
if (reg.requires_gather(src.swizzle_x))
|
||||
{
|
||||
properties.has_gather_op = true;
|
||||
AddReg(src.tmp_reg_index, src.fp16);
|
||||
ret = getFloatTypeName(4) + reg.gather_r();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (precision_modifier == RSX_FP_PRECISION_HALF)
|
||||
if (src.fp16 && precision_modifier == RSX_FP_PRECISION_HALF)
|
||||
{
|
||||
// clamp16() is not a cheap operation when emulated; avoid at all costs
|
||||
precision_modifier = RSX_FP_PRECISION_REAL;
|
||||
@ -762,7 +789,6 @@ std::string FragmentProgramDecompiler::BuildCode()
|
||||
const std::string float4_type = (fp16_out && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4);
|
||||
const std::string init_value = float4_type + "(0.)";
|
||||
std::array<std::string, 4> output_register_names;
|
||||
std::array<u32, 4> ouput_register_indices = { 0, 2, 3, 4 };
|
||||
|
||||
// Holder for any "cleanup" before exiting main
|
||||
std::stringstream main_epilogue;
|
||||
@ -772,17 +798,6 @@ std::string FragmentProgramDecompiler::BuildCode()
|
||||
{
|
||||
// Hw tests show that the depth export register is default-initialized to 0 and not wpos.z!!
|
||||
m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "r1", init_value);
|
||||
|
||||
auto& r1 = temp_registers[1];
|
||||
if (r1.requires_gather(VectorLane::Z))
|
||||
{
|
||||
// r1.zw was not written to
|
||||
properties.has_gather_op = true;
|
||||
main_epilogue << " r1.z = " << float4_type << r1.gather_r() << ".z;\n";
|
||||
|
||||
// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
|
||||
rsx_log.warning("ROP reads from shader depth without writing to it. Final value will be gathered.");
|
||||
}
|
||||
}
|
||||
|
||||
// Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z)
|
||||
@ -810,33 +825,6 @@ std::string FragmentProgramDecompiler::BuildCode()
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto block_index = ouput_register_indices[n];
|
||||
auto& r = temp_registers[block_index];
|
||||
|
||||
if (fp16_out)
|
||||
{
|
||||
// Check if we need a split/extract op
|
||||
if (r.requires_split(0))
|
||||
{
|
||||
main_epilogue << " " << reg_name << " = " << float4_type << r.split_h0() << ";\n";
|
||||
|
||||
// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
|
||||
rsx_log.warning("ROP reads from %s without writing to it. Final value will be extracted from the 32-bit register.", reg_name);
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!r.requires_gather128())
|
||||
{
|
||||
// Nothing to do
|
||||
continue;
|
||||
}
|
||||
|
||||
// We need to gather the data from existing registers
|
||||
main_epilogue << " " << reg_name << " = " << float4_type << r.gather_r() << ";\n";
|
||||
properties.has_gather_op = true;
|
||||
|
||||
// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
|
||||
rsx_log.warning("ROP reads from %s without writing to it. Final value will be gathered.", reg_name);
|
||||
}
|
||||
@ -1024,28 +1012,6 @@ std::string FragmentProgramDecompiler::BuildCode()
|
||||
OS << Format(divsq_func);
|
||||
}
|
||||
|
||||
// Declare register gather/merge if needed
|
||||
if (properties.has_gather_op)
|
||||
{
|
||||
std::string float2 = getFloatTypeName(2);
|
||||
|
||||
OS << float4 << " gather(" << float4 << " _h0, " << float4 << " _h1)\n";
|
||||
OS << "{\n";
|
||||
OS << " float x = uintBitsToFloat(packHalf2x16(_h0.xy));\n";
|
||||
OS << " float y = uintBitsToFloat(packHalf2x16(_h0.zw));\n";
|
||||
OS << " float z = uintBitsToFloat(packHalf2x16(_h1.xy));\n";
|
||||
OS << " float w = uintBitsToFloat(packHalf2x16(_h1.zw));\n";
|
||||
OS << " return " << float4 << "(x, y, z, w);\n";
|
||||
OS << "}\n\n";
|
||||
|
||||
OS << float2 << " gather(" << float4 << " _h)\n";
|
||||
OS << "{\n";
|
||||
OS << " float x = uintBitsToFloat(packHalf2x16(_h.xy));\n";
|
||||
OS << " float y = uintBitsToFloat(packHalf2x16(_h.zw));\n";
|
||||
OS << " return " << float2 << "(x, y);\n";
|
||||
OS << "}\n\n";
|
||||
}
|
||||
|
||||
if (properties.has_dynamic_register_load)
|
||||
{
|
||||
OS <<
|
||||
@ -1149,6 +1115,14 @@ bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode)
|
||||
return true;
|
||||
case RSX_FP_OPCODE_PKB: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packUnorm4x8($0)))"); return true;
|
||||
case RSX_FP_OPCODE_SIN: SetDst("sin($0.xxxx)"); return true;
|
||||
|
||||
// Custom ISA extensions for 16-bit OR
|
||||
case RSX_FP_OPCODE_OR16_HI:
|
||||
SetDst("$float4(uintBitsToFloat((floatBitsToUint($0.x) & 0x0000ffff) | (packHalf2x16($1.xx) & 0xffff0000)))");
|
||||
return true;
|
||||
case RSX_FP_OPCODE_OR16_LO:
|
||||
SetDst("$float4(uintBitsToFloat((floatBitsToUint($0.x) & 0xffff0000) | (packHalf2x16($1.xx) & 0x0000ffff)))");
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -1295,7 +1269,37 @@ bool FragmentProgramDecompiler::handle_tex_srb(u32 opcode)
|
||||
|
||||
std::string FragmentProgramDecompiler::Decompile()
|
||||
{
|
||||
const auto graph = rsx::assembler::deconstruct_fragment_program(m_prog);
|
||||
auto graph = deconstruct_fragment_program(m_prog);
|
||||
|
||||
if (!graph.blocks.empty())
|
||||
{
|
||||
// The RSX CFG is missing the output block. We inject a fake tail block that ingests the ROP outputs.
|
||||
BasicBlock* rop_block = nullptr;
|
||||
BasicBlock* tail_block = &graph.blocks.back();
|
||||
if (tail_block->instructions.empty())
|
||||
{
|
||||
// Merge block. Use this directly
|
||||
rop_block = tail_block;
|
||||
}
|
||||
else
|
||||
{
|
||||
graph.blocks.push_back({});
|
||||
rop_block = &graph.blocks.back();
|
||||
|
||||
tail_block->insert_succ(rop_block);
|
||||
rop_block->insert_pred(tail_block);
|
||||
}
|
||||
|
||||
const auto rop_inputs = get_fragment_program_output_set(m_prog.ctrl, m_prog.mrt_buffers_count);
|
||||
rop_block->input_list.insert(rop_block->input_list.end(), rop_inputs.begin(), rop_inputs.end());
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ m_prog, { .skip_delay_slots = true } };
|
||||
FP::RegisterDependencyPass dependency_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
dependency_pass.run(graph);
|
||||
}
|
||||
|
||||
m_size = 0;
|
||||
m_location = 0;
|
||||
m_loop_count = 0;
|
||||
@ -1303,57 +1307,105 @@ std::string FragmentProgramDecompiler::Decompile()
|
||||
m_is_valid_ucode = true;
|
||||
m_constant_offsets.clear();
|
||||
|
||||
enum
|
||||
// For GLSL scope wind/unwind. We store the min scope depth and loop count for each block and "unwind" to it.
|
||||
// This should recover information lost when multiple nodes converge on a single merge node or even skip a merge node as is the case with "ELSE" nodes.
|
||||
std::unordered_map<const BasicBlock*, std::pair<int, u32>> block_data;
|
||||
|
||||
auto push_block_info = [&](const BasicBlock* block)
|
||||
{
|
||||
FORCE_NONE,
|
||||
FORCE_SCT,
|
||||
FORCE_SCB,
|
||||
u32 loop = m_loop_count;
|
||||
int level = m_code_level;
|
||||
|
||||
auto found = block_data.find(block);
|
||||
if (found != block_data.end())
|
||||
{
|
||||
level = std::min(level, found->second.first);
|
||||
loop = std::min(loop, found->second.second);
|
||||
}
|
||||
|
||||
block_data[block] = { level, loop };
|
||||
};
|
||||
|
||||
int forced_unit = FORCE_NONE;
|
||||
auto emit_block = [&](const std::vector<Instruction>& instructions)
|
||||
{
|
||||
for (auto& inst : instructions)
|
||||
{
|
||||
m_instruction = &inst;
|
||||
dst.HEX = inst.bytecode[0];
|
||||
src0.HEX = inst.bytecode[1];
|
||||
src1.HEX = inst.bytecode[2];
|
||||
src2.HEX = inst.bytecode[3];
|
||||
|
||||
ensure(handle_tex_srb(inst.opcode) || handle_sct_scb(inst.opcode), "Unsupported operation");
|
||||
}
|
||||
};
|
||||
|
||||
for (const auto &block : graph.blocks)
|
||||
{
|
||||
// TODO: Handle block prologue if any
|
||||
auto found = block_data.find(&block);
|
||||
if (found != block_data.end())
|
||||
{
|
||||
const auto [level, loop] = found->second;
|
||||
for (int i = m_code_level; i > level; i--)
|
||||
{
|
||||
m_code_level--;
|
||||
AddCode("}");
|
||||
}
|
||||
|
||||
m_loop_count = loop;
|
||||
}
|
||||
|
||||
if (!block.pred.empty())
|
||||
{
|
||||
// CFG guarantees predecessors are sorted, closest one first
|
||||
for (const auto& pred : block.pred)
|
||||
// Predecessors are always sorted closest last.
|
||||
// This gives some adjacency info and tells us how the previous block connects to this one.
|
||||
const auto& pred = block.pred.back();
|
||||
switch (pred.type)
|
||||
{
|
||||
switch (pred.type)
|
||||
{
|
||||
case rsx::assembler::EdgeType::ENDLOOP:
|
||||
m_loop_count--;
|
||||
[[ fallthrough ]];
|
||||
case rsx::assembler::EdgeType::ENDIF:
|
||||
m_code_level--;
|
||||
AddCode("}");
|
||||
break;
|
||||
case rsx::assembler::EdgeType::LOOP:
|
||||
m_loop_count++;
|
||||
[[ fallthrough ]];
|
||||
case rsx::assembler::EdgeType::IF:
|
||||
// Instruction will be inserted by the SIP decoder
|
||||
AddCode("{");
|
||||
m_code_level++;
|
||||
break;
|
||||
case rsx::assembler::EdgeType::ELSE:
|
||||
// This one needs more testing
|
||||
m_code_level--;
|
||||
AddCode("}");
|
||||
AddCode("else");
|
||||
AddCode("{");
|
||||
m_code_level++;
|
||||
break;
|
||||
default:
|
||||
// Start a new block anyway
|
||||
fmt::throw_exception("Unexpected block found");
|
||||
}
|
||||
case EdgeType::LOOP:
|
||||
m_loop_count++;
|
||||
[[ fallthrough ]];
|
||||
case EdgeType::IF:
|
||||
AddCode("{");
|
||||
m_code_level++;
|
||||
break;
|
||||
case EdgeType::ELSE:
|
||||
AddCode("else");
|
||||
AddCode("{");
|
||||
m_code_level++;
|
||||
break;
|
||||
case EdgeType::ENDIF:
|
||||
case EdgeType::ENDLOOP:
|
||||
// Pure merge block?
|
||||
break;
|
||||
case EdgeType::NONE:
|
||||
ensure(block.instructions.empty());
|
||||
break;
|
||||
default:
|
||||
fmt::throw_exception("Unhandled edge type %d", static_cast<int>(pred.type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!block.prologue.empty())
|
||||
{
|
||||
AddCode("// Prologue");
|
||||
emit_block(block.prologue);
|
||||
}
|
||||
|
||||
const bool early_epilogue =
|
||||
!block.epilogue.empty() &&
|
||||
!block.succ.empty() &&
|
||||
(block.succ.front().type == EdgeType::IF || block.succ.front().type == EdgeType::LOOP);
|
||||
|
||||
for (const auto& inst : block.instructions)
|
||||
{
|
||||
if (early_epilogue && &inst == &block.instructions.back())
|
||||
{
|
||||
AddCode("// Epilogue");
|
||||
emit_block(block.epilogue);
|
||||
}
|
||||
|
||||
m_instruction = &inst;
|
||||
|
||||
dst.HEX = inst.bytecode[0];
|
||||
@ -1363,11 +1415,9 @@ std::string FragmentProgramDecompiler::Decompile()
|
||||
|
||||
opflags = 0;
|
||||
|
||||
const u32 opcode = dst.opcode | (src1.opcode_is_branch << 6);
|
||||
|
||||
auto SIP = [&]()
|
||||
{
|
||||
switch (opcode)
|
||||
switch (m_instruction->opcode)
|
||||
{
|
||||
case RSX_FP_OPCODE_BRK:
|
||||
if (m_loop_count) AddFlowOp("break");
|
||||
@ -1377,12 +1427,10 @@ std::string FragmentProgramDecompiler::Decompile()
|
||||
rsx_log.error("Unimplemented SIP instruction: CAL");
|
||||
break;
|
||||
case RSX_FP_OPCODE_FENCT:
|
||||
AddCode("//FENCT");
|
||||
forced_unit = FORCE_SCT;
|
||||
AddCode("// FENCT");
|
||||
break;
|
||||
case RSX_FP_OPCODE_FENCB:
|
||||
AddCode("//FENCB");
|
||||
forced_unit = FORCE_SCB;
|
||||
AddCode("// FENCB");
|
||||
break;
|
||||
case RSX_FP_OPCODE_IFE:
|
||||
AddCode("if($cond)");
|
||||
@ -1406,7 +1454,7 @@ std::string FragmentProgramDecompiler::Decompile()
|
||||
return true;
|
||||
};
|
||||
|
||||
switch (opcode)
|
||||
switch (m_instruction->opcode)
|
||||
{
|
||||
case RSX_FP_OPCODE_NOP:
|
||||
break;
|
||||
@ -1415,19 +1463,10 @@ std::string FragmentProgramDecompiler::Decompile()
|
||||
AddFlowOp("_kill()");
|
||||
break;
|
||||
default:
|
||||
int prev_force_unit = forced_unit;
|
||||
|
||||
// Some instructions do not respect forced unit
|
||||
// Tested with Tales of Vesperia
|
||||
if (SIP()) break;
|
||||
if (handle_tex_srb(opcode)) break;
|
||||
|
||||
// FENCT/FENCB do not actually reject instructions if they dont match the forced unit
|
||||
// Looks like they are optimization hints and not hard-coded forced paths
|
||||
if (handle_sct_scb(opcode)) break;
|
||||
forced_unit = FORCE_NONE;
|
||||
|
||||
rsx_log.error("Unknown/illegal instruction: 0x%x (forced unit %d)", opcode, prev_force_unit);
|
||||
if (handle_tex_srb(m_instruction->opcode)) break;
|
||||
if (handle_sct_scb(m_instruction->opcode)) break;
|
||||
rsx_log.error("Unknown/illegal instruction: 0x%x", m_instruction->opcode);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1435,16 +1474,28 @@ std::string FragmentProgramDecompiler::Decompile()
|
||||
if (dst.end) break;
|
||||
}
|
||||
|
||||
// TODO: Handle block epilogue if needed
|
||||
if (!early_epilogue && !block.epilogue.empty())
|
||||
{
|
||||
AddCode("// Epilogue");
|
||||
emit_block(block.epilogue);
|
||||
}
|
||||
|
||||
for (auto& succ : block.succ)
|
||||
{
|
||||
switch (succ.type)
|
||||
{
|
||||
case EdgeType::ENDIF:
|
||||
case EdgeType::ENDLOOP:
|
||||
case EdgeType::ELSE:
|
||||
push_block_info(succ.to);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (m_code_level > 1)
|
||||
{
|
||||
rsx_log.error("Hanging block found at end of shader. Malformed shader?");
|
||||
|
||||
m_code_level--;
|
||||
AddCode("}");
|
||||
}
|
||||
ensure(m_code_level == 1);
|
||||
|
||||
// flush m_code_level
|
||||
m_code_level = 1;
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#pragma once
|
||||
#include "ShaderParam.h"
|
||||
#include "FragmentProgramRegister.h"
|
||||
#include "RSXFragmentProgram.h"
|
||||
|
||||
#include "Assembler/CFG.h"
|
||||
@ -53,8 +52,6 @@ class FragmentProgramDecompiler
|
||||
int m_code_level;
|
||||
std::unordered_map<u32, u32> m_constant_offsets;
|
||||
|
||||
std::array<rsx::MixedPrecisionRegister, 64> temp_registers;
|
||||
|
||||
std::string GetMask() const;
|
||||
|
||||
void SetDst(std::string code, u32 flags = 0);
|
||||
@ -175,7 +172,6 @@ public:
|
||||
|
||||
// Decoded properties (out)
|
||||
bool has_lit_op = false;
|
||||
bool has_gather_op = false;
|
||||
bool has_no_output = false;
|
||||
bool has_discard_op = false;
|
||||
bool has_tex_op = false;
|
||||
|
||||
@ -1,196 +0,0 @@
|
||||
#include "stdafx.h"
|
||||
#include "FragmentProgramRegister.h"
|
||||
|
||||
namespace rsx
|
||||
{
|
||||
// A fresh register has no known content: every half-word starts out as 'undefined'.
MixedPrecisionRegister::MixedPrecisionRegister()
{
	content_mask.fill(data_type_bits::undefined);
}
|
||||
|
||||
void MixedPrecisionRegister::tag_h0(bool x, bool y, bool z, bool w)
|
||||
{
|
||||
if (x) content_mask[0] = data_type_bits::f16;
|
||||
if (y) content_mask[1] = data_type_bits::f16;
|
||||
if (z) content_mask[2] = data_type_bits::f16;
|
||||
if (w) content_mask[3] = data_type_bits::f16;
|
||||
}
|
||||
|
||||
void MixedPrecisionRegister::tag_h1(bool x, bool y, bool z, bool w)
|
||||
{
|
||||
if (x) content_mask[4] = data_type_bits::f16;
|
||||
if (y) content_mask[5] = data_type_bits::f16;
|
||||
if (z) content_mask[6] = data_type_bits::f16;
|
||||
if (w) content_mask[7] = data_type_bits::f16;
|
||||
}
|
||||
|
||||
void MixedPrecisionRegister::tag_r(bool x, bool y, bool z, bool w)
|
||||
{
|
||||
if (x) content_mask[0] = content_mask[1] = data_type_bits::f32;
|
||||
if (y) content_mask[2] = content_mask[3] = data_type_bits::f32;
|
||||
if (z) content_mask[4] = content_mask[5] = data_type_bits::f32;
|
||||
if (w) content_mask[6] = content_mask[7] = data_type_bits::f32;
|
||||
}
|
||||
|
||||
// Records a write to this register.
//
// index   - Register index as encoded by the instruction (a half register index when is_fp16 is set).
// is_fp16 - True when the write targets a 16-bit half register.
// x..w    - Write mask of the destination lanes.
//
// The first call binds this object to a 32-bit register file index. Two consecutive half
// registers map onto one 32-bit register: an even half index is H0, an odd one is H1.
void MixedPrecisionRegister::tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w)
{
	if (file_index == umax)
	{
		// First-time use. Bind to the underlying 32-bit register.
		file_index = is_fp16 ? (index >> 1) : index;
	}

	if (!is_fp16)
	{
		tag_r(x, y, z, w);
		return;
	}

	// A half register must alias the 32-bit register this object is bound to
	ensure((index / 2) == file_index);

	if (index & 1)
	{
		tag_h1(x, y, z, w);
	}
	else
	{
		tag_h0(x, y, z, w);
	}
}
|
||||
|
||||
std::string MixedPrecisionRegister::gather_r() const
|
||||
{
|
||||
const auto half_index = file_index << 1;
|
||||
const std::string reg = "r" + std::to_string(file_index);
|
||||
const std::string gather_half_regs[] = {
|
||||
"gather(h" + std::to_string(half_index) + ")",
|
||||
"gather(h" + std::to_string(half_index + 1) + ")"
|
||||
};
|
||||
|
||||
std::string outputs[4];
|
||||
for (int ch = 0; ch < 4; ++ch)
|
||||
{
|
||||
// FIXME: This approach ignores mixed register bits. Not ideal!!!!
|
||||
const auto channel0 = content_mask[ch * 2];
|
||||
const auto is_fp16_ch = channel0 == content_mask[ch * 2 + 1] && channel0 == data_type_bits::f16;
|
||||
outputs[ch] = is_fp16_ch ? gather_half_regs[ch / 2] : reg;
|
||||
}
|
||||
|
||||
// Grouping. Only replace relevant bits...
|
||||
if (outputs[0] == outputs[1]) outputs[0] = "";
|
||||
if (outputs[2] == outputs[3]) outputs[2] = "";
|
||||
|
||||
// Assemble
|
||||
bool group = false;
|
||||
std::string result = "";
|
||||
constexpr std::string_view swz_mask = "xyzw";
|
||||
|
||||
for (int ch = 0; ch < 4; ++ch)
|
||||
{
|
||||
if (outputs[ch].empty())
|
||||
{
|
||||
group = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!result.empty())
|
||||
{
|
||||
result += ", ";
|
||||
}
|
||||
|
||||
if (group)
|
||||
{
|
||||
ensure(ch > 0);
|
||||
group = false;
|
||||
|
||||
if (outputs[ch] == reg)
|
||||
{
|
||||
result += reg + "." + swz_mask[ch - 1] + swz_mask[ch];
|
||||
continue;
|
||||
}
|
||||
|
||||
result += outputs[ch];
|
||||
continue;
|
||||
}
|
||||
|
||||
const int subch = outputs[ch] == reg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles
|
||||
result += outputs[ch] + "." + swz_mask[subch];
|
||||
}
|
||||
|
||||
// Optimize dual-gather (128-bit gather) to use special function
|
||||
const std::string double_gather = gather_half_regs[0] + ", " + gather_half_regs[1];
|
||||
if (result == double_gather)
|
||||
{
|
||||
result = "gather(h" + std::to_string(half_index) + ", h" + std::to_string(half_index + 1) + ")";
|
||||
}
|
||||
|
||||
return "(" + result + ")";
|
||||
}
|
||||
|
||||
// Builds the argument list (wrapped in parentheses) of a shader expression that reconstructs one
// half register (H16x4) of this register.
//
// word_index - Which half register to read: 0 for H0, 1 for H1. Any other value fails the ensure().
//
// Half-words tagged as 32-bit data are extracted from the full real (R32x4) register with
// unpackHalf2x16, every other half-word is read straight from the half register.
std::string MixedPrecisionRegister::fetch_halfreg(u32 word_index) const
{
	// Validate first: word_index is used below to index the 4-entry swizzle table and content_mask
	ensure(word_index <= 1);

	constexpr std::string_view swz_mask = "xyzw";
	const std::string reg = "r" + std::to_string(file_index);
	const std::string hreg = "h" + std::to_string(file_index * 2 + word_index);

	// Each half register overlaps two 32-bit lanes of the full register (xy for H0, zw for H1)
	const std::string word0_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2] + ")";
	const std::string word1_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2 + 1] + ")";
	const std::string words[] = {
		"unpackHalf2x16(" + word0_bits + ")",
		"unpackHalf2x16(" + word1_bits + ")"
	};

	// Pick a source expression for each 16-bit lane
	std::string outputs[4];

	const int word_offset = word_index * 4;
	for (int ch = 0; ch < 4; ++ch)
	{
		outputs[ch] = content_mask[ch + word_offset] == data_type_bits::f32
			? words[ch / 2]
			: hreg;
	}

	// Grouping. Only replace relevant bits...
	// An empty slot means "emit together with the next lane".
	if (outputs[0] == outputs[1]) outputs[0] = "";
	if (outputs[2] == outputs[3]) outputs[2] = "";

	// Assemble
	bool group = false;
	std::string result = "";

	for (int ch = 0; ch < 4; ++ch)
	{
		if (outputs[ch].empty())
		{
			group = true;
			continue;
		}

		if (!result.empty())
		{
			result += ", ";
		}

		if (group)
		{
			ensure(ch > 0);
			group = false;
			result += outputs[ch];

			// unpackHalf2x16 already yields two components; the half register needs a 2-wide swizzle
			if (outputs[ch] == hreg)
			{
				result += std::string(".") + swz_mask[ch - 1] + swz_mask[ch];
			}
			continue;
		}

		const int subch = outputs[ch] == hreg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles
		result += outputs[ch] + "." + swz_mask[subch];
	}

	return "(" + result + ")";
}
|
||||
}
|
||||
@ -1,111 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <util/types.hpp>
|
||||
|
||||
namespace rsx
|
||||
{
|
||||
class MixedPrecisionRegister
|
||||
{
|
||||
enum data_type_bits
|
||||
{
|
||||
undefined = 0,
|
||||
f16 = 1,
|
||||
f32 = 2
|
||||
};
|
||||
|
||||
std::array<data_type_bits, 8> content_mask; // Content details for each half-word
|
||||
u32 file_index = umax;
|
||||
|
||||
void tag_h0(bool x, bool y, bool z, bool w);
|
||||
|
||||
void tag_h1(bool x, bool y, bool z, bool w);
|
||||
|
||||
void tag_r(bool x, bool y, bool z, bool w);
|
||||
|
||||
std::string fetch_halfreg(u32 word_index) const;
|
||||
|
||||
public:
|
||||
MixedPrecisionRegister();
|
||||
|
||||
void tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w);
|
||||
|
||||
std::string gather_r() const;
|
||||
|
||||
std::string split_h0() const
|
||||
{
|
||||
return fetch_halfreg(0);
|
||||
}
|
||||
|
||||
std::string split_h1() const
|
||||
{
|
||||
return fetch_halfreg(1);
|
||||
}
|
||||
|
||||
// Getters
|
||||
|
||||
// Return true if all values are unwritten to (undefined)
|
||||
bool floating() const
|
||||
{
|
||||
return file_index == umax;
|
||||
}
|
||||
|
||||
// Return true if the first half register is all undefined
|
||||
bool floating_h0() const
|
||||
{
|
||||
return content_mask[0] == content_mask[1] &&
|
||||
content_mask[1] == content_mask[2] &&
|
||||
content_mask[2] == content_mask[3] &&
|
||||
content_mask[3] == data_type_bits::undefined;
|
||||
}
|
||||
|
||||
// Return true if the second half register is all undefined
|
||||
bool floating_h1() const
|
||||
{
|
||||
return content_mask[4] == content_mask[5] &&
|
||||
content_mask[5] == content_mask[6] &&
|
||||
content_mask[6] == content_mask[7] &&
|
||||
content_mask[7] == data_type_bits::undefined;
|
||||
}
|
||||
|
||||
// Return true if any of the half-words are 16-bit
|
||||
bool requires_gather(u8 channel) const
|
||||
{
|
||||
// Data fetched from the single precision register requires merging of the two half registers
|
||||
const auto channel_offset = channel * 2;
|
||||
ensure(channel_offset <= 6);
|
||||
|
||||
return (content_mask[channel_offset] == data_type_bits::f16 || content_mask[channel_offset + 1] == data_type_bits::f16);
|
||||
}
|
||||
|
||||
// Return true if the entire 128-bit register is filled with 2xfp16x4 data words
|
||||
bool requires_gather128() const
|
||||
{
|
||||
// Full 128-bit check
|
||||
for (const auto& ch : content_mask)
|
||||
{
|
||||
if (ch == data_type_bits::f16)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Return true if the half-register is polluted with fp32 data
|
||||
bool requires_split(u32 word_index) const
|
||||
{
|
||||
const u32 content_offset = word_index * 4;
|
||||
for (u32 i = 0; i < 4; ++i)
|
||||
{
|
||||
if (content_mask[content_offset + i] == data_type_bits::f32)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "program_util.h"
|
||||
#include "Assembler/FPOpcodes.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@ -23,76 +24,7 @@ enum register_precision
|
||||
RSX_FP_PRECISION_UNKNOWN = 5 // Unknown what this actually does; seems to do nothing on hwtests but then why would their compiler emit it?
|
||||
};
|
||||
|
||||
// Fragment program opcode numbers.
// The numbering is not contiguous: 0x30, 0x32 and 0x3F are not assigned here,
// and the BRK..RET group starts at 0x40.
enum fp_opcode
{
    // 0x00 - 0x0F
    RSX_FP_OPCODE_NOP       = 0x00, // No operation
    RSX_FP_OPCODE_MOV       = 0x01, // Move
    RSX_FP_OPCODE_MUL       = 0x02, // Multiply
    RSX_FP_OPCODE_ADD       = 0x03, // Add
    RSX_FP_OPCODE_MAD       = 0x04, // Multiply-add
    RSX_FP_OPCODE_DP3       = 0x05, // Dot product, 3 components
    RSX_FP_OPCODE_DP4       = 0x06, // Dot product, 4 components
    RSX_FP_OPCODE_DST       = 0x07, // Distance
    RSX_FP_OPCODE_MIN       = 0x08, // Minimum
    RSX_FP_OPCODE_MAX       = 0x09, // Maximum
    RSX_FP_OPCODE_SLT       = 0x0A, // Set if less-than
    RSX_FP_OPCODE_SGE       = 0x0B, // Set if greater-or-equal
    RSX_FP_OPCODE_SLE       = 0x0C, // Set if less-or-equal
    RSX_FP_OPCODE_SGT       = 0x0D, // Set if greater-than
    RSX_FP_OPCODE_SNE       = 0x0E, // Set if not-equal
    RSX_FP_OPCODE_SEQ       = 0x0F, // Set if equal

    // 0x10 - 0x1F
    RSX_FP_OPCODE_FRC       = 0x10, // Fraction (fract)
    RSX_FP_OPCODE_FLR       = 0x11, // Floor
    RSX_FP_OPCODE_KIL       = 0x12, // Kill fragment
    RSX_FP_OPCODE_PK4       = 0x13, // Pack four signed 8-bit values
    RSX_FP_OPCODE_UP4       = 0x14, // Unpack four signed 8-bit values
    RSX_FP_OPCODE_DDX       = 0x15, // Screen-space partial derivative w.r.t. x
    RSX_FP_OPCODE_DDY       = 0x16, // Screen-space partial derivative w.r.t. y
    RSX_FP_OPCODE_TEX       = 0x17, // Texture lookup
    RSX_FP_OPCODE_TXP       = 0x18, // Projective texture lookup
    RSX_FP_OPCODE_TXD       = 0x19, // Texture lookup with derivatives
    RSX_FP_OPCODE_RCP       = 0x1A, // Reciprocal
    RSX_FP_OPCODE_RSQ       = 0x1B, // Reciprocal square root
    RSX_FP_OPCODE_EX2       = 0x1C, // Exponentiation, base 2
    RSX_FP_OPCODE_LG2       = 0x1D, // Logarithm, base 2
    RSX_FP_OPCODE_LIT       = 0x1E, // Lighting coefficients
    RSX_FP_OPCODE_LRP       = 0x1F, // Linear interpolation

    // 0x20 - 0x2F
    RSX_FP_OPCODE_STR       = 0x20, // Set if true
    RSX_FP_OPCODE_SFL       = 0x21, // Set if false
    RSX_FP_OPCODE_COS       = 0x22, // Cosine
    RSX_FP_OPCODE_SIN       = 0x23, // Sine
    RSX_FP_OPCODE_PK2       = 0x24, // Pack two 16-bit floats
    RSX_FP_OPCODE_UP2       = 0x25, // Unpack two 16-bit floats
    RSX_FP_OPCODE_POW       = 0x26, // Power
    RSX_FP_OPCODE_PKB       = 0x27, // Pack bytes
    RSX_FP_OPCODE_UPB       = 0x28, // Unpack bytes
    RSX_FP_OPCODE_PK16      = 0x29, // Pack 16 bits
    RSX_FP_OPCODE_UP16      = 0x2A, // Unpack 16
    RSX_FP_OPCODE_BEM       = 0x2B, // Bump-environment map (a.k.a. 2D coordinate transform)
    RSX_FP_OPCODE_PKG       = 0x2C, // Pack with sRGB transformation
    RSX_FP_OPCODE_UPG       = 0x2D, // Unpack gamma
    RSX_FP_OPCODE_DP2A      = 0x2E, // Dot product, 2 components, plus scalar addition
    RSX_FP_OPCODE_TXL       = 0x2F, // Texture sample with explicit LOD

    // 0x31 - 0x3E
    RSX_FP_OPCODE_TXB       = 0x31, // Texture sample with bias
    RSX_FP_OPCODE_TEXBEM    = 0x33,
    RSX_FP_OPCODE_TXPBEM    = 0x34,
    RSX_FP_OPCODE_BEMLUM    = 0x35,
    RSX_FP_OPCODE_REFL      = 0x36, // Reflection vector
    RSX_FP_OPCODE_TIMESWTEX = 0x37,
    RSX_FP_OPCODE_DP2       = 0x38, // Dot product, 2 components
    RSX_FP_OPCODE_NRM       = 0x39, // Normalize
    RSX_FP_OPCODE_DIV       = 0x3A, // Division
    RSX_FP_OPCODE_DIVSQ     = 0x3B, // Divide by square root
    RSX_FP_OPCODE_LIF       = 0x3C, // Final part of LIT
    RSX_FP_OPCODE_FENCT     = 0x3D, // Fence T?
    RSX_FP_OPCODE_FENCB     = 0x3E, // Fence B?

    // 0x40 - 0x45
    RSX_FP_OPCODE_BRK       = 0x40, // Break
    RSX_FP_OPCODE_CAL       = 0x41, // Subroutine call
    RSX_FP_OPCODE_IFE       = 0x42, // If
    RSX_FP_OPCODE_LOOP      = 0x43, // Loop
    RSX_FP_OPCODE_REP       = 0x44, // Repeat
    RSX_FP_OPCODE_RET       = 0x45  // Return
};
|
||||
using enum rsx::assembler::FP_opcode;
|
||||
|
||||
union OPDEST
|
||||
{
|
||||
@ -116,6 +48,12 @@ union OPDEST
|
||||
u32 no_dest : 1;
|
||||
u32 saturate : 1; // _sat
|
||||
};
|
||||
|
||||
struct
|
||||
{
|
||||
u32 : 9;
|
||||
u32 write_mask : 4;
|
||||
};
|
||||
};
|
||||
|
||||
union SRC0
|
||||
@ -164,7 +102,7 @@ union SRC1
|
||||
u32 src1_prec_mod : 3; // Precision modifier for src1 (CoD:MW series)
|
||||
u32 src2_prec_mod : 3; // Precision modifier for src2 (unproven, should affect MAD instruction)
|
||||
u32 scale : 3;
|
||||
u32 opcode_is_branch : 1;
|
||||
u32 opcode_hi : 1; // Opcode high bit
|
||||
};
|
||||
|
||||
struct
|
||||
@ -207,6 +145,23 @@ union SRC2
|
||||
};
|
||||
};
|
||||
|
||||
union SRC_Common
|
||||
{
|
||||
u32 HEX;
|
||||
|
||||
struct
|
||||
{
|
||||
u32 reg_type : 2;
|
||||
u32 tmp_reg_index : 6;
|
||||
u32 fp16 : 1;
|
||||
u32 swizzle_x : 2;
|
||||
u32 swizzle_y : 2;
|
||||
u32 swizzle_z : 2;
|
||||
u32 swizzle_w : 2;
|
||||
u32 neg : 1;
|
||||
};
|
||||
};
|
||||
|
||||
constexpr const char* rsx_fp_input_attr_regs[] =
|
||||
{
|
||||
"WPOS", "COL0", "COL1", "FOGC", "TEX0",
|
||||
|
||||
@ -157,8 +157,11 @@
|
||||
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog_native.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Overlays\Trophies\overlay_trophy_list_dialog.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\FPASM.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\FPOpcodes.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\FPToCFG.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Program\ProgramStateCache.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Program\program_util.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Program\SPIRVCommon.cpp" />
|
||||
@ -703,8 +706,11 @@
|
||||
<ClInclude Include="Emu\RSX\Overlays\overlay_video.h" />
|
||||
<ClInclude Include="Emu\RSX\Overlays\Trophies\overlay_trophy_list_dialog.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\CFG.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\FPASM.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\FPOpcodes.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\IR.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\GLSLTypes.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\ProgramStateCache.h" />
|
||||
<ClInclude Include="Emu\RSX\Program\program_util.h" />
|
||||
|
||||
@ -136,6 +136,12 @@
|
||||
<Filter Include="Emu\GPU\RSX\Program\Assembler">
|
||||
<UniqueIdentifier>{d99df916-8a99-428b-869a-9f14ac0ab411}</UniqueIdentifier>
|
||||
</Filter>
|
||||
<Filter Include="Emu\GPU\RSX\Program\Assembler\Passes">
|
||||
<UniqueIdentifier>{d13db076-47e4-45b9-bb8a-6b711ea40622}</UniqueIdentifier>
|
||||
</Filter>
|
||||
<Filter Include="Emu\GPU\RSX\Program\Assembler\Passes\FP">
|
||||
<UniqueIdentifier>{7fb59544-9761-4b4a-bb04-07deb43cf3c2}</UniqueIdentifier>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="Crypto\aes.cpp">
|
||||
@ -1354,9 +1360,6 @@
|
||||
<ClCompile Include="Emu\Cell\ErrorCodes.cpp">
|
||||
<Filter>Emu\Cell</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp">
|
||||
<Filter>Emu\GPU\RSX\Program</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="util\emu_utils.cpp">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClCompile>
|
||||
@ -1378,6 +1381,18 @@
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\FPToCFG.cpp">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.cpp">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler\Passes\FP</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.cpp">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler\Passes\FP</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\FPOpcodes.cpp">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Emu\RSX\Program\Assembler\FPASM.cpp">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="Crypto\aes.h">
|
||||
@ -2746,9 +2761,6 @@
|
||||
<ClInclude Include="Emu\Audio\audio_utils.h">
|
||||
<Filter>Emu\Audio</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h">
|
||||
<Filter>Emu\GPU\RSX\Program</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="util\video_source.h">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClInclude>
|
||||
@ -2776,6 +2788,18 @@
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\IR.h">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\FPOpcodes.h">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.h">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler\Passes\FP</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.h">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler\Passes\FP</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Emu\RSX\Program\Assembler\FPASM.h">
|
||||
<Filter>Emu\GPU\RSX\Program\Assembler</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">
|
||||
|
||||
@ -89,6 +89,7 @@
|
||||
<ClCompile Include="test.cpp" />
|
||||
<ClCompile Include="test_fmt.cpp" />
|
||||
<ClCompile Include="test_rsx_cfg.cpp" />
|
||||
<ClCompile Include="test_rsx_fp_asm.cpp" />
|
||||
<ClCompile Include="test_simple_array.cpp" />
|
||||
<ClCompile Include="test_address_range.cpp" />
|
||||
<ClCompile Include="test_tuple.cpp" />
|
||||
|
||||
@ -2,89 +2,28 @@
|
||||
|
||||
#include "Emu/RSX/Common/simple_array.hpp"
|
||||
#include "Emu/RSX/Program/Assembler/CFG.h"
|
||||
#include "Emu/RSX/Program/Assembler/FPASM.h"
|
||||
#include "Emu/RSX/Program/RSXFragmentProgram.h"
|
||||
|
||||
#include <util/v128.hpp>
|
||||
|
||||
namespace rsx::assembler
|
||||
{
|
||||
auto swap_bytes16 = [](u32 dword) -> u32
|
||||
static const BasicBlock* get_graph_block_by_id(const FlowGraph& graph, u32 id)
|
||||
{
|
||||
// Lazy encode, but good enough for what we need here.
|
||||
union v32
|
||||
{
|
||||
u32 HEX;
|
||||
u8 _v[4];
|
||||
};
|
||||
|
||||
u8* src_bytes = reinterpret_cast<u8*>(&dword);
|
||||
v32 dst_bytes;
|
||||
|
||||
dst_bytes._v[0] = src_bytes[1];
|
||||
dst_bytes._v[1] = src_bytes[0];
|
||||
dst_bytes._v[2] = src_bytes[3];
|
||||
dst_bytes._v[3] = src_bytes[2];
|
||||
|
||||
return dst_bytes.HEX;
|
||||
};
|
||||
|
||||
// Instruction mocks because we don't have a working assembler (yet)
|
||||
auto encode_instruction = [](u32 opcode, bool end = false) -> v128
|
||||
{
|
||||
OPDEST dst{};
|
||||
dst.opcode = opcode;
|
||||
|
||||
if (end)
|
||||
{
|
||||
dst.end = 1;
|
||||
}
|
||||
|
||||
return v128::from32(swap_bytes16(dst.HEX), 0, 0, 0);
|
||||
};
|
||||
|
||||
auto create_if(u32 end, u32 _else = 0)
|
||||
{
|
||||
OPDEST dst{};
|
||||
dst.opcode = RSX_FP_OPCODE_IFE & 0x3Fu;
|
||||
|
||||
SRC1 src1{};
|
||||
src1.else_offset = (_else ? _else : end) << 2;
|
||||
src1.opcode_is_branch = 1;
|
||||
|
||||
SRC2 src2{};
|
||||
src2.end_offset = end << 2;
|
||||
|
||||
return v128::from32(swap_bytes16(dst.HEX), 0, swap_bytes16(src1.HEX), swap_bytes16(src2.HEX));
|
||||
};
|
||||
|
||||
TEST(CFG, FpToCFG_Basic)
|
||||
{
|
||||
rsx::simple_array<v128> buffer = {
|
||||
encode_instruction(RSX_FP_OPCODE_ADD),
|
||||
encode_instruction(RSX_FP_OPCODE_MOV, true)
|
||||
};
|
||||
|
||||
RSXFragmentProgram program{};
|
||||
program.data = buffer.data();
|
||||
|
||||
FlowGraph graph = deconstruct_fragment_program(program);
|
||||
|
||||
EXPECT_EQ(graph.blocks.size(), 1);
|
||||
EXPECT_EQ(graph.blocks.front().instructions.size(), 2);
|
||||
EXPECT_EQ(graph.blocks.front().instructions.front().length, 4);
|
||||
EXPECT_EQ(graph.blocks.front().instructions[0].addr, 0);
|
||||
EXPECT_EQ(graph.blocks.front().instructions[1].addr, 16);
|
||||
auto found = std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == id));
|
||||
return &(*found);
|
||||
}
|
||||
|
||||
TEST(CFG, FpToCFG_IF)
|
||||
{
|
||||
rsx::simple_array<v128> buffer = {
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 0
|
||||
encode_instruction(RSX_FP_OPCODE_MOV), // 1
|
||||
create_if(4), // 2 (BR, 4)
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 3
|
||||
encode_instruction(RSX_FP_OPCODE_MOV, true), // 4 (Merge block)
|
||||
};
|
||||
auto ir = FPIR::from_source(R"(
|
||||
ADD R0, R0, R0;
|
||||
MOV R1, R0;
|
||||
IF.LT;
|
||||
ADD R1, R1, R0;
|
||||
ENDIF;
|
||||
MOV R0, R1;
|
||||
)");
|
||||
|
||||
const std::pair<int, size_t> expected_block_data[3] = {
|
||||
{ 0, 3 }, // Head
|
||||
@ -93,7 +32,8 @@ namespace rsx::assembler
|
||||
};
|
||||
|
||||
RSXFragmentProgram program{};
|
||||
program.data = buffer.data();
|
||||
auto bytecode = ir.compile();
|
||||
program.data = bytecode.data();
|
||||
|
||||
FlowGraph graph = deconstruct_fragment_program(program);
|
||||
|
||||
@ -108,24 +48,26 @@ namespace rsx::assembler
|
||||
}
|
||||
|
||||
// Check edges
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 3))->pred[0].type, EdgeType::IF);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[0].type, EdgeType::IF);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 4))->pred[0].type, EdgeType::ENDIF);
|
||||
EXPECT_EQ(get_graph_block_by_id(graph, 3)->pred[0].type, EdgeType::IF);
|
||||
EXPECT_EQ(get_graph_block_by_id(graph, 0)->succ[0].type, EdgeType::IF);
|
||||
EXPECT_EQ(get_graph_block_by_id(graph, 4)->pred[0].type, EdgeType::ENDIF);
|
||||
}
|
||||
|
||||
TEST(CFG, FpToCFG_NestedIF)
|
||||
{
|
||||
rsx::simple_array<v128> buffer = {
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 0
|
||||
encode_instruction(RSX_FP_OPCODE_MOV), // 1
|
||||
create_if(8), // 2 (BR, 8)
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 3
|
||||
create_if(6), // 4 (BR, 6)
|
||||
encode_instruction(RSX_FP_OPCODE_MOV), // 5
|
||||
encode_instruction(RSX_FP_OPCODE_MOV), // 6 (merge block 1)
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 7
|
||||
encode_instruction(RSX_FP_OPCODE_MOV, true) // 8 (merge block 2
|
||||
};
|
||||
auto ir = FPIR::from_source(
|
||||
"ADD R0, R0, R0;" // 0
|
||||
"MOV R1, R0;" // 1
|
||||
"IF.LT;" // 2 (BR, 8)
|
||||
" ADD R1, R1, R0;" // 3
|
||||
" IF.GT;" // 4 (BR, 6)
|
||||
" MOV R3, R0;" // 5
|
||||
" ENDIF;"
|
||||
" MOV R2, R3;" // 6 (merge block 1)
|
||||
" ADD R1, R2, R1;" // 7
|
||||
"ENDIF;"
|
||||
"MOV R0, R1;" // 8 (merge block 2
|
||||
);
|
||||
|
||||
const std::pair<int, size_t> expected_block_data[5] = {
|
||||
{ 0, 3 }, // Head
|
||||
@ -136,7 +78,8 @@ namespace rsx::assembler
|
||||
};
|
||||
|
||||
RSXFragmentProgram program{};
|
||||
program.data = buffer.data();
|
||||
auto bytecode = ir.compile();
|
||||
program.data = bytecode.data();
|
||||
|
||||
FlowGraph graph = deconstruct_fragment_program(program);
|
||||
|
||||
@ -153,17 +96,19 @@ namespace rsx::assembler
|
||||
|
||||
TEST(CFG, FpToCFG_NestedIF_MultiplePred)
|
||||
{
|
||||
rsx::simple_array<v128> buffer = {
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 0
|
||||
encode_instruction(RSX_FP_OPCODE_MOV), // 1
|
||||
create_if(6), // 2 (BR, 6)
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 3
|
||||
create_if(6), // 4 (BR, 6)
|
||||
encode_instruction(RSX_FP_OPCODE_MOV), // 5
|
||||
encode_instruction(RSX_FP_OPCODE_MOV), // 6 (merge block)
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 7
|
||||
encode_instruction(RSX_FP_OPCODE_MOV, true) // 8
|
||||
};
|
||||
auto ir = FPIR::from_source(
|
||||
"ADD R0, R0, R0;" // 0
|
||||
"MOV R1, R0;" // 1
|
||||
"IF.LT;" // 2 (BR, 6)
|
||||
" ADD R1, R1, R0;" // 3
|
||||
" IF.GT;" // 4 (BR, 6)
|
||||
" MOV R3, R0;" // 5
|
||||
" ENDIF;" // ENDIF (4)
|
||||
"ENDIF;" // ENDIF (2)
|
||||
"MOV R2, R3;" // 6 (merge block, unified)
|
||||
"ADD R1, R2, R1;" // 7
|
||||
"MOV R0, R1;" // 8
|
||||
);
|
||||
|
||||
const std::pair<int, size_t> expected_block_data[4] = {
|
||||
{ 0, 3 }, // Head
|
||||
@ -173,7 +118,8 @@ namespace rsx::assembler
|
||||
};
|
||||
|
||||
RSXFragmentProgram program{};
|
||||
program.data = buffer.data();
|
||||
auto bytecode = ir.compile();
|
||||
program.data = bytecode.data();
|
||||
|
||||
FlowGraph graph = deconstruct_fragment_program(program);
|
||||
|
||||
@ -187,32 +133,40 @@ namespace rsx::assembler
|
||||
EXPECT_EQ(it->instructions.size(), expected.second);
|
||||
}
|
||||
|
||||
const BasicBlock
|
||||
*bb0 = get_graph_block_by_id(graph, 0),
|
||||
*bb6 = get_graph_block_by_id(graph, 6);
|
||||
|
||||
// Predecessors must be ordered, closest first
|
||||
ASSERT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred.size(), 2);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred[0].type, EdgeType::ENDIF);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred[0].from->id, 3);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred[1].type, EdgeType::ENDIF);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 6))->pred[1].from->id, 0);
|
||||
ASSERT_EQ(bb6->pred.size(), 3);
|
||||
EXPECT_EQ(bb6->pred[0].type, EdgeType::ENDIF);
|
||||
EXPECT_EQ(bb6->pred[0].from->id, 5);
|
||||
EXPECT_EQ(bb6->pred[1].type, EdgeType::ENDIF);
|
||||
EXPECT_EQ(bb6->pred[1].from->id, 3);
|
||||
EXPECT_EQ(bb6->pred[2].type, EdgeType::ENDIF);
|
||||
EXPECT_EQ(bb6->pred[2].from->id, 0);
|
||||
|
||||
// Successors must also be ordered, closest first
|
||||
ASSERT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ.size(), 2);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[0].type, EdgeType::IF);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[0].to->id, 3);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[1].type, EdgeType::ENDIF);
|
||||
EXPECT_EQ(std::find_if(graph.blocks.begin(), graph.blocks.end(), FN(x.id == 0))->succ[1].to->id, 6);
|
||||
ASSERT_EQ(bb0->succ.size(), 2);
|
||||
EXPECT_EQ(bb0->succ[0].type, EdgeType::IF);
|
||||
EXPECT_EQ(bb0->succ[0].to->id, 3);
|
||||
EXPECT_EQ(bb0->succ[1].type, EdgeType::ENDIF);
|
||||
EXPECT_EQ(bb0->succ[1].to->id, 6);
|
||||
}
|
||||
|
||||
TEST(CFG, FpToCFG_IF_ELSE)
|
||||
{
|
||||
rsx::simple_array<v128> buffer = {
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 0
|
||||
encode_instruction(RSX_FP_OPCODE_MOV), // 1
|
||||
create_if(6, 4), // 2 (BR, 6)
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 3
|
||||
encode_instruction(RSX_FP_OPCODE_MOV), // 4 (Else)
|
||||
encode_instruction(RSX_FP_OPCODE_ADD), // 5
|
||||
encode_instruction(RSX_FP_OPCODE_MOV, true), // 6 (Merge)
|
||||
};
|
||||
auto ir = FPIR::from_source(
|
||||
"ADD R0, R0, R0;" // 0
|
||||
"MOV R1, R0;" // 1
|
||||
"IF.LT;" // 2 (BR, 6)
|
||||
" ADD R1, R1, R0;" // 3
|
||||
"ELSE;" // ELSE (2)
|
||||
" MOV R2, R3;" // 4
|
||||
" ADD R1, R2, R1;" // 5
|
||||
"ENDIF;" // ENDIF (2)
|
||||
"MOV R0, R1;" // 6 (merge)
|
||||
);
|
||||
|
||||
const std::pair<int, size_t> expected_block_data[4] = {
|
||||
{ 0, 3 }, // Head
|
||||
@ -222,7 +176,8 @@ namespace rsx::assembler
|
||||
};
|
||||
|
||||
RSXFragmentProgram program{};
|
||||
program.data = buffer.data();
|
||||
auto bytecode = ir.compile();
|
||||
program.data = bytecode.data();
|
||||
|
||||
FlowGraph graph = deconstruct_fragment_program(program);
|
||||
|
||||
@ -235,5 +190,24 @@ namespace rsx::assembler
|
||||
EXPECT_EQ(it->id, expected.first);
|
||||
EXPECT_EQ(it->instructions.size(), expected.second);
|
||||
}
|
||||
|
||||
// The IF and ELSE branches don't link to each other directly. Their predecessor should point to both and they both point to the merge.
|
||||
const BasicBlock
|
||||
*bb0 = get_graph_block_by_id(graph, 0),
|
||||
*bb3 = get_graph_block_by_id(graph, 3),
|
||||
*bb4 = get_graph_block_by_id(graph, 4),
|
||||
*bb6 = get_graph_block_by_id(graph, 6);
|
||||
|
||||
EXPECT_EQ(bb0->succ.size(), 3);
|
||||
EXPECT_EQ(bb3->succ.size(), 1);
|
||||
EXPECT_EQ(bb4->succ.size(), 1);
|
||||
|
||||
EXPECT_EQ(bb3->succ.front().to, bb6);
|
||||
EXPECT_EQ(bb4->succ.front().to, bb6);
|
||||
|
||||
EXPECT_EQ(bb6->pred.size(), 3);
|
||||
EXPECT_EQ(bb6->pred[0].from, bb4);
|
||||
EXPECT_EQ(bb6->pred[1].from, bb3);
|
||||
EXPECT_EQ(bb6->pred[2].from, bb0);
|
||||
}
|
||||
}
|
||||
|
||||
761
rpcs3/tests/test_rsx_fp_asm.cpp
Normal file
761
rpcs3/tests/test_rsx_fp_asm.cpp
Normal file
@ -0,0 +1,761 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "Emu/RSX/Common/simple_array.hpp"
|
||||
#include "Emu/RSX/Program/Assembler/FPASM.h"
|
||||
#include "Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h"
|
||||
#include "Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.h"
|
||||
#include "Emu/RSX/Program/RSXFragmentProgram.h"
|
||||
|
||||
namespace rsx::assembler
|
||||
{
|
||||
#define DECLARE_REG32(num)\
|
||||
Register R##num{ .id = num, .f16 = false }
|
||||
|
||||
#define DECLARE_REG16(num)\
|
||||
Register H##num{ .id = num, .f16 = true }
|
||||
|
||||
DECLARE_REG32(0);
|
||||
DECLARE_REG32(1);
|
||||
DECLARE_REG32(2);
|
||||
DECLARE_REG32(3);
|
||||
DECLARE_REG32(4);
|
||||
DECLARE_REG32(5);
|
||||
DECLARE_REG32(6);
|
||||
DECLARE_REG32(7);
|
||||
DECLARE_REG32(8);
|
||||
|
||||
DECLARE_REG16(0);
|
||||
DECLARE_REG16(1);
|
||||
DECLARE_REG16(2);
|
||||
DECLARE_REG16(3);
|
||||
DECLARE_REG16(4);
|
||||
DECLARE_REG16(5);
|
||||
DECLARE_REG16(6);
|
||||
DECLARE_REG16(7);
|
||||
DECLARE_REG16(8);
|
||||
|
||||
#undef DECLARE_REG32
|
||||
#undef DECLARE_REG16
|
||||
|
||||
// Returns a pointer to the block at position 'index' (0-based, in iteration
// order) of graph.blocks. The index is validated with ensure(); nullptr is
// only returned if iteration runs out of blocks before reaching it.
static const BasicBlock* get_graph_block(const FlowGraph& graph, u32 index)
{
    ensure(index < graph.blocks.size());

    // Walk forward one block at a time; only begin()/end()/++ are required
    // of the container, same as the original loop.
    auto it = graph.blocks.begin();
    const auto last = graph.blocks.end();

    while (index != 0 && it != last)
    {
        ++it;
        --index;
    }

    return (it != last) ? &(*it) : nullptr;
}
|
||||
|
||||
// Test helper: assembles 'asm_' with FPIR::from_source and wraps the
// resulting instruction list in a FlowGraph consisting of one block.
// No control-flow analysis is performed here.
static FlowGraph CFG_from_source(const std::string& asm_)
{
    auto assembled = FPIR::from_source(asm_);

    FlowGraph result{};
    result.blocks.push_back({});
    result.blocks.back().instructions = assembled.instructions();

    return result;
}
|
||||
|
||||
// Assembles a two-instruction snippet and checks the encoded words:
// a MOV sourcing an inline constant followed by an ADD on temp registers.
TEST(TestFPIR, FromSource)
{
    auto ir = FPIR::from_source(R"(
MOV R0, #{ 0.125 };
ADD R1, R0, R0;
)");

    const auto instructions = ir.instructions();

    ASSERT_EQ(instructions.size(), 2);

    // Instruction 0: MOV R0, <constant>. Not the last instruction, so 'end' is clear.
    const OPDEST mov_dst{ .HEX = instructions[0].bytecode[0] };
    const SRC0 mov_src0{ .HEX = instructions[0].bytecode[1] };

    EXPECT_EQ(mov_dst.end, 0);
    EXPECT_EQ(mov_dst.opcode, RSX_FP_OPCODE_MOV);
    EXPECT_EQ(mov_src0.reg_type, RSX_FP_REGISTER_TYPE_CONSTANT);
    EXPECT_EQ(instructions[0].length, 8);

    // Instruction 1: ADD R1, R0, R0. Last instruction, so 'end' is set.
    const OPDEST add_dst{ .HEX = instructions[1].bytecode[0] };
    const SRC0 add_src0{ .HEX = instructions[1].bytecode[1] };

    EXPECT_EQ(add_dst.end, 1);
    EXPECT_EQ(add_dst.opcode, RSX_FP_OPCODE_ADD);
    EXPECT_EQ(add_dst.dest_reg, 1);
    EXPECT_EQ(add_dst.fp16, 0);
    EXPECT_EQ(add_src0.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
    EXPECT_EQ(instructions[1].length, 4);
}
|
||||
|
||||
// Runs the annotation pass over a single block and checks the recorded
// clobber and input register lists.
TEST(TestFPIR, RegisterAnnotationPass)
{
    // The snippet reads R0, R1 and H4, and clobbers R1 and H0
    auto graph = CFG_from_source(R"(
ADD R1, R0, R1;
MOV H0, H4;
)");

    ASSERT_EQ(graph.blocks.size(), 1);

    auto& bb = graph.blocks.front();
    ASSERT_EQ(bb.instructions.size(), 2);

    RSXFragmentProgram program{};
    FP::RegisterAnnotationPass annotate{ program };
    annotate.run(graph);

    // Written registers
    ASSERT_EQ(bb.clobber_list.size(), 2);
    // Read registers
    ASSERT_EQ(bb.input_list.size(), 3);

    EXPECT_EQ(bb.clobber_list[0].reg, H0);
    EXPECT_EQ(bb.clobber_list[1].reg, R1);

    EXPECT_EQ(bb.input_list[0].reg, H4);
    EXPECT_EQ(bb.input_list[1].reg, R0);
    EXPECT_EQ(bb.input_list[2].reg, R1);
}
|
||||
|
||||
// Annotation pass with a register that is both written and later read
// through its half-precision alias inside the same block.
TEST(TestFPIR, RegisterAnnotationPass_MixedIO)
{
    // The snippet reads R0 and R1, and clobbers R0, R1 and H0.
    // The H2 read is not counted as an input because R1 is clobbered first.
    auto graph = CFG_from_source(R"(
ADD R1, R0, R1;
PK8U R0, R1;
MOV H0, H2;
)");

    ASSERT_EQ(graph.blocks.size(), 1);

    auto& bb = graph.blocks.front();
    ASSERT_EQ(bb.instructions.size(), 3);

    RSXFragmentProgram program{};
    FP::RegisterAnnotationPass annotate{ program };
    annotate.run(graph);

    // Written registers
    ASSERT_EQ(bb.clobber_list.size(), 3);
    // Read registers
    ASSERT_EQ(bb.input_list.size(), 2);

    EXPECT_EQ(bb.clobber_list[0].reg, H0);
    EXPECT_EQ(bb.clobber_list[1].reg, R0);
    EXPECT_EQ(bb.clobber_list[2].reg, R1);

    EXPECT_EQ(bb.input_list[0].reg, R0);
    EXPECT_EQ(bb.input_list[1].reg, R1);
}
|
||||
|
||||
// Dependency pass, 32-bit write followed by a 16-bit read of the aliased
// half register: two UP2 instructions are expected to be inserted.
TEST(TestFPIR, RegisterDependencyPass_Simple16)
{
    // Instruction 2 clobbers R0, which in turn clobbers H0.
    // Instruction 3 reads from H0, so a barrier16 is needed between them.
    auto graph = CFG_from_source(R"(
ADD R1, R0, R1;
PK8U R0, R1;
MOV H2, H0;
)");

    ASSERT_EQ(graph.blocks.size(), 1);

    auto& bb = graph.blocks.front();
    ASSERT_EQ(bb.instructions.size(), 3);

    RSXFragmentProgram program{};
    FP::RegisterAnnotationPass annotate{ program };
    FP::RegisterDependencyPass resolve_deps{};

    annotate.run(graph);
    resolve_deps.run(graph);

    // The original 3 instructions plus the 2 inserted ones
    ASSERT_EQ(bb.instructions.size(), 5);

    // H0.xy = unpackHalf2(r0.x);
    const OPDEST xy_dst{ .HEX = bb.instructions[2].bytecode[0] };
    const SRC0 xy_src{ .HEX = bb.instructions[2].bytecode[1] };

    EXPECT_EQ(xy_dst.opcode, RSX_FP_OPCODE_UP2);
    EXPECT_EQ(xy_dst.fp16, 1);
    EXPECT_EQ(xy_dst.mask_x, true);
    EXPECT_EQ(xy_dst.mask_y, true);
    EXPECT_EQ(xy_dst.mask_z, false);
    EXPECT_EQ(xy_dst.mask_w, false);
    EXPECT_EQ(xy_src.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
    EXPECT_EQ(xy_src.tmp_reg_index, 0);
    EXPECT_EQ(xy_src.fp16, 0);
    EXPECT_EQ(xy_src.swizzle_x, 0);

    // H0.zw = unpackHalf2(r0.y);
    const OPDEST zw_dst{ .HEX = bb.instructions[3].bytecode[0] };
    const SRC0 zw_src{ .HEX = bb.instructions[3].bytecode[1] };

    EXPECT_EQ(zw_dst.opcode, RSX_FP_OPCODE_UP2);
    EXPECT_EQ(zw_dst.mask_x, false);
    EXPECT_EQ(zw_dst.mask_y, false);
    EXPECT_EQ(zw_dst.mask_z, true);
    EXPECT_EQ(zw_dst.mask_w, true);
    EXPECT_EQ(zw_src.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
    EXPECT_EQ(zw_src.tmp_reg_index, 0);
    EXPECT_EQ(zw_src.fp16, 0);
    EXPECT_EQ(zw_src.swizzle_x, 1);
}
|
||||
|
||||
// Dependency pass, 16-bit write followed by a 32-bit read of the aliased
// full register: two PK2 instructions are expected to be inserted.
TEST(TestFPIR, RegisterDependencyPass_Simple32)
{
    // Instruction 2 clobbers H1, which in turn clobbers R0.
    // Instruction 3 reads from R0, so a barrier32 is needed between them.
    auto graph = CFG_from_source(R"(
ADD R1, R0, R1;
MOV H1, R1
MOV R2, R0;
)");

    ASSERT_EQ(graph.blocks.size(), 1);

    auto& bb = graph.blocks.front();
    ASSERT_EQ(bb.instructions.size(), 3);

    RSXFragmentProgram program{};
    FP::RegisterAnnotationPass annotate{ program };
    FP::RegisterDependencyPass resolve_deps{};

    annotate.run(graph);
    resolve_deps.run(graph);

    // The original 3 instructions plus the 2 inserted ones
    ASSERT_EQ(bb.instructions.size(), 5);

    // R0.z = packHalf2(H1.xy);
    const OPDEST z_dst{ .HEX = bb.instructions[2].bytecode[0] };
    const SRC0 z_src{ .HEX = bb.instructions[2].bytecode[1] };

    EXPECT_EQ(z_dst.opcode, RSX_FP_OPCODE_PK2);
    EXPECT_EQ(z_dst.fp16, 0);
    EXPECT_EQ(z_dst.dest_reg, 0);
    EXPECT_EQ(z_dst.mask_x, false);
    EXPECT_EQ(z_dst.mask_y, false);
    EXPECT_EQ(z_dst.mask_z, true);
    EXPECT_EQ(z_dst.mask_w, false);
    EXPECT_EQ(z_src.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
    EXPECT_EQ(z_src.tmp_reg_index, 1);
    EXPECT_EQ(z_src.fp16, 1);
    EXPECT_EQ(z_src.swizzle_x, 0);
    EXPECT_EQ(z_src.swizzle_y, 1);

    // R0.w = packHalf2(H1.zw);
    const OPDEST w_dst{ .HEX = bb.instructions[3].bytecode[0] };
    const SRC0 w_src{ .HEX = bb.instructions[3].bytecode[1] };

    EXPECT_EQ(w_dst.opcode, RSX_FP_OPCODE_PK2);
    EXPECT_EQ(w_dst.fp16, 0);
    EXPECT_EQ(w_dst.dest_reg, 0);
    EXPECT_EQ(w_dst.mask_x, false);
    EXPECT_EQ(w_dst.mask_y, false);
    EXPECT_EQ(w_dst.mask_z, false);
    EXPECT_EQ(w_dst.mask_w, true);
    EXPECT_EQ(w_src.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
    EXPECT_EQ(w_src.tmp_reg_index, 1);
    EXPECT_EQ(w_src.fp16, 1);
    EXPECT_EQ(w_src.swizzle_x, 2);
    EXPECT_EQ(w_src.swizzle_y, 3);
}
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_Complex_IF_BothPredecessorsClobber)
|
||||
{
|
||||
// Multi-level but only single IF
|
||||
// Mockup of a simple lighting function, R0 = Light vector, R1 = Decompressed normal. DP4 used for simplicity.
|
||||
// Data hazards sprinkled in for testing. R3 is clobbered in the ancestor and the IF branch.
|
||||
// Barrier should go in the IF branch here.
|
||||
auto ir = FPIR::from_source(R"(
|
||||
DP4 R2, R0, R1
|
||||
SFL R3
|
||||
SGT R3, R2, R0
|
||||
IF.GE
|
||||
ADD R0, R0, R2
|
||||
MOV H6, #{ 0.25 }
|
||||
ENDIF
|
||||
ADD R0, R0, R3
|
||||
MOV R1, R0
|
||||
)");
|
||||
|
||||
auto bytecode = ir.compile();
|
||||
|
||||
RSXFragmentProgram prog{};
|
||||
prog.data = bytecode.data();
|
||||
|
||||
auto graph = deconstruct_fragment_program(prog);
|
||||
auto bb0 = get_graph_block(graph, 0);
|
||||
auto bb1 = get_graph_block(graph, 1);
|
||||
auto bb2 = get_graph_block(graph, 2);
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
ASSERT_EQ(bb0->instructions.size(), 4);
|
||||
ASSERT_EQ(bb1->instructions.size(), 2);
|
||||
ASSERT_EQ(bb2->instructions.size(), 2);
|
||||
|
||||
// bb1 has an epilogue
|
||||
ASSERT_EQ(bb1->epilogue.size(), 2);
|
||||
|
||||
// bb1 epilogue updates R3.xy
|
||||
|
||||
// R3.x = packHalf2(H6.xy)
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.fp16, 0);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.dest_reg, 3);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_x, true);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_y, false);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_z, false);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_w, false);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.tmp_reg_index, 6);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.fp16, 1);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_x, 0);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_y, 1);
|
||||
|
||||
// R3.y = packHalf2(H6.zw)
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.fp16, 0);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.dest_reg, 3);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_x, false);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_y, true);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_z, false);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_w, false);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.tmp_reg_index, 6);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.fp16, 1);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_x, 2);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_y, 3);
|
||||
}
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_Complex_IF_ELSE_OneBranchClobbers)
|
||||
{
|
||||
// Single IF-ELSE: the IF branch clobbers, the ELSE branch does not
|
||||
auto ir = FPIR::from_source(R"(
|
||||
DP4 R2, R0, R1
|
||||
SFL R3
|
||||
SGT R3, R2, R0
|
||||
IF.GE
|
||||
ADD R0, R0, R2
|
||||
MOV H6, #{ 0.25 }
|
||||
ELSE
|
||||
ADD R0, R0, R1
|
||||
ENDIF
|
||||
ADD R0, R0, R3
|
||||
MOV R1, R0
|
||||
)");
|
||||
|
||||
auto bytecode = ir.compile();
|
||||
|
||||
RSXFragmentProgram prog{};
|
||||
prog.data = bytecode.data();
|
||||
auto graph = deconstruct_fragment_program(prog);
|
||||
|
||||
ASSERT_EQ(graph.blocks.size(), 4);
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
const BasicBlock
|
||||
*bb0 = get_graph_block(graph, 0),
|
||||
*bb1 = get_graph_block(graph, 1),
|
||||
*bb2 = get_graph_block(graph, 2),
|
||||
*bb3 = get_graph_block(graph, 3);
|
||||
|
||||
ASSERT_EQ(bb0->instructions.size(), 4);
|
||||
ASSERT_EQ(bb1->instructions.size(), 2);
|
||||
ASSERT_EQ(bb2->instructions.size(), 1);
|
||||
ASSERT_EQ(bb3->instructions.size(), 2);
|
||||
|
||||
// bb1 has an epilogue; bb0 and bb2 do not
|
||||
ASSERT_EQ(bb0->epilogue.size(), 0);
|
||||
ASSERT_EQ(bb1->epilogue.size(), 2);
|
||||
ASSERT_EQ(bb2->epilogue.size(), 0);
|
||||
|
||||
// bb1 epilogue updates R3.xy
|
||||
|
||||
// R3.x = packHalf2(H6.xy)
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.fp16, 0);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.dest_reg, 3);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_x, true);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_y, false);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_z, false);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_w, false);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.tmp_reg_index, 6);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.fp16, 1);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_x, 0);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_y, 1);
|
||||
|
||||
// R3.y = packHalf2(H6.zw)
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.fp16, 0);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.dest_reg, 3);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_x, false);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_y, true);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_z, false);
|
||||
EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_w, false);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.tmp_reg_index, 6);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.fp16, 1);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_x, 2);
|
||||
EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_y, 3);
|
||||
}
|
||||
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_Complex_IF_ELSE_Simpsons)
|
||||
{
|
||||
// Complex IF-ELSE nest observed in The Simpsons Game. Rewritten for simplicity.
|
||||
// There is no tail block. No epilogues should be injected in this scenario since H4 (the trigger) is defined on all branches.
|
||||
// R2 is indeed clobbered but the outer ELSE branch should not be able to see the inner IF-ELSE blocks as predecessors.
|
||||
auto ir = FPIR::from_source(R"(
|
||||
MOV R2, #{ 0.25 };
|
||||
IF.GT;
|
||||
SLT R4, H2, #{ 0.125 };
|
||||
IF.GT;
|
||||
ADD H2, H0, H3;
|
||||
FMA H4, R2, H2, H3;
|
||||
ELSE;
|
||||
MOV H2, #{ 0.125 };
|
||||
ADD H0, H0, H2;
|
||||
FMA H4, R2, H2, H3;
|
||||
ENDIF;
|
||||
ELSE;
|
||||
FMA H4, R2, H2, H3;
|
||||
MOV H0, H4;
|
||||
ENDIF;
|
||||
)");
|
||||
|
||||
auto bytecode = ir.compile();
|
||||
|
||||
RSXFragmentProgram prog{};
|
||||
prog.data = bytecode.data();
|
||||
auto graph = deconstruct_fragment_program(prog);
|
||||
|
||||
ASSERT_EQ(graph.blocks.size(), 6);
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
const BasicBlock
|
||||
*bb0 = get_graph_block(graph, 0),
|
||||
*bb1 = get_graph_block(graph, 1),
|
||||
*bb2 = get_graph_block(graph, 2),
|
||||
*bb3 = get_graph_block(graph, 3),
|
||||
*bb4 = get_graph_block(graph, 4),
|
||||
*bb5 = get_graph_block(graph, 5);
|
||||
|
||||
// Sanity
|
||||
EXPECT_EQ(bb0->instructions.size(), 2);
|
||||
EXPECT_EQ(bb1->instructions.size(), 2);
|
||||
EXPECT_EQ(bb2->instructions.size(), 2);
|
||||
EXPECT_EQ(bb3->instructions.size(), 3);
|
||||
EXPECT_EQ(bb4->instructions.size(), 2);
|
||||
EXPECT_EQ(bb5->instructions.size(), 0); // Phi/Merge only.
|
||||
|
||||
// Nested children must recursively fall out to the closest ENDIF
|
||||
ASSERT_EQ(bb4->pred.size(), 1);
|
||||
EXPECT_EQ(bb4->pred.front().type, EdgeType::ELSE);
|
||||
EXPECT_EQ(bb5->pred.size(), 4); // 2 IF and 2 ELSE paths exist
|
||||
|
||||
// Check that we get no epilogues
|
||||
EXPECT_EQ(bb0->epilogue.size(), 0);
|
||||
EXPECT_EQ(bb1->epilogue.size(), 0);
|
||||
EXPECT_EQ(bb2->epilogue.size(), 0);
|
||||
EXPECT_EQ(bb3->epilogue.size(), 0);
|
||||
EXPECT_EQ(bb4->epilogue.size(), 0);
|
||||
EXPECT_EQ(bb5->epilogue.size(), 0);
|
||||
}
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_Partial32_0)
|
||||
{
|
||||
// Instruction 2 partially clobbers H1 which in turn clobbers R0.
|
||||
// Instruction 3 reads from R0 so a partial barrier32 is needed between them.
|
||||
auto graph = CFG_from_source(R"(
|
||||
ADD R1, R0, R1;
|
||||
MOV H1.x, R1.x;
|
||||
MOV R2, R0;
|
||||
)");
|
||||
|
||||
ASSERT_EQ(graph.blocks.size(), 1);
|
||||
ASSERT_EQ(graph.blocks.front().instructions.size(), 3);
|
||||
|
||||
auto& block = graph.blocks.front();
|
||||
RSXFragmentProgram prog{};
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
ASSERT_EQ(block.instructions.size(), 4);
|
||||
|
||||
OPDEST dst{ .HEX = block.instructions[2].bytecode[0] };
|
||||
SRC0 src0{ .HEX = block.instructions[2].bytecode[1] };
|
||||
SRC1 src1{ .HEX = block.instructions[2].bytecode[2] };
|
||||
|
||||
const u32 opcode = dst.opcode | (src1.opcode_hi << 6);
|
||||
|
||||
// R0.z = OR16_LO(R0.z, H1.x);
|
||||
EXPECT_EQ(opcode, RSX_FP_OPCODE_OR16_LO);
|
||||
EXPECT_EQ(dst.fp16, 0);
|
||||
EXPECT_EQ(dst.dest_reg, 0);
|
||||
EXPECT_EQ(dst.mask_x, false);
|
||||
EXPECT_EQ(dst.mask_y, false);
|
||||
EXPECT_EQ(dst.mask_z, true);
|
||||
EXPECT_EQ(dst.mask_w, false);
|
||||
EXPECT_EQ(src0.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
|
||||
EXPECT_EQ(src0.tmp_reg_index, 0);
|
||||
EXPECT_EQ(src0.fp16, 0);
|
||||
EXPECT_EQ(src0.swizzle_x, 2);
|
||||
EXPECT_EQ(src1.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
|
||||
EXPECT_EQ(src1.tmp_reg_index, 1);
|
||||
EXPECT_EQ(src1.fp16, 1);
|
||||
EXPECT_EQ(src1.swizzle_x, 0);
|
||||
}
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_Partial32_1)
|
||||
{
|
||||
// Instruction 2 partially clobbers H1 which in turn clobbers R0.
|
||||
// Instruction 3 reads from R0 so a partial barrier32 is needed between them.
|
||||
auto graph = CFG_from_source(R"(
|
||||
ADD R1, R0, R1;
|
||||
MOV H1.y, R1.y;
|
||||
MOV R2, R0;
|
||||
)");
|
||||
|
||||
ASSERT_EQ(graph.blocks.size(), 1);
|
||||
ASSERT_EQ(graph.blocks.front().instructions.size(), 3);
|
||||
|
||||
auto& block = graph.blocks.front();
|
||||
RSXFragmentProgram prog{};
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
ASSERT_EQ(block.instructions.size(), 4);
|
||||
|
||||
OPDEST dst{ .HEX = block.instructions[2].bytecode[0] };
|
||||
SRC0 src0{ .HEX = block.instructions[2].bytecode[1] };
|
||||
SRC1 src1{ .HEX = block.instructions[2].bytecode[2] };
|
||||
|
||||
const u32 opcode = dst.opcode | (src1.opcode_hi << 6);
|
||||
|
||||
// R0.z = OR16_HI(R0.z, H1.y);
|
||||
EXPECT_EQ(opcode, RSX_FP_OPCODE_OR16_HI);
|
||||
EXPECT_EQ(dst.fp16, 0);
|
||||
EXPECT_EQ(dst.dest_reg, 0);
|
||||
EXPECT_EQ(dst.mask_x, false);
|
||||
EXPECT_EQ(dst.mask_y, false);
|
||||
EXPECT_EQ(dst.mask_z, true);
|
||||
EXPECT_EQ(dst.mask_w, false);
|
||||
EXPECT_EQ(src0.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
|
||||
EXPECT_EQ(src0.tmp_reg_index, 0);
|
||||
EXPECT_EQ(src0.fp16, 0);
|
||||
EXPECT_EQ(src0.swizzle_x, 2);
|
||||
EXPECT_EQ(src1.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
|
||||
EXPECT_EQ(src1.tmp_reg_index, 1);
|
||||
EXPECT_EQ(src1.fp16, 1);
|
||||
EXPECT_EQ(src1.swizzle_x, 1);
|
||||
}
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_SkipDelaySlots)
|
||||
{
|
||||
// Instruction 2 clobbers H1 which in turn clobbers R0.
|
||||
// Instruction 3 reads from R0 but is a delay slot that does nothing and can be NOPed.
|
||||
auto graph = CFG_from_source(R"(
|
||||
ADD R1, R0, R1;
|
||||
MOV H1, R1
|
||||
MOV R0, R0;
|
||||
)");
|
||||
|
||||
ASSERT_EQ(graph.blocks.size(), 1);
|
||||
ASSERT_EQ(graph.blocks.front().instructions.size(), 3);
|
||||
|
||||
auto& block = graph.blocks.front();
|
||||
RSXFragmentProgram prog{};
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog, { .skip_delay_slots = true } };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
// Delay slot detection will cause no dependency injection
|
||||
ASSERT_EQ(block.instructions.size(), 3);
|
||||
}
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_Skip_IF_ELSE_Ancestors)
|
||||
{
|
||||
// R4/H8 is clobbered but an IF-ELSE chain follows it.
|
||||
// Merge block reads H8, but since both IF-ELSE legs resolve the dependency, we do not need a barrier for H8.
|
||||
// H6 is included as a control.
|
||||
auto ir = FPIR::from_source(R"(
|
||||
MOV R4, #{ 0.25 }
|
||||
MOV H6.x, #{ 0.125 }
|
||||
IF.LT
|
||||
MOV H8, #{ 0.0 }
|
||||
ELSE
|
||||
MOV H8, #{ 0.25 }
|
||||
ENDIF
|
||||
ADD R0, R3, H8
|
||||
)");
|
||||
|
||||
auto bytecode = ir.compile();
|
||||
RSXFragmentProgram prog{};
|
||||
prog.data = bytecode.data();
|
||||
auto graph = deconstruct_fragment_program(prog);
|
||||
|
||||
// Verify state before
|
||||
ASSERT_EQ(graph.blocks.size(), 4);
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 3);
|
||||
EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 3)->instructions.size(), 1);
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog, {.skip_delay_slots = true } };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
// We get one barrier on R3 (H6) but none for R4 (H8)
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->epilogue.size(), 1);
|
||||
|
||||
// No intra-block barriers
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 3);
|
||||
EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 3)->instructions.size(), 1);
|
||||
}
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_Process_IF_Ancestors)
|
||||
{
|
||||
// H8.x is clobbered but only an IF sequence follows with no ELSE.
|
||||
// Merge block reads R4.x. With no ELSE leg, the IF alone does not resolve the dependency on every path, so a barrier is still needed.
|
||||
auto ir = FPIR::from_source(R"(
|
||||
MOV H8.x, #{ 0.25 }
|
||||
IF.LT
|
||||
MOV R4.x, #{ 0.0 }
|
||||
ENDIF
|
||||
MOV R0, R4
|
||||
)");
|
||||
|
||||
auto bytecode = ir.compile();
|
||||
RSXFragmentProgram prog{};
|
||||
prog.data = bytecode.data();
|
||||
auto graph = deconstruct_fragment_program(prog);
|
||||
|
||||
// Verify state before
|
||||
ASSERT_EQ(graph.blocks.size(), 3);
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 2);
|
||||
EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 1);
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog, {.skip_delay_slots = true } };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
// A barrier will be inserted into block 0 epilogue
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 2);
|
||||
EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 1);
|
||||
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->epilogue.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 1)->epilogue.size(), 0);
|
||||
EXPECT_EQ(get_graph_block(graph, 2)->epilogue.size(), 0);
|
||||
}
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_Complex_IF_ELSE_Ancestor_Clobber)
|
||||
{
|
||||
// 2 clobbered registers up the chain.
|
||||
// 1 full barrier is needed for R4 (4 instructions)
|
||||
auto ir = FPIR::from_source(R"(
|
||||
MOV R4, #{ 0.0 }
|
||||
IF.LT
|
||||
MOV H9, #{ 0.25 }
|
||||
ENDIF
|
||||
MOV H8, #{ 0.25 }
|
||||
IF.LT
|
||||
IF.GT
|
||||
ADD R0, R0, R0
|
||||
ELSE
|
||||
ADD R0, R1, R0
|
||||
ENDIF
|
||||
ENDIF
|
||||
ADD R0, R0, R4
|
||||
)");
|
||||
|
||||
auto bytecode = ir.compile();
|
||||
RSXFragmentProgram prog{};
|
||||
prog.data = bytecode.data();
|
||||
auto graph = deconstruct_fragment_program(prog);
|
||||
|
||||
// Verify state before
|
||||
ASSERT_EQ(graph.blocks.size(), 7);
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 2);
|
||||
EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 2);
|
||||
EXPECT_EQ(get_graph_block(graph, 3)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 4)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 5)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 6)->instructions.size(), 1);
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog, {.skip_delay_slots = true } };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
// Full-lane barrier on writing blocks
|
||||
EXPECT_EQ(get_graph_block(graph, 1)->epilogue.size(), 2);
|
||||
EXPECT_EQ(get_graph_block(graph, 2)->epilogue.size(), 2);
|
||||
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 2);
|
||||
EXPECT_EQ(get_graph_block(graph, 1)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 2)->instructions.size(), 2);
|
||||
EXPECT_EQ(get_graph_block(graph, 3)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 4)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 5)->instructions.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 6)->instructions.size(), 1);
|
||||
}
|
||||
|
||||
TEST(TestFPIR, RegisterDependencyPass_SplinterCell_DelaySlot)
|
||||
{
|
||||
// Real shader pattern found in Splinter Cell: Blacklist.
|
||||
// TEX instructions replaced with MOV for simplicity.
|
||||
// There are no dependent reads here, no barriers are expected.
|
||||
// In the game, instruction 4 was misclassified as a delay slot, causing a skipped clobber.
|
||||
auto ir = FPIR::from_source(R"(
|
||||
MOV R0.w, #{ 0.25 }
|
||||
MOV H0, H8
|
||||
MUL R0.w, H0.w, R0.w
|
||||
MOV R0.xyz, H0.xyz
|
||||
MOV R1, #{ 0.25 }
|
||||
FMA H0, R0, #{ 0.125 }, R1
|
||||
)");
|
||||
|
||||
auto bytecode = ir.compile();
|
||||
RSXFragmentProgram prog{};
|
||||
prog.data = bytecode.data();
|
||||
auto graph = deconstruct_fragment_program(prog);
|
||||
|
||||
// Verify state before
|
||||
ASSERT_EQ(graph.blocks.size(), 1);
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 6);
|
||||
|
||||
FP::RegisterAnnotationPass annotation_pass{ prog, {.skip_delay_slots = true } };
|
||||
FP::RegisterDependencyPass deps_pass{};
|
||||
|
||||
annotation_pass.run(graph);
|
||||
deps_pass.run(graph);
|
||||
|
||||
// Verify state after
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->instructions.size(), 6);
|
||||
EXPECT_EQ(get_graph_block(graph, 0)->epilogue.size(), 0);
|
||||
}
|
||||
}
|
||||
@ -324,6 +324,40 @@ namespace rsx
|
||||
EXPECT_EQ(arr.find_if(FN(x == 99)), nullptr);
|
||||
}
|
||||
|
||||
TEST(SimpleArray, InsertArray)
|
||||
{
|
||||
rsx::simple_array<int> arr{
|
||||
0, 1, 2, 6, 7, 8, 9
|
||||
};
|
||||
|
||||
const std::vector<int> tail{
|
||||
10, 11, 12
|
||||
};
|
||||
|
||||
const std::vector<int> mid{
|
||||
3, 4, 5
|
||||
};
|
||||
|
||||
// Insert end
|
||||
arr.insert(arr.end(), tail);
|
||||
EXPECT_EQ(arr.size(), 10);
|
||||
|
||||
// Insert mid
|
||||
auto it = arr.begin();
|
||||
std::advance(it, 3);
|
||||
it = arr.insert(it, mid);
|
||||
|
||||
EXPECT_EQ(arr.size(), 13);
|
||||
EXPECT_EQ(std::distance(arr.begin(), it), 3);
|
||||
EXPECT_EQ(*it, 3);
|
||||
|
||||
// Verify
|
||||
for (unsigned i = 0; i < arr.size(); ++i)
|
||||
{
|
||||
EXPECT_EQ(arr[i], static_cast<int>(i));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AlignedAllocator, Alloc)
|
||||
{
|
||||
auto ptr = rsx::aligned_allocator::malloc<256>(16);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user