diff --git a/rpcs3/Emu/RSX/Program/Assembler/FPASM.cpp b/rpcs3/Emu/RSX/Program/Assembler/FPASM.cpp index cddc562fa5..4be6acb55b 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/FPASM.cpp +++ b/rpcs3/Emu/RSX/Program/Assembler/FPASM.cpp @@ -263,13 +263,26 @@ namespace rsx::assembler { ensure(reg.length() > 1, "Invalid register specifier"); - const auto index = std::stoi(reg.substr(1)); + const auto parts = fmt::split(reg, { "." }); + ensure(parts.size() > 0 && parts.size() <= 2); + + const auto index = std::stoi(parts[0].substr(1)); RegisterRef ref { .reg { .id = index, .f16 = false }, .mask = 0x0F }; + if (parts.size() > 1 && parts[1].length() > 0) + { + // FIXME: No swizzles for now, just lane masking + ref.mask = 0; + if (parts[1].find("x") != std::string::npos) ref.mask |= (1u << 0); + if (parts[1].find("y") != std::string::npos) ref.mask |= (1u << 1); + if (parts[1].find("z") != std::string::npos) ref.mask |= (1u << 2); + if (parts[1].find("w") != std::string::npos) ref.mask |= (1u << 3); + } + if (reg[0] == 'H' || reg[0] == 'h') { ref.reg.f16 = true; @@ -325,7 +338,7 @@ namespace rsx::assembler do { \ inst->opcode = encoding.op; \ d0.opcode = encoding.op & 0x3F; \ - s1.opcode_is_branch = (encoding.op > 0x3F)? 1 : 0; \ + s1.opcode_hi = (encoding.op > 0x3F)? 1 : 0; \ s0.exec_if_eq = encoding.exec_if_eq ? 1 : 0; \ s0.exec_if_gr = encoding.exec_if_gt ? 1 : 0; \ s0.exec_if_lt = encoding.exec_if_lt ? 
1 : 0; \ diff --git a/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h b/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h index b2297a24ab..4e7f65f22b 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h +++ b/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h @@ -75,7 +75,12 @@ namespace rsx::assembler RSX_FP_OPCODE_IFE = 0x42, // If RSX_FP_OPCODE_LOOP = 0x43, // Loop RSX_FP_OPCODE_REP = 0x44, // Repeat - RSX_FP_OPCODE_RET = 0x45 // Return + RSX_FP_OPCODE_RET = 0x45, // Return + + + // Custom opcodes for dependency injection + RSX_FP_OPCODE_OR16_LO = 0x46, // Performs a 16-bit OR, taking one register channel as input and overwriting low 16 bits of the output + RSX_FP_OPCODE_OR16_HI = 0x47, // Same as the lo variant but now overwrites the high 16-bit block }; namespace FP diff --git a/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp b/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp index 82f5464a0a..577252fd83 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp +++ b/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp @@ -159,7 +159,7 @@ namespace rsx::assembler src2.HEX = decoded._u32[3]; end = !!dst.end; - const u32 opcode = dst.opcode | (src1.opcode_is_branch << 6); + const u32 opcode = dst.opcode | (src1.opcode_hi << 6); if (opcode == RSX_FP_OPCODE_NOP) { diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp index b7e3dc2116..4f8483d91d 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp @@ -22,6 +22,20 @@ namespace rsx::assembler::FP std::unordered_map sync_register_map; }; + enum Register32BarrierFlags + { + NONE = 0, + OR_WORD0 = 1, + OR_WORD1 = 2, + DEFAULT = OR_WORD0 | OR_WORD1 + }; + + struct RegisterBarrier32 + { + RegisterRef ref; + u32 flags[4]; + }; + std::vector decode_lanes16(const std::unordered_set& lanes) { std::vector result; @@ -47,34 +61,45 @@ namespace 
rsx::assembler::FP return result; } - std::vector decode_lanes32(const std::unordered_set& lanes) + std::vector decode_lanes32(const std::unordered_set& lanes) { - std::vector result; + std::vector result; for (u32 index = 0, file_offset = 0; index < 48; ++index, file_offset += 16) { // Each register has 8 16-bit lanes + RegisterBarrier32 barrier{}; + auto& ref = barrier.ref; - u32 mask = 0; - if (lanes.contains(file_offset + 0) || lanes.contains(file_offset + 2)) mask |= (1u << 0); - if (lanes.contains(file_offset + 4) || lanes.contains(file_offset + 6)) mask |= (1u << 1); - if (lanes.contains(file_offset + 8) || lanes.contains(file_offset + 10)) mask |= (1u << 2); - if (lanes.contains(file_offset + 12) || lanes.contains(file_offset + 14)) mask |= (1u << 3); + for (u32 lane = 0; lane < 16; lane += 2) + { + if (!lanes.contains(file_offset + lane)) + { + continue; + } - if (mask == 0) + const u32 ch = (lane / 4); + const u32 flags = (lane & 3) + ? Register32BarrierFlags::OR_WORD1 + : Register32BarrierFlags::OR_WORD0; + + ref.mask |= (1u << ch); + barrier.flags[ch] |= flags; + } + + if (ref.mask == 0) { continue; } - RegisterRef ref{ .reg{.id = static_cast(index), .f16 = false } }; - ref.mask = mask; - result.push_back(ref); + ref.reg = {.id = static_cast(index), .f16 = false }; + result.push_back(barrier); } return result; } - std::vector build_barrier32(const RegisterRef& reg) + std::vector build_barrier32(const RegisterBarrier32& barrier) { // Upto 4 instructions are needed per 32-bit register // R0.x = packHalf2x16(H0.xy) @@ -84,28 +109,27 @@ namespace rsx::assembler::FP std::vector result; - for (u32 mask = reg.mask, ch = 0; mask > 0; mask >>= 1, ++ch) + for (u32 mask = barrier.ref.mask, ch = 0; mask > 0; mask >>= 1, ++ch) { if (!(mask & 1)) { continue; } + const auto& reg = barrier.ref.reg; + const auto reg_id = reg.id; + Instruction instruction{}; OPDEST dst{}; - dst.opcode = RSX_FP_OPCODE_PK2; dst.prec = RSX_FP_PRECISION_REAL; dst.fp16 = 0; - dst.dest_reg = 
reg.reg.id; + dst.dest_reg = reg_id; dst.write_mask = (1u << ch); - const u32 src_reg_id = (ch / 2) + (reg.reg.id * 2); + const u32 src_reg_id = (ch / 2) + (reg_id * 2); const bool is_word0 = !(ch & 1); // Only even SRC0 src0{}; - src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1; - src0.fp16 = 1; - if (is_word0) { src0.swizzle_x = 0; @@ -121,14 +145,50 @@ namespace rsx::assembler::FP src0.swizzle_w = 3; src0.reg_type = RSX_FP_REGISTER_TYPE_TEMP; src0.tmp_reg_index = src_reg_id; + src0.fp16 = 1; - instruction.opcode = dst.opcode; + // Prepare source 1 to match the output in case we need to encode an OR + SRC1 src1{}; + src1.reg_type = RSX_FP_REGISTER_TYPE_TEMP; + src1.tmp_reg_index = reg_id; + src1.swizzle_x = ch; + src1.swizzle_y = ch; + src1.swizzle_z = ch; + src1.swizzle_w = ch; + + u32 opcode = 0; + switch (barrier.flags[ch]) + { + case Register32BarrierFlags::DEFAULT: + opcode = RSX_FP_OPCODE_PK2; + break; + case Register32BarrierFlags::OR_WORD0: + opcode = RSX_FP_OPCODE_OR16_LO; + // Swap inputs + std::swap(src0.HEX, src1.HEX); + break; + case Register32BarrierFlags::OR_WORD1: + opcode = RSX_FP_OPCODE_OR16_HI; + src0.swizzle_x = src0.swizzle_y; + std::swap(src0.HEX, src1.HEX); + break; + case Register32BarrierFlags::NONE: + default: + fmt::throw_exception("Unexpected lane barrier with no mask."); + } + + dst.opcode = opcode & 0x3F; + src1.opcode_hi = (opcode > 0x3F) ? 
1 : 0; + src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1; + + instruction.opcode = opcode; instruction.bytecode[0] = dst.HEX; instruction.bytecode[1] = src0.HEX; + instruction.bytecode[2] = src1.HEX; Register src_reg{ .id = static_cast(src_reg_id), .f16 = true }; - instruction.srcs.push_back({ .reg=src_reg, .mask=0xF }); - instruction.dsts.push_back({ .reg{ .id = reg.reg.id, .f16 = false }, .mask = (1u << ch) }); + instruction.srcs.push_back({ .reg = src_reg, .mask = 0xF }); + instruction.dsts.push_back({ .reg{ .id = reg_id, .f16 = false }, .mask = (1u << ch) }); result.push_back(instruction); } @@ -207,10 +267,22 @@ namespace rsx::assembler::FP { std::vector result; - const auto regs = (f16 ? decode_lanes16 : decode_lanes32)(lanes); - for (const auto& ref : regs) + if (f16) { - auto instructions = (f16 ? build_barrier16 : build_barrier32)(ref); + const auto regs = decode_lanes16(lanes); + for (const auto& ref : regs) + { + auto instructions = build_barrier16(ref); + result.insert(result.end(), instructions.begin(), instructions.end()); + } + + return result; + } + + const auto barriers = decode_lanes32(lanes); + for (const auto& barrier : barriers) + { + auto instructions = build_barrier32(barrier); result.insert(result.end(), instructions.begin(), instructions.end()); } diff --git a/rpcs3/Emu/RSX/Program/CgBinaryFragmentProgram.cpp b/rpcs3/Emu/RSX/Program/CgBinaryFragmentProgram.cpp index a06818de10..1dfe83e468 100644 --- a/rpcs3/Emu/RSX/Program/CgBinaryFragmentProgram.cpp +++ b/rpcs3/Emu/RSX/Program/CgBinaryFragmentProgram.cpp @@ -273,7 +273,7 @@ void CgBinaryDisasm::TaskFP() src2.HEX = GetData(data[3]); m_step = 4 * sizeof(u32); - m_opcode = dst.opcode | (src1.opcode_is_branch << 6); + m_opcode = dst.opcode | (src1.opcode_hi << 6); auto SCT = [&]() { diff --git a/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h b/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h index e20098ff57..d93ec760e6 100644 --- a/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h +++ 
b/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h @@ -102,7 +102,7 @@ union SRC1 u32 src1_prec_mod : 3; // Precision modifier for src1 (CoD:MW series) u32 src2_prec_mod : 3; // Precision modifier for src2 (unproven, should affect MAD instruction) u32 scale : 3; - u32 opcode_is_branch : 1; + u32 opcode_hi : 1; // Opcode high bit }; struct diff --git a/rpcs3/tests/test_rsx_fp_asm.cpp b/rpcs3/tests/test_rsx_fp_asm.cpp index 752baf0776..be4f19abc6 100644 --- a/rpcs3/tests/test_rsx_fp_asm.cpp +++ b/rpcs3/tests/test_rsx_fp_asm.cpp @@ -472,4 +472,100 @@ namespace rsx::assembler EXPECT_EQ(bb4->epilogue.size(), 0); EXPECT_EQ(bb5->epilogue.size(), 0); } + + TEST(TestFPIR, RegisterDependencyPass_Partial32_0) + { + // Instruction 2 partially clobbers H1 which in turn clobbers R0. + // Instruction 3 reads from R0 so a partial barrier32 is needed between them. + auto graph = CFG_from_source(R"( + ADD R1, R0, R1; + MOV H1.x, R1.x; + MOV R2, R0; + )"); + + ASSERT_EQ(graph.blocks.size(), 1); + ASSERT_EQ(graph.blocks.front().instructions.size(), 3); + + auto& block = graph.blocks.front(); + RSXFragmentProgram prog{}; + + FP::RegisterAnnotationPass annotation_pass{ prog }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + ASSERT_EQ(block.instructions.size(), 4); + + OPDEST dst{ .HEX = block.instructions[2].bytecode[0] }; + SRC0 src0{ .HEX = block.instructions[2].bytecode[1] }; + SRC1 src1{ .HEX = block.instructions[2].bytecode[2] }; + + const u32 opcode = dst.opcode | (src1.opcode_hi << 6); + + // R0.z = OR16_LO(R0.z, H1.x); + EXPECT_EQ(opcode, RSX_FP_OPCODE_OR16_LO); + EXPECT_EQ(dst.fp16, 0); + EXPECT_EQ(dst.dest_reg, 0); + EXPECT_EQ(dst.mask_x, false); + EXPECT_EQ(dst.mask_y, false); + EXPECT_EQ(dst.mask_z, true); + EXPECT_EQ(dst.mask_w, false); + EXPECT_EQ(src0.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(src0.tmp_reg_index, 0); + EXPECT_EQ(src0.fp16, 0); + EXPECT_EQ(src0.swizzle_x, 2); + EXPECT_EQ(src1.reg_type,
RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(src1.tmp_reg_index, 1); + EXPECT_EQ(src1.fp16, 1); + EXPECT_EQ(src1.swizzle_x, 0); + } + + TEST(TestFPIR, RegisterDependencyPass_Partial32_1) + { + // Instruction 2 partially clobbers H1 which in turn clobbers R0. + // Instruction 3 reads from R0 so a partial barrier32 is needed between them. + auto graph = CFG_from_source(R"( + ADD R1, R0, R1; + MOV H1.y, R1.y; + MOV R2, R0; + )"); + + ASSERT_EQ(graph.blocks.size(), 1); + ASSERT_EQ(graph.blocks.front().instructions.size(), 3); + + auto& block = graph.blocks.front(); + RSXFragmentProgram prog{}; + + FP::RegisterAnnotationPass annotation_pass{ prog }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + ASSERT_EQ(block.instructions.size(), 4); + + OPDEST dst{ .HEX = block.instructions[2].bytecode[0] }; + SRC0 src0{ .HEX = block.instructions[2].bytecode[1] }; + SRC1 src1{ .HEX = block.instructions[2].bytecode[2] }; + + const u32 opcode = dst.opcode | (src1.opcode_hi << 6); + + // R0.z = OR16_HI(R0.z, H1.y); + EXPECT_EQ(opcode, RSX_FP_OPCODE_OR16_HI); + EXPECT_EQ(dst.fp16, 0); + EXPECT_EQ(dst.dest_reg, 0); + EXPECT_EQ(dst.mask_x, false); + EXPECT_EQ(dst.mask_y, false); + EXPECT_EQ(dst.mask_z, true); + EXPECT_EQ(dst.mask_w, false); + EXPECT_EQ(src0.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(src0.tmp_reg_index, 0); + EXPECT_EQ(src0.fp16, 0); + EXPECT_EQ(src0.swizzle_x, 2); + EXPECT_EQ(src1.reg_type, RSX_FP_REGISTER_TYPE_TEMP); + EXPECT_EQ(src1.tmp_reg_index, 1); + EXPECT_EQ(src1.fp16, 1); + EXPECT_EQ(src1.swizzle_x, 1); + } }