PPCAsm: Reloc modifiers should behave more like LLVM/gnu assembler

This commit is contained in:
Exzap 2026-04-25 23:38:52 +02:00
parent b2a735e0e2
commit d92edefafb
4 changed files with 131 additions and 494 deletions

View File

@ -49,30 +49,6 @@ bool GraphicPack2::ResolvePresetConstant(const std::string& varname, double& val
return false;
}
template<typename T>
T _expressionFuncHA(T input)
{
uint32 u32 = (uint32)input;
u32 = (((u32 >> 16) + ((u32 & 0x8000) ? 1 : 0)) & 0xffff);
return (T)u32;
}
template<typename T>
T _expressionFuncHI(T input)
{
uint32 u32 = (uint32)input;
u32 = (u32 >> 16) & 0xffff;
return (T)u32;
}
template<typename T>
T _expressionFuncLO(T input)
{
uint32 u32 = (uint32)input;
u32 &= 0xffff;
return (T)u32;
}
template<typename T>
T _expressionFuncReloc(T input)
{
@ -89,18 +65,7 @@ T _expressionFuncReloc(T input)
double _cbResolveConstant(std::string_view varname)
{
std::string varnameOnly;
std::string tokenOnly;
// detect suffix
bool hasSuffix = false;
const auto idx = varname.find('@');
if (idx != std::string_view::npos)
{
hasSuffix = true;
varnameOnly = varname.substr(0, idx);
tokenOnly = varname.substr(idx + 1);
}
else
varnameOnly = varname;
varnameOnly = varname;
double value;
if (varnameOnly.length() >= 1 && varnameOnly[0] == '$')
@ -172,37 +137,6 @@ double _cbResolveConstant(std::string_view varname)
}
value = v->second;
}
if (hasSuffix)
{
std::transform(tokenOnly.cbegin(), tokenOnly.cend(), tokenOnly.begin(), tolower);
if (tokenOnly == "ha")
{
value = _expressionFuncHA<double>(value);
}
else if (tokenOnly == "h" || tokenOnly == "hi")
{
value = _expressionFuncHI<double>(value);
}
else if (tokenOnly == "l" || tokenOnly == "lo")
{
value = _expressionFuncLO<double>(value);
}
else
{
// we treat unknown suffixes as unresolveable symbols
resolverState.hasUnknownVariable = true;
if (resolverState.captureUnresolvedSymbols)
{
std::string detailedSymbolName;
detailedSymbolName.assign(varnameOnly);
detailedSymbolName.append("@");
detailedSymbolName.append(tokenOnly);
detailedSymbolName.append(" (invalid suffix)");
resolverState.activePatchContext->unresolvedSymbols.emplace(resolverState.lineNumber, resolverState.currentGroup, detailedSymbolName);
}
return 0.0;
}
}
return value;
}
@ -212,11 +146,11 @@ double _cbResolveFunction(std::string_view funcname, double input)
std::transform(funcnameLC.cbegin(), funcnameLC.cend(), funcnameLC.begin(), tolower);
double value = input;
if (funcnameLC == "ha" || funcnameLC == "ha16")
value = _expressionFuncHA<double>(value);
value = TExpressionParser<double>::ExpressionFuncHA(value);
else if (funcnameLC == "hi" || funcnameLC == "hi16")
value = _expressionFuncHI<double>(value);
value = TExpressionParser<double>::ExpressionFuncHI(value);
else if (funcnameLC == "lo" || funcnameLC == "lo16")
value = _expressionFuncLO<double>(value);
value = TExpressionParser<double>::ExpressionFuncLO(value);
else if (funcnameLC == "reloc")
value = _expressionFuncReloc<double>(value);
else

View File

@ -27,7 +27,7 @@ class IntervalTree3
struct TreeNode
{
TRangeData values[NUM_SLOTS];
sint32 indices[NUM_SLOTS]; // for the second to last layer these are indices into value nodes vector. Otherwise its a relative offset to a tree node
sint32 indices[NUM_SLOTS]; // for the second to last layer these are indices into value nodes vector. Otherwise its a relative byte offset to a tree node
uint32 selfIndex{ INVALID_NODE_INDEX };
uint32 parentNodeIndex{ INVALID_NODE_INDEX };
uint8 parentSlot{ 0 };
@ -73,19 +73,17 @@ public:
void GetOverlappingRanges(TRangeData rangeBegin, TRangeData rangeEnd, std::vector<TNodeObject*>& results)//getRange(TRangeData rangeBegin, TRangeData rangeEnd)
{
results.clear();
cemu_assert_debug(rangeBegin < rangeEnd);
if (IsEmpty())
return;
results.clear();
if (rangeBegin >= rangeEnd)
return;
const ValueNode* valueNode = FindFloorValueNode(rangeBegin);
if (!valueNode)
valueNode = GetLeftmostValue();
if (rangeBegin < valueNode->rangeEnd && rangeEnd > valueNode->rangeBegin)
{
results.emplace_back(valueNode->ptr);
}
valueNode = GetSuccessorValue(valueNode);
while (valueNode != nullptr)
{
@ -95,18 +93,6 @@ public:
results.emplace_back(valueNode->ptr);
valueNode = GetSuccessorValue(valueNode);
}
// DEBUG - verify result by calculating it the slow way and comparing
// std::vector<TNodeObject*> manualTestList;
// std::map<TRangeData, TNodeObject*> manualTestListMap;
// for (auto& valNode : m_valueNodes)
// {
// if (valNode.rangeBegin < rangeEnd && valNode.rangeEnd > rangeBegin)
// manualTestListMap.emplace(valNode.rangeBegin, valNode.ptr);
// }
// for (auto& it : manualTestListMap)
// manualTestList.emplace_back(it.second);
//
// cemu_assert_debug(manualTestList == results);
}
// will assert if no exact match was found
@ -117,12 +103,6 @@ public:
cemu_assert(valueNode->rangeBegin == rangeBegin && valueNode->rangeEnd == rangeEnd);
TreeNode* parentNode = &m_treeNodes[valueNode->parentNodeIndex];
sint32 parentSlot = valueNode->parentSlot;
// [DEBUG] Validate tree
// {
// TRangeData treeMinVal, treeMaxVal;
// ValidateTree(GetRootNode(), m_treeDepth, treeMinVal, treeMaxVal);
// }
// remove value from parent
ReduceUsedCount(parentNode, 1);
for (sint32 i=parentSlot; i<parentNode->usedCount; i++)
@ -131,33 +111,15 @@ public:
PropagateMinValue(parentNode);
// release value
ReleaseValueNode(valueNode->selfIndex);
// if (parentNode->usedCount > 0)
// {
// // [DEBUG] Validate tree
// TRangeData treeMinVal, treeMaxVal;
// ValidateTree(GetRootNode(), m_treeDepth, treeMinVal, treeMaxVal);
// }
// static int dbgCounterA = 0;
// dbgCounterA++;
// if (dbgCounterA == 0x4d2f)
// __debugbreak();
// if parent node now has few nodes then merge/redistribute it
CollapseNode(parentNode, m_treeDepth-1);
sint32 dbg_prevTreeDepth = m_treeDepth;
ShortenTreeIfPossible();
// // [DEBUG] Validate tree
// TRangeData treeMinVal, treeMaxVal;
// ValidateTree(GetRootNode(), m_treeDepth, treeMinVal, treeMaxVal);
}
FORCE_INLINE TreeNode* GetTreeNodeChild(TreeNode* treeNode, sint32 slot)
{
cemu_assert_debug(slot >= 0 && slot < NUM_SLOTS);
//return &m_treeNodes[treeNode->indices[slot]];
// return treeNode + (treeNode->indices[slot] / (sint32)sizeof(TreeNode));
char* base = reinterpret_cast<char*>(treeNode);
return reinterpret_cast<TreeNode*>(base + treeNode->indices[slot]);
}
@ -165,13 +127,12 @@ public:
void SetChildNode(TreeNode& parent, sint32 slot, TreeNode* child)
{
cemu_assert_debug(slot >= 0 && slot < NUM_SLOTS);
cemu_assert_debug(child->usedCount > 0); // cant know value if node has no children
cemu_assert_debug(child->usedCount > 0); // cant determine value if node has no children
uint32 childIndex = child->selfIndex;
uint32 parentIndex = parent.selfIndex;
child->parentNodeIndex = parentIndex;
child->parentSlot = slot;
parent.values[slot] = child->values[0];
//parent.indices[slot] = childIndex;
parent.indices[slot] = (sint32)(child - &parent) * (sint32)sizeof(TreeNode);
}
@ -188,14 +149,6 @@ public:
void AddRange(TRangeData rangeBegin, TRangeData rangeEnd, TNodeObject* nodeObject)
{
// static uint32 addRangeDotCounter = 0;
// addRangeDotCounter++;
// if (addRangeDotCounter >= 500)
// {
// addRangeDotCounter = 0;
// WriteDotFile("IntervalTree3.dot");
// }
ReserveNodes();
cemu_assert_debug(rangeBegin < rangeEnd);
if (IsEmpty()) [[unlikely]]
@ -207,9 +160,6 @@ public:
rootNode.usedCount = 1;
SetChildNode(rootNode, 0, AllocateValueNode(rangeBegin, rangeEnd, nodeObject));
m_treeDepth = 1;
// [DEBUG] Validate tree
//TRangeData treeMinVal, treeMaxVal;
//ValidateTree(GetRootNode(), m_treeDepth, treeMinVal, treeMaxVal);
return;
}
// find the preceding value, we insert after it in the same parent
@ -239,16 +189,10 @@ public:
SetChildNode(*insertNode, insertSlotIndex, AllocateValueNode(rangeBegin, rangeEnd, nodeObject));
if (insertSlotIndex == 0)
PropagateMinValue(insertNode);
// [DEBUG] Validate tree
//TRangeData treeMinVal, treeMaxVal;
//ValidateTree(GetRootNode(), m_treeDepth, treeMinVal, treeMaxVal);
return;
}
// split is necessary
InsertNode(*insertNode, nullptr, &AllocateValueNode(rangeBegin, rangeEnd, nodeObject));
// [DEBUG] Validate tree
// TRangeData treeMinVal, treeMaxVal;
// ValidateTree(GetRootNode(), m_treeDepth, treeMinVal, treeMaxVal);
}
void WriteDotFile(const char* filePath)
@ -256,7 +200,6 @@ public:
std::ofstream outFile(filePath, std::ios::trunc);
if (!outFile.is_open())
return;
outFile << "digraph IntervalTree3 {\n";
outFile << " rankdir=TB;\n";
outFile << " splines=polyline;\n";
@ -277,7 +220,6 @@ public:
{
minChildValue = std::numeric_limits<TRangeData>::max();
maxChildValue = std::numeric_limits<TRangeData>::min();
// basic validation
cemu_assert(treeNode.usedCount > 0); // empty notes are not allowed
for (uint32 i=0; i<treeNode.usedCount-1; i++)
@ -333,9 +275,6 @@ public:
cemu_assert_debug(rangeBegin > nodeToInsertInto.values[0]); // if this is not true we are not allowed to move the child
MigrateSubnodesFromRight(&nodeToInsertInto, leftNeighbor, 1, !treeNode);
cemu_assert_debug(rangeBegin > leftNeighbor->values[leftNeighbor->usedCount-1]);
// [DEBUG] VALIDATE TREE
// TRangeData treeMinVal, treeMaxVal;
// ValidateTree(GetRootNode(), m_treeDepth, treeMinVal, treeMaxVal);
}
else
{
@ -384,7 +323,7 @@ public:
// target node is full and we need to split
if (nodeToInsertInto.parentNodeIndex == INVALID_NODE_INDEX)
{
// we do this by creating a new node between root and the current node and move all of root's children there
// splitting root requires creating a new tree node inbetween and increasing tree depth
// then we can split the new node like usual
TreeNode& newTreeNode = AllocateTreeNode(INVALID_NODE_INDEX, 0);
cemu_assert_debug(nodeToInsertInto.usedCount == NUM_SLOTS);
@ -403,7 +342,7 @@ public:
ReduceUsedCount(&nodeToInsertInto, nodeToInsertInto.usedCount - 1); // target count 1
SetChildNode(nodeToInsertInto, 0, &newTreeNode);
PropagateMinValue(&newTreeNode);
m_treeDepth++; // tree length increased
m_treeDepth++;
// try insert again but into the new non-root that can be split
InsertNode(newTreeNode, treeNode, valueNode);
return;
@ -526,7 +465,6 @@ private:
void ReserveNodes()
{
// this function guarantees a minimum of available amount of tree and value nodes in the pool, enough for the needs of AddRange()
// additionally, to avoid having to resize many times we allocate new elements in larger chunks
if (m_freeTreeNodeIndices.size() < 16)
{
uint32 curNodeCount = (uint32)m_treeNodes.size();
@ -969,68 +907,9 @@ private:
std::vector<uint32> m_freeTreeNodeIndices;
std::vector<ValueNode> m_valueNodes;
std::vector<uint32> m_freeValueNodeIndices;
sint32 m_treeDepth{0};
};
template<typename TRangeData, typename TNodeObject>
class IntervalTree3Map
{
public:
TNodeObject* GetRange(TRangeData rangeBegin)
{
auto itr = m_ranges.find(rangeBegin);
if (itr == m_ranges.cend())
return nullptr;
return itr->second;
}
void GetOverlappingRanges(TRangeData rangeBegin, TRangeData rangeEnd, std::vector<TNodeObject*>& results)
{
cemu_assert_debug(rangeBegin < rangeEnd);
results.clear();
if (m_ranges.empty() || rangeBegin >= rangeEnd)
return;
auto itr = m_ranges.lower_bound(rangeBegin);
while (itr != m_ranges.cend() && itr->first < rangeEnd)
{
results.emplace_back(itr->second);
++itr;
}
}
void AddRange(TRangeData rangeBegin, TRangeData rangeEnd, TNodeObject* nodeObject)
{
cemu_assert_debug(rangeBegin < rangeEnd);
cemu_assert_debug(rangeEnd == (rangeBegin + 1));
auto insertResult = m_ranges.emplace(rangeBegin, nodeObject);
cemu_assert(insertResult.second);
}
void RemoveRange(TRangeData rangeBegin, TRangeData rangeEnd)
{
cemu_assert_debug(rangeBegin < rangeEnd);
cemu_assert_debug(rangeEnd == (rangeBegin + 1));
auto itr = m_ranges.find(rangeBegin);
cemu_assert(itr != m_ranges.cend());
m_ranges.erase(itr);
}
bool IsEmpty() const
{
return m_ranges.empty();
}
void PrintStats()
{
cemuLog_log(LogType::Force, "--- IntervalTree3Map info ---");
cemuLog_log(LogType::Force, "NumValues: {}", m_ranges.size());
}
private:
std::map<TRangeData, TNodeObject*> m_ranges;
};
#endif
#ifndef NEW_INTERVAL_TREE
@ -1938,11 +1817,6 @@ public:
std::vector<uint32> BufferCacheNode::g_deallocateQueue;
static int dbg_sampleCounter = 0;
static sint64 dbg_totalSampleCount = 0;
static double dbg_totalSampleTime = 0;
static HRTick dbg_sampleTimeDiff = 0;
#ifdef NEW_INTERVAL_TREE
IntervalTree3<MPTR, BufferCacheNode> g_gpuBufferCache3;
std::vector<BufferCacheNode*> s_gpuCacheQueryResult; // keep vector around to reduce runtime allocations
@ -1956,37 +1830,15 @@ void LatteBufferCache_removeSingleNodeFromTree(BufferCacheNode* node)
#endif
}
__declspec(noinline) BufferCacheNode* LatteBufferCache_reserveRange(MPTR physAddress, uint32 size)
BufferCacheNode* LatteBufferCache_reserveRange(MPTR physAddress, uint32 size)
{
MPTR rangeStart = physAddress - (physAddress % CACHE_PAGE_SIZE);
MPTR rangeEnd = (physAddress + size + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
HRTick startTick = HighResolutionTimer::now().getTick();
BufferCacheNode* range = g_gpuBufferCache3.GetRange(physAddress);
HRTick endTick = HighResolutionTimer::now().getTick();
dbg_sampleTimeDiff += (endTick - startTick);
dbg_sampleCounter++;
if (dbg_sampleCounter > 100)
{
dbg_totalSampleCount += 100;
dbg_totalSampleTime += HighResolutionTimer::getTimeDiff(0, dbg_sampleTimeDiff);
dbg_sampleTimeDiff = 0;
dbg_sampleCounter = 0;
if ( (dbg_totalSampleCount%500000) == 0 )
{
cemuLog_log(LogType::Force, "---- Cache sample print ----");
cemuLog_log(LogType::Force, "TotalSamples: {}", dbg_totalSampleCount);
cemuLog_log(LogType::Force, "TotalSampleTime: {}", dbg_totalSampleTime);
cemuLog_log(LogType::Force, "Avg(microseconds): {}", dbg_totalSampleTime / (double)dbg_totalSampleCount * 1000000.0);
g_gpuBufferCache3.PrintStats();
}
}
if (range && physAddress >= range->GetRangeBegin() && (physAddress+size) <= range->GetRangeEnd())
{
return range;
}
// no containing range found, we need to create a range and potentially merge any overlapping ones
// no containing range found, we need to create a range and potentially merge with any overlapping ranges
g_gpuBufferCache3.GetOverlappingRanges(rangeStart, rangeEnd, s_gpuCacheQueryResult);
if (s_gpuCacheQueryResult.empty())
{
@ -2010,27 +1862,6 @@ __declspec(noinline) BufferCacheNode* LatteBufferCache_reserveRange(MPTR physAdd
g_gpuBufferCache3.AddRange(mergedRangeStart, mergedRangeEnd, newRange);
return newRange;
}
// algorithm goes like this:
// 0) Try to fetch existing range and exit early if we find one
// 1) Get all overlapping nodes
// 2) If there are none then just add a new range
// 3) Otherwise calculate a new merged range size and merge data into it
// 4) Then remove the ranges and add the new one
return nullptr;
// auto range = g_gpuBufferCache.getRange(rangeStart, rangeEnd);
// if (!range)
// {
// g_gpuBufferCache.addRange(rangeStart, rangeEnd);
// range = g_gpuBufferCache.getRange(rangeStart, rangeEnd);
// cemu_assert_debug(range);
// }
// cemu_assert_debug(range->GetRangeBegin() <= physAddress);
// cemu_assert_debug(range->GetRangeEnd() >= (physAddress + size));
// return range;
}
#else
@ -2046,28 +1877,7 @@ BufferCacheNode* LatteBufferCache_reserveRange(MPTR physAddress, uint32 size)
MPTR rangeStart = physAddress - (physAddress % CACHE_PAGE_SIZE);
MPTR rangeEnd = (physAddress + size + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
HRTick startTick = HighResolutionTimer::now().getTick();
//BufferCacheNode* range = g_gpuBufferCache3.GetRange(physAddress);
auto range = g_gpuBufferCache.getRange(rangeStart, rangeEnd);
HRTick endTick = HighResolutionTimer::now().getTick();
dbg_sampleTimeDiff += (endTick - startTick);
dbg_sampleCounter++;
if (dbg_sampleCounter > 100)
{
dbg_totalSampleCount += 100;
dbg_totalSampleTime += HighResolutionTimer::getTimeDiff(0, dbg_sampleTimeDiff);
dbg_sampleTimeDiff = 0;
dbg_sampleCounter = 0;
if ( (dbg_totalSampleCount%100000) == 0 )
{
cemuLog_log(LogType::Force, "---- Cache sample print ----");
cemuLog_log(LogType::Force, "TotalSamples: {}", dbg_totalSampleCount);
cemuLog_log(LogType::Force, "TotalSampleTime: {}", dbg_totalSampleTime);
cemuLog_log(LogType::Force, "Avg(microseconds): {}", dbg_totalSampleTime / (double)dbg_totalSampleCount * 1000000.0);
}
}
BufferCacheNode* range = g_gpuBufferCache.getRange(rangeStart, rangeEnd);
if (!range)
{
g_gpuBufferCache.addRange(rangeStart, rangeEnd);
@ -2150,8 +1960,6 @@ void LatteBufferCache_processDeallocations()
BufferCacheNode::ProcessDeallocations();
}
void LatteBufferCache_profileIntervalTree3();
void LatteBufferCache_init(size_t bufferSize)
{
#ifdef NEW_INTERVAL_TREE
@ -2161,10 +1969,6 @@ void LatteBufferCache_init(size_t bufferSize)
#endif
g_gpuBufferHeap.reset(new VHeap(nullptr, (uint32)bufferSize));
g_renderer->bufferCache_init((uint32)bufferSize);
// DEBUG
//LatteBufferCache_profileIntervalTree3();
//exit(0);
}
void LatteBufferCache_UnloadAll()
@ -2276,13 +2080,6 @@ void LatteBufferCache_notifySwapTVScanBuffer()
void LatteBufferCache_incrementalCleanup()
{
if (GetAsyncKeyState('A'))
{
g_gpuBufferCache3.PrintStats();
g_gpuBufferCache3.WriteDotFile("BotWTree.dot");
Sleep(2000);
}
static uint32 s_counter = 0;
if (s_allCacheNodes.empty())
@ -2358,212 +2155,3 @@ void LatteBufferCache_incrementalCleanup()
}
}
}
// ****************************************** Profiling code **********************************************/
void LatteBufferCache_profileIntervalTree3()
{
#ifdef NEW_INTERVAL_TREE
IntervalTree3<MPTR, void> profileTree;
//IntervalTree3Map<MPTR, void> profileTree;
void* dummyObject = reinterpret_cast<void*>(1);
// static constexpr sint32 kInsertPhaseCount[3] = { 12000, 12000, 10000 }; // total 34000
// static constexpr sint32 kTotalInsertCount = 34000;
// static constexpr sint32 kRemovePhaseCount[3] = { 8000, 8000, 8000 };
static constexpr sint32 kInsertPhaseCount[3] = { 400, 400, 200 }; // total 1000
static constexpr sint32 kTotalInsertCount = 1000;
static constexpr sint32 kRemovePhaseCount[3] = { 300, 300, 300 };
static constexpr sint32 kGetRangeQueryCountTotal = 1000000;
static constexpr sint32 kGetRangeHitQueryCount = kGetRangeQueryCountTotal / 2;
static constexpr sint32 kGetRangeMissQueryCount = kGetRangeQueryCountTotal - kGetRangeHitQueryCount;
static constexpr sint32 kOverlapQueryCount = 300000;
static constexpr uint32 kPointDomain = 65536; // fixed unique point pool
auto logPhase = [](const char* label, sint32 phaseIndex, sint32 opCount, double elapsedMs)
{
double mops = 0.0;
double nsPerOp = 0.0;
if (elapsedMs > 0.0 && opCount > 0)
{
mops = static_cast<double>(opCount) / (elapsedMs * 1000.0);
nsPerOp = (elapsedMs * 1000000.0) / static_cast<double>(opCount);
}
cemuLog_log(LogType::Force,
"[IntervalTree3 profile] {} {}: {} ops in {:.3f} ms ({:.3f} Mops/s, {:.1f} ns/op)",
label, phaseIndex + 1, opCount, elapsedMs, mops, nsPerOp);
};
auto logSingle = [](const char* label, sint32 opCount, double elapsedMs)
{
double mops = 0.0;
double nsPerOp = 0.0;
if (elapsedMs > 0.0 && opCount > 0)
{
mops = static_cast<double>(opCount) / (elapsedMs * 1000.0);
nsPerOp = (elapsedMs * 1000000.0) / static_cast<double>(opCount);
}
cemuLog_log(LogType::Force,
"[IntervalTree3 profile] {}: {} ops in {:.3f} ms ({:.3f} Mops/s, {:.1f} ns/op)",
label, opCount, elapsedMs, mops, nsPerOp);
};
uint32 randomState = 0xC001D00Du;
auto nextRandom = [&]() -> uint32
{
// xorshift32 (deterministic, cheap)
randomState ^= (randomState << 13);
randomState ^= (randomState >> 17);
randomState ^= (randomState << 5);
return randomState;
};
cemu_assert_debug(kTotalInsertCount <= static_cast<sint32>(kPointDomain));
std::vector<MPTR> pointPool;
pointPool.reserve(kPointDomain);
for (uint32 i = 0; i < kPointDomain; i++)
pointPool.emplace_back(static_cast<MPTR>(i));
for (sint32 i = static_cast<sint32>(pointPool.size()) - 1; i > 0; i--)
{
sint32 j = static_cast<sint32>(nextRandom() % static_cast<uint32>(i + 1));
MPTR tmpPoint = pointPool[i];
pointPool[i] = pointPool[j];
pointPool[j] = tmpPoint;
}
std::vector<MPTR> activePoints;
activePoints.reserve(kTotalInsertCount);
size_t pointPoolIndex = 0;
cemuLog_log(LogType::Force, "[IntervalTree3 profile] Starting profile run");
// insert phases
for (sint32 phase = 0; phase < 3; phase++)
{
const sint32 insertCount = kInsertPhaseCount[phase];
BenchmarkTimer timer;
timer.Start();
for (sint32 i = 0; i < insertCount; i++)
{
MPTR point = pointPool[pointPoolIndex];
pointPoolIndex++;
profileTree.AddRange(point, point + 1, dummyObject);
activePoints.emplace_back(point);
}
timer.Stop();
logPhase("Insert phase", phase, insertCount, timer.GetElapsedMilliseconds());
}
cemuLog_log(LogType::Force, "[IntervalTree3 profile] Active points after inserts: {}", activePoints.size());
cemu_assert_debug(pointPoolIndex == activePoints.size());
profileTree.PrintStats();
// GetRange() benchmark (existing points)
{
sint32 hitCount = 0;
BenchmarkTimer timer;
timer.Start();
for (sint32 repeat=0; repeat<100; repeat++)
{
for (sint32 i = 0; i < kGetRangeHitQueryCount; i++)
{
size_t idx = static_cast<size_t>(nextRandom() % static_cast<uint32>(activePoints.size()));
MPTR point = activePoints[idx];
if (profileTree.GetRange(point) != nullptr)
hitCount++;
}
}
timer.Stop();
double elapsedMs = timer.GetElapsedMilliseconds();
logSingle("GetRange hit phase", kGetRangeHitQueryCount, elapsedMs);
cemuLog_log(LogType::Force,
"[IntervalTree3 profile] GetRange hit phase: hits {} / {} ({:.2f}%)",
hitCount, kGetRangeHitQueryCount, (static_cast<double>(hitCount) * 100.0) / static_cast<double>(kGetRangeHitQueryCount));
}
// GetRange() benchmark (points guaranteed to be absent)
{
cemu_assert_debug(pointPoolIndex < pointPool.size());
size_t missPoolStart = pointPoolIndex;
size_t missPoolCount = pointPool.size() - missPoolStart;
sint32 missCount = 0;
BenchmarkTimer timer;
timer.Start();
for (sint32 i = 0; i < kGetRangeMissQueryCount; i++)
{
size_t idx = missPoolStart + (nextRandom() % static_cast<uint32>(missPoolCount));
MPTR point = pointPool[idx];
if (profileTree.GetRange(point) == nullptr)
missCount++;
}
timer.Stop();
double elapsedMs = timer.GetElapsedMilliseconds();
logSingle("GetRange miss phase", kGetRangeMissQueryCount, elapsedMs);
cemuLog_log(LogType::Force,
"[IntervalTree3 profile] GetRange miss phase: misses {} / {} ({:.2f}%)",
missCount, kGetRangeMissQueryCount, (static_cast<double>(missCount) * 100.0) / static_cast<double>(kGetRangeMissQueryCount));
}
// overlap query phase (separate)
{
std::vector<void*> overlapResults;
overlapResults.reserve(128);
uint64 overlapHitSum = 0;
BenchmarkTimer timer;
timer.Start();
for (sint32 i = 0; i < kOverlapQueryCount; i++)
{
MPTR begin = static_cast<MPTR>(nextRandom() % kPointDomain);
MPTR rangeLen = static_cast<MPTR>((nextRandom() & 63u) + 1u); // 1..64
MPTR end = begin + rangeLen;
profileTree.GetOverlappingRanges(begin, end, overlapResults);
overlapHitSum += static_cast<uint64>(overlapResults.size());
}
timer.Stop();
const double elapsedMs = timer.GetElapsedMilliseconds();
double mops = 0.0;
double nsPerOp = 0.0;
if (elapsedMs > 0.0)
{
mops = static_cast<double>(kOverlapQueryCount) / (elapsedMs * 1000.0);
nsPerOp = (elapsedMs * 1000000.0) / static_cast<double>(kOverlapQueryCount);
}
const double avgHits = static_cast<double>(overlapHitSum) / static_cast<double>(kOverlapQueryCount);
cemuLog_log(LogType::Force,
"[IntervalTree3 profile] Overlap query phase: {} ops in {:.3f} ms ({:.3f} Mops/s, {:.1f} ns/op, avg hits {:.2f})",
kOverlapQueryCount, elapsedMs, mops, nsPerOp, avgHits);
}
// remove phases (random removals)
for (sint32 phase = 0; phase < 3; phase++)
{
sint32 removeCount = kRemovePhaseCount[phase];
if (removeCount > static_cast<sint32>(activePoints.size()))
removeCount = static_cast<sint32>(activePoints.size());
BenchmarkTimer timer;
timer.Start();
for (sint32 i = 0; i < removeCount; i++)
{
size_t idx = static_cast<size_t>(nextRandom() % static_cast<uint32>(activePoints.size()));
MPTR point = activePoints[idx];
profileTree.RemoveRange(point, point + 1);
activePoints[idx] = activePoints.back();
activePoints.pop_back();
}
timer.Stop();
logPhase("Remove phase", phase, removeCount, timer.GetElapsedMilliseconds());
}
cemuLog_log(LogType::Force, "[IntervalTree3 profile] Active points after removals: {}", activePoints.size());
#else
cemuLog_log(LogType::Force, "[IntervalTree3 profile] NEW_INTERVAL_TREE is disabled");
#endif
}

View File

@ -43,4 +43,16 @@ void ExpressionParser_test()
cemu_assert_debug(_testEvaluateToType<float>("5 > 4 > 3 > 2") == 0.0f); // this should evaluate the operations from left to right, (5 > 4) -> 0.0, (0.0 > 4) -> 0.0, (0.0 > 3) -> 0.0, (0.0 > 2) -> 0.0
cemu_assert_debug(_testEvaluateToType<float>("5 > 4 > 3 > -2") == 1.0f); // this should evaluate the operations from left to right, (5 > 4) -> 0.0, (0.0 > 4) -> 0.0, (0.0 > 3) -> 0.0, (0.0 > -2) -> 1.0
cemu_assert_debug(_testEvaluateToType<float>("(5 == 5) > (5 == 6)") == 1.0f);
// reloc modifier behavior
ep = {};
ep.AddConstant("test", 5.0);
cemu_assert_debug(ep.IsValidExpression("test@ha") == true);
cemu_assert_debug(ep.IsValidExpression("test+15@lo") == true);
cemu_assert_debug(ep.IsValidExpression("test@ha + test@ha") == true); // technically not legal but we allow it for backwards compatibility (BotW extended memory pack relies on this)
cemu_assert_debug(ep.IsValidExpression("test@ha + test@lo") == false); // mixed modifier not allowed
cemu_assert_debug(ep.Evaluate("test+15@lo") == 20.0f);
cemu_assert_debug(ep.Evaluate("test+15@hi") == 0.0f);
}

View File

@ -29,11 +29,43 @@ inline std::from_chars_result _convFastFloatResult(fast_float::from_chars_result
template<class TType = double>
class TExpressionParser
{
// starting with Cemu 2.7 reloc modifiers (like @ha or @l) have been changed to match LLVM/GAS behavior
// they may only be used once in an expression and get applied to the final result of the expression, regardless of their location
// E.g. "var@ha + 0x20" was intepreted as ha(var) + 0x20 before, now it's interpreted as ha(var+0x20)
enum class RelocModifier
{
None = 0,
High, // @hi @h
HighArithmetic, // @ha
Low, // @lo @l
};
public:
static_assert(std::is_arithmetic_v<TType>);
using ConstantCallback_t = TType(*)(std::string_view var_name);
using FunctionCallback_t = TType(*)(std::string_view var_name, TType parameter);
static TType ExpressionFuncHA(TType input)
{
uint32 addr = (uint32)input;
addr = (((addr >> 16) + ((addr & 0x8000) ? 1 : 0)) & 0xffff);
return (TType)addr;
}
static TType ExpressionFuncHI(TType input)
{
uint32 addr = (uint32)input;
addr = (addr >> 16) & 0xffff;
return (TType)addr;
}
static TType ExpressionFuncLO(TType input)
{
uint32 addr = (uint32)input;
addr &= 0xffff;
return (TType)addr;
}
template<typename T>
T Evaluate(std::string_view expression) const
{
@ -45,6 +77,7 @@ public:
{
std::queue<std::shared_ptr<TokenBase>> output;
std::stack<std::shared_ptr<TokenBase>> operators;
RelocModifier relocModifier = RelocModifier::None;
if (expression.empty())
{
@ -73,11 +106,11 @@ public:
auto converted = (TType)ConvertString(view, &offset);
output.emplace(std::make_shared<TokenNumber>(converted));
i += offset;
last_operator_token = false;
// check for relocation modifier suffix (e.g. 74@ha)
i += ParseRelocModifier(expression.substr(i), relocModifier, false); // can throw
continue;
}
// check for variables
if (isalpha(c) || c == '_' || c == '$')
{
@ -92,8 +125,14 @@ public:
}
const size_t len = j - i;
const std::string_view view = expression.substr(i, len);
std::string_view view = expression.substr(i, len);
// check for relocation modifier
if (auto suffixPos = view.find_last_of('@'); suffixPos != std::string::npos)
{
sint32 modifierLength = ParseRelocModifier(view.substr(suffixPos), relocModifier, true);
view.remove_suffix(modifierLength);
}
// check for function
if (m_function_callback)
{
@ -358,6 +397,13 @@ public:
}
}
if (relocModifier == RelocModifier::High)
evaluation.top() = ExpressionFuncHI(evaluation.top());
else if (relocModifier == RelocModifier::HighArithmetic)
evaluation.top() = ExpressionFuncHA(evaluation.top());
else if (relocModifier == RelocModifier::Low)
evaluation.top() = ExpressionFuncLO(evaluation.top());
return evaluation.top();
}
@ -418,7 +464,6 @@ public:
{
m_function_callback = callback;
}
private:
std::unordered_map<std::string, TType> m_constants;
ConstantCallback_t m_constant_callback = nullptr;
@ -429,6 +474,64 @@ private:
return str.find('.') != std::string_view::npos;
}
// update current relocation modifier for the expression
void SetRelocationModifier(RelocModifier& relocModifier, RelocModifier newModifier) const
{
if (relocModifier == newModifier)
return;
// catch mismatching relocation modifiers (e.g. sym@ha + sym2@lo)
if (relocModifier != RelocModifier::None)
throw std::runtime_error(fmt::format("Mismatching relocation modifiers (suffix @..) in expression"));
relocModifier = newModifier;
}
// parse modifiers like @ha, @l. Throws on parse error (unless ignored). Returns length of parsed suffix including @ symbol
sint32 ParseRelocModifier(std::string_view str, RelocModifier& relocModifier, bool ignoreParseError) const
{
auto origStr = str;
if (str.empty() || str[0] != '@')
return 0;
// skip the @
str.remove_prefix(1);
if (str.empty())
return 0;
char c0 = std::tolower(str[0]);
// check for two character modifiers: lo, ha, hi
if (str.size() >= 2)
{
char c1 = std::tolower(str[1]);
if (c0 == 'h' && c1 == 'a')
{
SetRelocationModifier(relocModifier, RelocModifier::HighArithmetic);
return 3;
}
else if (c0 == 'h' && c1 == 'i')
{
SetRelocationModifier(relocModifier, RelocModifier::High);
return 3;
}
else if (c0 == 'l' && c1 == 'o')
{
SetRelocationModifier(relocModifier, RelocModifier::Low);
return 3;
}
}
// check for single character modifiers
if (c0 == 'h')
{
SetRelocationModifier(relocModifier, RelocModifier::High);
return 2;
}
else if (c0 == 'l')
{
SetRelocationModifier(relocModifier, RelocModifier::Low);
return 2;
}
if (!ignoreParseError)
throw std::runtime_error(fmt::format("Unknown relocation modifier (only @lo, @hi, @ha, @l, @h are supported) at: {}", origStr));
return 0;
}
double ConvertString(std::string_view str, size_t* index_after = nullptr) const
{
const char* strInitial = str.data();