Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert pred cache #4265

Merged
merged 2 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion FEXCore/Scripts/json_ir_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def parse_ops(ops):
(OpArg.Type == "GPR" or
OpArg.Type == "GPRPair" or
OpArg.Type == "FPR" or
OpArg.Type == "PRED")):
OpArg.Type == "PR")):
OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class")

OpArg.Name = ArgName
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
Ref MemSrc = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
// Using SVE we can load this with a single instruction.
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc);
} else {
// For X87 extended doubles, split the load.
Expand Down Expand Up @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
if (OpSize == OpSize::f80Bit) {
Ref MemStoreDst = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst);
} else {
// For X87 extended doubles, split before storing
Expand Down
3 changes: 0 additions & 3 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,6 @@ class OpDispatchBuilder final : public IREmitter {

// Need to clear any named constants that were cached.
ClearCachedNamedConstants();

// Clear the predicate cache used by the x87 load/store paths
ResetInitPredicateCache();
}

IRPair<IROp_Jump> Jump() {
Expand Down
1 change: 0 additions & 1 deletion FEXCore/Source/Interface/IR/IREmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) {
case FPRClass:
case GPRFixedClass:
case FPRFixedClass:
case PREDClass:
case InvalidClass: return Class;
default: break;
}
Expand Down
34 changes: 1 addition & 33 deletions FEXCore/Source/Interface/IR/IREmitter.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: MIT
#pragma once

#include "CodeEmitter/Emitter.h"
#include "Interface/IR/IR.h"
#include "Interface/IR/IntrusiveIRList.h"

Expand All @@ -10,9 +9,9 @@

#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/fextl/vector.h>
#include <FEXCore/fextl/unordered_map.h>

#include <algorithm>
#include <new>
#include <stdint.h>
#include <string.h>

Expand Down Expand Up @@ -46,37 +45,6 @@ class IREmitter {
}
void ResetWorkingList();

// Predicate Cache Implementation
// This lives here rather than in OpcodeDispatcher because the X87StackOptimization
// pass also needs it.
struct PredicateKey {
ARMEmitter::PredicatePattern Pattern;
OpSize Size;
bool operator==(const PredicateKey& rhs) const = default;
};

struct PredicateKeyHash {
size_t operator()(const PredicateKey& key) const {
return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid));
}
};
fextl::unordered_map<PredicateKey, Ref, PredicateKeyHash> InitPredicateCache;

Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) {
PredicateKey Key {Pattern, Size};
auto ValIt = InitPredicateCache.find(Key);
if (ValIt == InitPredicateCache.end()) {
auto Predicate = _InitPredicate(Size, static_cast<uint8_t>(FEXCore::ToUnderlying(Pattern)));
InitPredicateCache[Key] = Predicate;
return Predicate;
}
return ValIt->second;
}

void ResetInitPredicateCache() {
InitPredicateCache.clear();
}

/**
* @name IR allocation routines
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -824,7 +824,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
} else {
// For X87 extended doubles, split before storing
Expand Down
24 changes: 20 additions & 4 deletions unittests/InstructionCountCI/X87ldst-SVE.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
},
"2-store 80bit": {
"x86InstructionCount": 2,
"ExpectedInstructionCount": 24,
"ExpectedInstructionCount": 25,
"x86Insts": [
"fstp tword [rax]",
"fstp tword [rax+10]"
Expand All @@ -56,6 +56,7 @@
"add x21, x4, #0xa (10)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w22, w22, w20",
Expand All @@ -68,7 +69,7 @@
},
"8-store 80bit": {
"x86InstructionCount": 8,
"ExpectedInstructionCount": 90,
"ExpectedInstructionCount": 97,
"x86Insts": [
"fstp tword [rax]",
"fstp tword [rax+10]",
Expand Down Expand Up @@ -96,6 +97,7 @@
"add x21, x4, #0xa (10)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -107,6 +109,7 @@
"add x21, x4, #0x14 (20)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -118,6 +121,7 @@
"add x21, x4, #0x1e (30)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -129,6 +133,7 @@
"add x21, x4, #0x28 (40)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -140,6 +145,7 @@
"add x21, x4, #0x32 (50)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -151,6 +157,7 @@
"add x21, x4, #0x3c (60)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -162,6 +169,7 @@
"add x21, x4, #0x46 (70)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w22, w22, w20",
Expand Down Expand Up @@ -193,7 +201,7 @@
},
"2-load 80bit": {
"x86InstructionCount": 2,
"ExpectedInstructionCount": 21,
"ExpectedInstructionCount": 22,
"x86Insts": [
"fld tword [rax]",
"fld tword [rax+10]"
Expand All @@ -202,6 +210,7 @@
"ptrue p2.h, vl5",
"ld1h {z2.h}, p2/z, [x4]",
"add x20, x4, #0xa (10)",
"ptrue p2.h, vl5",
"ld1h {z3.h}, p2/z, [x20]",
"ldrb w20, [x28, #1019]",
"sub w20, w20, #0x2 (2)",
Expand All @@ -224,7 +233,7 @@
},
"8-load 80bit": {
"x86InstructionCount": 8,
"ExpectedInstructionCount": 52,
"ExpectedInstructionCount": 59,
"x86Insts": [
"fld tword [rax]",
"fld tword [rax+10]",
Expand All @@ -239,18 +248,25 @@
"ptrue p2.h, vl5",
"ld1h {z2.h}, p2/z, [x4]",
"add x20, x4, #0xa (10)",
"ptrue p2.h, vl5",
"ld1h {z3.h}, p2/z, [x20]",
"add x20, x4, #0x14 (20)",
"ptrue p2.h, vl5",
"ld1h {z4.h}, p2/z, [x20]",
"add x20, x4, #0x1e (30)",
"ptrue p2.h, vl5",
"ld1h {z5.h}, p2/z, [x20]",
"add x20, x4, #0x28 (40)",
"ptrue p2.h, vl5",
"ld1h {z6.h}, p2/z, [x20]",
"add x20, x4, #0x32 (50)",
"ptrue p2.h, vl5",
"ld1h {z7.h}, p2/z, [x20]",
"add x20, x4, #0x3c (60)",
"ptrue p2.h, vl5",
"ld1h {z8.h}, p2/z, [x20]",
"add x20, x4, #0x46 (70)",
"ptrue p2.h, vl5",
"ld1h {z9.h}, p2/z, [x20]",
"ldrb w20, [x28, #1019]",
"sub w20, w20, #0x8 (8)",
Expand Down
Loading