Skip to content

Commit 684ab05

Browse files
admitric authored and igcbot committed
Improve CodeScheduling
- Add caching for register pressure estimation, real uses computation and values size
- Implement fragmentation-aware register pressure adjustment heuristic for large loads
- Add new heuristic for prioritizing loads that unlock DPAS instructions
- Fix initial register pressure estimation for hoisted loads and corresponding IEs in BBIn
- Fix ftobf regpressure estimation
- Some changes of the whole scheduling workflow to take advantage of the backtracking
- Add new heuristic to put instructions between the load and the subsequent shuffling to hide latency
1 parent f8ce0b6 commit 684ab05

File tree

10 files changed

+536
-163
lines changed

10 files changed

+536
-163
lines changed

IGC/Compiler/CISACodeGen/CodeScheduling.cpp

Lines changed: 422 additions & 116 deletions
Large diffs are not rendered by default.

IGC/Compiler/CISACodeGen/CodeSchedulingOptionsDef.h

Lines changed: 59 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,44 +11,87 @@ SPDX-License-Identifier: MIT
1111
// For the usage with IGC_CodeSchedulingConfig option, see CodeScheduling.cpp, class SchedulingConfig
1212
// Publicly available options are in igc_flags.h, search for CodeScheduling*
1313

14-
// Edge weigths
14+
// Generate default options line:
15+
// clang-format off
16+
// python3 -c "print('IGC_CodeSchedulingConfig=\"' + ';'.join([line.split(',')[1].strip() for line in open('CodeSchedulingOptionsDef.h') if line.strip().startswith('DECLARE_SCHEDULING_OPTION')]) + '\"')"
17+
// clang-format on
18+
19+
// Edge weights
1520
DECLARE_SCHEDULING_OPTION(DefaultWeight, 10, "Default edge weight for dependency graph")
1621
DECLARE_SCHEDULING_OPTION(UseHighRPWeight, 1, "Use alternative weights when register pressure is high")
17-
DECLARE_SCHEDULING_OPTION(Weight2dBlockReadSrcDep, 300, "Edge weight for 2D block read source dependency")
18-
DECLARE_SCHEDULING_OPTION(Weight2dBlockReadDstDep, 3000, "Edge weight for 2D block read destination dependency")
22+
DECLARE_SCHEDULING_OPTION(Weight2dBlockReadSrcDep, 0, "Edge weight for 2D block read source dependency")
23+
DECLARE_SCHEDULING_OPTION(Weight2dBlockReadDstDep, 30000, "Edge weight for 2D block read destination dependency")
1924
DECLARE_SCHEDULING_OPTION(Weight2dBlockReadDstDepHighRP, 100,
20-
"Edge weight for 2D block read destination dependency under high register pressure")
21-
DECLARE_SCHEDULING_OPTION(Weight2dBlockSetPayloadFieldDstDep, 30,
25+
"Edge weight for 2D block read destination "
26+
"dependency under high register pressure")
27+
DECLARE_SCHEDULING_OPTION(Weight2dBlockSetPayloadFieldDstDep, 0,
2228
"Edge weight for 2D block set payload field destination dependency")
2329
DECLARE_SCHEDULING_OPTION(WeightPrefetch, 100000, "Edge weight for prefetch instructions")
2430
DECLARE_SCHEDULING_OPTION(WeightDPASDstDep, 1000, "Edge weight for DPAS destination dependency")
2531
DECLARE_SCHEDULING_OPTION(WeightDPASDstDepHighRP, 6000,
2632
"Edge weight for DPAS destination dependency under high register pressure")
2733
DECLARE_SCHEDULING_OPTION(WeightExtendedMathDstDep, 200, "Edge weight for extended math destination dependency")
34+
DECLARE_SCHEDULING_OPTION(WeightWaveAllDstDep, 10, "Edge weight for wave all destination dependency")
2835
DECLARE_SCHEDULING_OPTION(WeightUnknownMemoryReadDstDep, 500,
2936
"Edge weight for unknown memory read destination dependency")
3037
DECLARE_SCHEDULING_OPTION(WeightUnknownVectorShuffleDstDep, 50,
3138
"Edge weight for unknown vector shuffle destination dependency")
39+
DECLARE_SCHEDULING_OPTION(LoadSizeAdditionalWeight, 0,
40+
"Add this weight * multiplier * load size to the basic load weight")
41+
DECLARE_SCHEDULING_OPTION(LoadSizeWeightFactor, 1,
42+
"Add additional weight * this multiplier * load size "
43+
"to the basic load weight")
44+
DECLARE_SCHEDULING_OPTION(AddWeightToTerminatorEdge, 1,
45+
"Add weight to the edge from the last instruction in "
46+
"the block to the terminator instruction (0/1)")
3247

33-
// Heurictics
34-
DECLARE_SCHEDULING_OPTION(PrioritizeLargeBlockLoadsInRP, 32,
35-
"Heuristic: Prioritize block loads larger than the value when register pressure is high")
36-
DECLARE_SCHEDULING_OPTION(PrioritizeDPASHighRP, 0,
48+
// Heuristics
49+
DECLARE_SCHEDULING_OPTION(PrioritizeLargeBlockLoadsInRP, 0,
50+
"Heuristic: Prioritize block loads larger than the "
51+
"value when register pressure is high. 0 is disabled")
52+
DECLARE_SCHEDULING_OPTION(PrioritizeDPASHighRP, 1,
3753
"Heuristic: Prioritize DPAS instructions under high register pressure")
54+
DECLARE_SCHEDULING_OPTION(PrioritizeDPASAndOtherOverImmediateVS, 1,
55+
"Heuristic: Prioritize DPAS and some other instructions before vector "
56+
"shuffle no-noop patterns that are supposed to be immediate, "
57+
" but only when the instruction doesn't increase register pressure significantly")
58+
DECLARE_SCHEDULING_OPTION(PrioritizeOverImmediateVSMaxRPInBytes, 8,
59+
"Heuristic: Maximum register pressure in bytes to consider for prioritizing "
60+
"DPAS and other instructions over immediate vector shuffle patterns")
61+
DECLARE_SCHEDULING_OPTION(PrioritizeLoadsThatUnlockDPASesHighRP, 1,
62+
"Heuristic: Prioritize loads that unlock DPAS "
63+
"instructions under high register pressure")
64+
DECLARE_SCHEDULING_OPTION(PrioritizeLoadsThatUnlockDPASesHighRP_MaxLoadSize, 32,
65+
"Heuristic: Maximum load size (in number of elements) to consider for "
66+
"prioritizing loads that unlock DPAS instructions")
3867

3968
// RP management control options
4069
DECLARE_SCHEDULING_OPTION(GreedyRPThresholdDelta, 20, "Threshold delta for greedy register pressure scheduling")
4170
DECLARE_SCHEDULING_OPTION(LowRPThresholdDelta, 200, "Unused: Threshold delta for low register pressure")
4271
DECLARE_SCHEDULING_OPTION(MinLiveIntervalForCloning, 200, "Minimum live interval for cloning instructions")
43-
DECLARE_SCHEDULING_OPTION(ReservedRegisters, 2, "Number of always reserved registers")
72+
DECLARE_SCHEDULING_OPTION(ReservedRegisters, 5, "Number of always reserved registers")
73+
DECLARE_SCHEDULING_OPTION(LargeBlockLoadSize, 16,
74+
"Size of large load to always make a checkpoint, in number of elements")
75+
DECLARE_SCHEDULING_OPTION(LargeLoadSizeForFragmentationAdjustment, 16,
76+
"Size of large load to consider for fragmentation "
77+
"adjustment, in number of elements")
78+
DECLARE_SCHEDULING_OPTION(RPMarginIncreaseForFragmentationAdjustment, 34,
79+
"Increase register pressure margin for fragmentation adjustment")
80+
DECLARE_SCHEDULING_OPTION(FragmentationAdjustmentsMinGRF, 200,
81+
"Minimum number of GRFs to apply fragmentation adjustments")
82+
DECLARE_SCHEDULING_OPTION(IgnoreFragmentationForLastLoad, 1,
83+
"Ignore fragmentation for the last load in the block, i.e. do not increase "
84+
"register pressure margin for it")
4485

4586
// Other
4687
DECLARE_SCHEDULING_OPTION(ForceSIMDSize, 16,
47-
"Force SIMD mode for scheduling, 0 is no force, 8 is SIMD8, 16 is SIMD16, etc.")
88+
"Force SIMD mode for scheduling, 0 is no force, 8 is "
89+
"SIMD8, 16 is SIMD16, etc.")
4890
DECLARE_SCHEDULING_OPTION(DefaultNumGRF, 128,
49-
"Default number of GRFs for scheduling, used when the context does not provide it")
50-
DECLARE_SCHEDULING_OPTION(
51-
DefaultNumGRFAuto, 256,
52-
"Default number of GRFs for scheduling, used when the context does not provide it, but auto is enabled")
91+
"Default number of GRFs for scheduling, used when "
92+
"the context does not provide it")
93+
DECLARE_SCHEDULING_OPTION(DefaultNumGRFAuto, 256,
94+
"Default number of GRFs for scheduling, used when the context does not "
95+
"provide it, but auto is enabled")
5396

54-
#endif
97+
#endif

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,9 @@ void AddAnalysisPasses(CodeGenContext &ctx, IGCPassManager &mpm) {
245245
mpm.add(new CodeLoopSinking());
246246
}
247247
if (IGC_IS_FLAG_DISABLED(DisableCodeScheduling) && (ctx.type == ShaderType::OPENCL_SHADER)) {
248-
mpm.add(new CodeScheduling());
248+
if (IGC_IS_FLAG_DISABLED(CodeSchedulingOnlyRecompilation) || ctx.m_retryManager.AllowCodeScheduling()) {
249+
mpm.add(new CodeScheduling());
250+
}
249251
}
250252
}
251253

IGC/Compiler/CodeGenContext.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ SPDX-License-Identifier: MIT
2222

2323
namespace IGC {
2424
struct RetryState {
25+
bool allowCodeScheduling;
2526
bool allowAddrArithCloning;
2627
bool allowLICM;
2728
bool allowCodeSinking;
@@ -39,9 +40,9 @@ struct RetryState {
3940
};
4041

4142
static const RetryState RetryTable[] = {
42-
// adrCl licm codSk AdrSk Slice PrivM VISAP URBWr Coals GRF loadSk, SyncRT, compactspills
43-
{false, true, true, false, false, true, true, true, true, false, false, false, true, 1},
44-
{true, false, true, true, true, false, false, false, false, true, true, true, false, 500}};
43+
// sched adrCl licm codSk AdrSk Slice PrivM VISAP URBWr Coals GRF loadSk, SyncRT, compactspills
44+
{false, false, true, true, false, false, true, true, true, true, false, false, false, true, 1},
45+
{true, true, false, true, true, true, false, false, false, false, true, true, true, false, 500}};
4546

4647
static constexpr size_t RetryTableSize = sizeof(RetryTable) / sizeof(RetryState);
4748

@@ -111,6 +112,12 @@ bool RetryManager::AllowCloneAddressArithmetic(Function *F) const {
111112
return RetryTable[id].allowAddrArithCloning;
112113
}
113114

115+
bool RetryManager::AllowCodeScheduling(Function *F) const {
116+
unsigned id = GetPerFuncRetryStateId(F);
117+
IGC_ASSERT(id < RetryTableSize);
118+
return RetryTable[id].allowCodeScheduling;
119+
}
120+
114121
bool RetryManager::AllowSimd32Slicing(Function *F) const {
115122
unsigned id = GetPerFuncRetryStateId(F);
116123
IGC_ASSERT(id < RetryTableSize);

IGC/Compiler/CodeGenPublic.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,7 @@ class RetryManager {
675675
bool AllowCodeSinking(llvm::Function *F = nullptr) const;
676676
bool AllowAddressArithmeticSinking(llvm::Function *F = nullptr) const;
677677
bool AllowCloneAddressArithmetic(llvm::Function *F = nullptr) const;
678+
bool AllowCodeScheduling(llvm::Function *F = nullptr) const;
678679
bool AllowSimd32Slicing(llvm::Function *F = nullptr) const;
679680
bool AllowLargeURBWrite(llvm::Function *F = nullptr) const;
680681
bool AllowConstantCoalescing(llvm::Function *F = nullptr) const;

0 commit comments

Comments
 (0)