From 5c275b21de380666918b628275ef193e0d456228 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 31 Oct 2023 12:04:14 -0700 Subject: [PATCH 1/9] Improve code size and compile time for local laplacian app This reduces compile time for the manual local laplacian schedule from 4.9s to 2.2s, and reduces code size from 126k to 82k Most of the reduction comes from avoiding a pointless boundary condition in the output Func. A smaller amount comes from avoiding loop partitioning using RoundUp and Partition::Never. The Partition::Never calls are responsible for a 3% reduction in code size and compile times by themselves. This has basically no effect on runtime. It seems to reduce it very slightly, but it's in the noise. --- apps/local_laplacian/Makefile | 1 + .../local_laplacian_generator.cpp | 26 ++++++++++++++----- src/Generator.h | 1 + 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/apps/local_laplacian/Makefile b/apps/local_laplacian/Makefile index a9f57b4de81a..a2c9991151f8 100644 --- a/apps/local_laplacian/Makefile +++ b/apps/local_laplacian/Makefile @@ -1,6 +1,7 @@ include ../support/Makefile.inc .PHONY: build clean test +.SECONDARY: build: $(BIN)/$(HL_TARGET)/process diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index ef305837c6cc..98113cb5b386 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -81,10 +81,10 @@ class LocalLaplacian : public Halide::Generator { // Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input) Func color; float eps = 0.01f; - color(x, y, c) = outGPyramid[0](x, y) * (floating(x, y, c) + eps) / (gray(x, y) + eps); + color(x, y, c) = input(x, y, c) * (outGPyramid[0](x, y) + eps) / (gray(x, y) + eps); // Convert back to 16-bit - output(x, y, c) = cast(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f); + output(x, y, c) = cast(clamp(color(x, y, c), 0.0f, 
65535.0f)); /* ESTIMATES */ // (This can be useful in conjunction with RunGen and benchmarks as well @@ -131,8 +131,15 @@ class LocalLaplacian : public Halide::Generator { remap.compute_root(); Var yo; - output.reorder(c, x, y).split(y, yo, y, 64).parallel(yo).vectorize(x, 8); - gray.compute_root().parallel(y, 32).vectorize(x, 8); + output + .reorder(c, x, y) + .split(y, yo, y, 64) + .parallel(yo) + .vectorize(x, 8); + gray + .compute_root() + .parallel(y, 32) + .vectorize(x, 8); for (int j = 1; j < 5; j++) { inGPyramid[j] .compute_root() @@ -148,12 +155,19 @@ class LocalLaplacian : public Halide::Generator { .store_at(output, yo) .compute_at(output, y) .fold_storage(y, 4) - .vectorize(x, 8); + .vectorize(x, 8, TailStrategy::RoundUp); + if (j > 1) { + // Turn off loop partitioning at higher pyramid levels. This + // shaves about 3% off code size and compile time without + // affecting performance. + inGPyramid[j].partition(x, Partition::Never); + gPyramid[j].partition(x, Partition::Never); + } } outGPyramid[0] .compute_at(output, y) .hoist_storage(output, yo) - .vectorize(x, 8); + .vectorize(x, 8, TailStrategy::RoundUp); for (int j = 5; j < J; j++) { inGPyramid[j].compute_root(); gPyramid[j].compute_root().parallel(k); diff --git a/src/Generator.h b/src/Generator.h index 9bc335b52ed7..1df0a1dda15b 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -3052,6 +3052,7 @@ class NamesInterface { using LoopLevel = Halide::LoopLevel; using MemoryType = Halide::MemoryType; using NameMangling = Halide::NameMangling; + using Partition = Halide::Partition; using Pipeline = Halide::Pipeline; using PrefetchBoundStrategy = Halide::PrefetchBoundStrategy; using RDom = Halide::RDom; From 71d92b995c035c14b3613ba2611f5677d3154e83 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 31 Oct 2023 12:48:53 -0700 Subject: [PATCH 2/9] Remove the partition in y on gray --- apps/local_laplacian/local_laplacian_generator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index 98113cb5b386..97852c1a9a93 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -138,6 +138,7 @@ class LocalLaplacian : public Halide::Generator { .vectorize(x, 8); gray .compute_root() + .partition(y, Partition::Never) .parallel(y, 32) .vectorize(x, 8); for (int j = 1; j < 5; j++) { From 1baac8ad55cfb8c92a1ae904a463b0278429cb3a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 31 Oct 2023 13:29:46 -0700 Subject: [PATCH 3/9] Also update GPU schedule --- .../local_laplacian_generator.cpp | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index 97852c1a9a93..0e1be6e9cb26 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -102,10 +102,16 @@ class LocalLaplacian : public Halide::Generator { // Nothing. } else if (get_target().has_gpu_feature()) { // GPU schedule. - // 3.19ms on an RTX 2060. + // 2.9ms on an RTX 2060. + + // All loop partitioning disabled, which has no effect on runtime, + // but saves 15% compile time and 45% ptx shader code size. 
remap.compute_root(); Var xi, yi; - output.compute_root().gpu_tile(x, y, xi, yi, 16, 8); + output.compute_root() + .partition(x, Partition::Never) + .partition(y, Partition::Never) + .gpu_tile(x, y, xi, yi, 16, 8); for (int j = 0; j < J; j++) { int blockw = 16, blockh = 8; if (j > 3) { @@ -113,10 +119,23 @@ class LocalLaplacian : public Halide::Generator { blockh = 2; } if (j > 0) { - inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); - gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh); + inGPyramid[j] + .compute_root() + .partition(x, Partition::Never) + .partition(y, Partition::Never) + .gpu_tile(x, y, xi, yi, blockw, blockh); + gPyramid[j] + .compute_root() + .reorder(k, x, y) + .partition(x, Partition::Never) + .partition(y, Partition::Never) + .gpu_tile(x, y, xi, yi, blockw, blockh); } - outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); + outGPyramid[j] + .compute_root() + .partition(x, Partition::Never) + .partition(y, Partition::Never) + .gpu_tile(x, y, xi, yi, blockw, blockh); } } else { // CPU schedule. 
From 49e5e9e7af7b3b1bb301c19f5f0c6400fb1ac143 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 1 Nov 2023 10:38:35 -0700 Subject: [PATCH 4/9] Selectively disable loop partitioning in interpolate app avx512 schedule reduces code size from 95k to 60k cuda schedule reduces code size from 261k to 194k No impact on performance --- apps/interpolate/Makefile | 1 + apps/interpolate/interpolate_generator.cpp | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/apps/interpolate/Makefile b/apps/interpolate/Makefile index 95c165b533ee..e5760d9f0039 100644 --- a/apps/interpolate/Makefile +++ b/apps/interpolate/Makefile @@ -1,6 +1,7 @@ include ../support/Makefile.inc .PHONY: build clean test +.SECONDARY: build: $(BIN)/$(HL_TARGET)/filter diff --git a/apps/interpolate/interpolate_generator.cpp b/apps/interpolate/interpolate_generator.cpp index 1e4026b9ef87..f2e14aaedf46 100644 --- a/apps/interpolate/interpolate_generator.cpp +++ b/apps/interpolate/interpolate_generator.cpp @@ -79,6 +79,8 @@ class Interpolate : public Halide::Generator { Var yo, yi, xo, xi, ci, xii, yii; if (get_target().has_gpu_feature()) { normalize + .partition(x, Partition::Never) + .partition(y, Partition::Never) .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) @@ -94,6 +96,8 @@ class Interpolate : public Halide::Generator { for (int l = 1; l < levels; l++) { downsampled[l] .compute_root() + .partition(x, Partition::Never) + .partition(y, Partition::Never) .reorder(c, x, y) .unroll(c) .gpu_tile(x, y, xi, yi, 16, 16); @@ -102,6 +106,8 @@ class Interpolate : public Halide::Generator { for (int l = 3; l < levels; l += 2) { interpolated[l] .compute_root() + .partition(x, Partition::Never) + .partition(y, Partition::Never) .reorder(c, x, y) .tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp) .tile(xi, yi, xii, yii, 2, 2) @@ -114,6 +120,8 @@ class Interpolate : public Halide::Generator { upsampledx[1] .compute_at(normalize, x) + .partition(x, 
Partition::Never) + .partition(y, Partition::Never) .reorder(c, x, y) .tile(x, y, xi, yi, 2, 1) .unroll(xi) @@ -123,6 +131,8 @@ class Interpolate : public Halide::Generator { interpolated[1] .compute_at(normalize, x) + .partition(x, Partition::Never) + .partition(y, Partition::Never) .reorder(c, x, y) .tile(x, y, xi, yi, 2, 2) .unroll(xi) @@ -132,6 +142,8 @@ class Interpolate : public Halide::Generator { interpolated[2] .compute_at(normalize, x) + .partition(x, Partition::Never) + .partition(y, Partition::Never) .reorder(c, x, y) .unroll(c) .gpu_threads(x, y); @@ -148,6 +160,7 @@ class Interpolate : public Halide::Generator { // the local_laplacian app. downsampled[l] .compute_root() + .partition(x, Partition::Never) .reorder(x, c, y) .split(y, yo, yi, 8) .parallel(yo) @@ -165,12 +178,14 @@ class Interpolate : public Halide::Generator { .compute_at(downsampled[1], yi) .reorder(c, x, y) .unroll(c) - .vectorize(x, vec); + .vectorize(x, vec) + .partition(y, Partition::Never); normalize .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) + .partition(y, Partition::Never) .split(x, xo, xi, vec) .split(y, yo, yi, 32) .reorder(xi, c, xo, yi, yo) @@ -182,6 +197,7 @@ class Interpolate : public Halide::Generator { interpolated[l] .store_at(normalize, yo) .compute_at(normalize, yi) + .partition(x, Partition::Never) .vectorize(x, vec); } From 6411c2b7143287ff8c2ccf7e87de3f23793ba031 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sat, 11 Nov 2023 09:33:30 -0800 Subject: [PATCH 5/9] Add some syntactic sugar --- apps/interpolate/interpolate_generator.cpp | 26 +++++++---------- .../local_laplacian_generator.cpp | 18 +++++------- src/Func.cpp | 28 +++++++++++++++++++ src/Func.h | 26 +++++++++++++++++ 4 files changed, 71 insertions(+), 27 deletions(-) diff --git a/apps/interpolate/interpolate_generator.cpp b/apps/interpolate/interpolate_generator.cpp index f2e14aaedf46..b9792a6ec2d2 100644 --- a/apps/interpolate/interpolate_generator.cpp +++ 
b/apps/interpolate/interpolate_generator.cpp @@ -79,8 +79,7 @@ class Interpolate : public Halide::Generator { Var yo, yi, xo, xi, ci, xii, yii; if (get_target().has_gpu_feature()) { normalize - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) @@ -96,8 +95,7 @@ class Interpolate : public Halide::Generator { for (int l = 1; l < levels; l++) { downsampled[l] .compute_root() - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .reorder(c, x, y) .unroll(c) .gpu_tile(x, y, xi, yi, 16, 16); @@ -106,8 +104,7 @@ class Interpolate : public Halide::Generator { for (int l = 3; l < levels; l += 2) { interpolated[l] .compute_root() - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp) .tile(xi, yi, xii, yii, 2, 2) @@ -120,8 +117,7 @@ class Interpolate : public Halide::Generator { upsampledx[1] .compute_at(normalize, x) - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 2, 1) .unroll(xi) @@ -131,8 +127,7 @@ class Interpolate : public Halide::Generator { interpolated[1] .compute_at(normalize, x) - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 2, 2) .unroll(xi) @@ -142,8 +137,7 @@ class Interpolate : public Halide::Generator { interpolated[2] .compute_at(normalize, x) - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .reorder(c, x, y) .unroll(c) .gpu_threads(x, y); @@ -160,7 +154,7 @@ class Interpolate : public Halide::Generator { // the local_laplacian app. 
downsampled[l] .compute_root() - .partition(x, Partition::Never) + .never_partition(x) .reorder(x, c, y) .split(y, yo, yi, 8) .parallel(yo) @@ -179,13 +173,13 @@ class Interpolate : public Halide::Generator { .reorder(c, x, y) .unroll(c) .vectorize(x, vec) - .partition(y, Partition::Never); + .never_partition(y); normalize .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) - .partition(y, Partition::Never) + .never_partition(y) .split(x, xo, xi, vec) .split(y, yo, yi, 32) .reorder(xi, c, xo, yi, yo) @@ -197,7 +191,7 @@ class Interpolate : public Halide::Generator { interpolated[l] .store_at(normalize, yo) .compute_at(normalize, yi) - .partition(x, Partition::Never) + .never_partition(x) .vectorize(x, vec); } diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index 0e1be6e9cb26..860540e74517 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -109,8 +109,7 @@ class LocalLaplacian : public Halide::Generator { remap.compute_root(); Var xi, yi; output.compute_root() - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .gpu_tile(x, y, xi, yi, 16, 8); for (int j = 0; j < J; j++) { int blockw = 16, blockh = 8; @@ -121,20 +120,17 @@ class LocalLaplacian : public Halide::Generator { if (j > 0) { inGPyramid[j] .compute_root() - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .gpu_tile(x, y, xi, yi, blockw, blockh); gPyramid[j] .compute_root() .reorder(k, x, y) - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .gpu_tile(x, y, xi, yi, blockw, blockh); } outGPyramid[j] .compute_root() - .partition(x, Partition::Never) - .partition(y, Partition::Never) + .never_partition_all() .gpu_tile(x, y, xi, yi, blockw, blockh); } } else { @@ -157,7 +153,7 @@ class LocalLaplacian : public Halide::Generator { 
.vectorize(x, 8); gray .compute_root() - .partition(y, Partition::Never) + .never_partition(y) .parallel(y, 32) .vectorize(x, 8); for (int j = 1; j < 5; j++) { @@ -180,8 +176,8 @@ class LocalLaplacian : public Halide::Generator { // Turn off loop partitioning at higher pyramid levels. This // shaves about 3% off code size and compile time without // affecting performance. - inGPyramid[j].partition(x, Partition::Never); - gPyramid[j].partition(x, Partition::Never); + inGPyramid[j].never_partition_all(); + gPyramid[j].never_partition_all(); } } outGPyramid[0] diff --git a/src/Func.cpp b/src/Func.cpp index a8190876c6b2..bd92261f7897 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -1649,6 +1649,22 @@ Stage &Stage::partition(const VarOrRVar &var, Partition policy) { return *this; } +Stage &Stage::never_partition(const std::vector &vars) { + for (auto v : vars) { + partition(v, Partition::Never); + } + return *this; +} + +Stage &Stage::never_partition_all() { + definition.schedule().touched() = true; + vector &dims = definition.schedule().dims(); + for (auto &dim : dims) { + dim.partition_policy = Partition::Never; + } + return *this; +} + Stage &Stage::tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, @@ -2342,6 +2358,18 @@ Func &Func::partition(const VarOrRVar &var, Partition policy) { return *this; } +Func &Func::never_partition(const std::vector &vars) { + invalidate_cache(); + Stage(func, func.definition(), 0).never_partition(vars); + return *this; +} + +Func &Func::never_partition_all() { + invalidate_cache(); + Stage(func, func.definition(), 0).never_partition_all(); + return *this; +} + Func &Func::bound(const Var &var, Expr min, Expr extent) { user_assert(!min.defined() || Int(32).can_represent(min.type())) << "Can't represent min bound in int32\n"; user_assert(extent.defined()) << "Extent bound of a Func can't be undefined\n"; diff --git a/src/Func.h b/src/Func.h index 
2cad7160b823..958f57b3d413 100644 --- a/src/Func.h +++ b/src/Func.h @@ -349,6 +349,8 @@ class Stage { Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto); Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto); Stage &partition(const VarOrRVar &var, Partition partition_policy); + Stage &never_partition_all(); + Stage &never_partition(const std::vector &vars); Stage &tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, @@ -380,6 +382,13 @@ class Stage { return reorder(collected_args); } + template + HALIDE_NO_USER_CODE_INLINE typename std::enable_if::value, Stage &>::type + never_partition(const VarOrRVar &x, Args &&...args) { + std::vector collected_args{x, std::forward(args)...}; + return never_partition(collected_args); + } + Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name); Stage specialize(const Expr &condition); void specialize_fail(const std::string &message); @@ -1450,6 +1459,23 @@ class Func { * The default policy is Auto. */ Func &partition(const VarOrRVar &var, Partition partition_policy); + /** Set the loop partition policy to Never for a vector of Vars and + * RVars. */ + Func &never_partition(const std::vector &vars); + + /** Set the loop partition policy to Never for some number of Vars and RVars. */ + template + HALIDE_NO_USER_CODE_INLINE typename std::enable_if::value, Func &>::type + never_partition(const VarOrRVar &x, Args &&...args) { + std::vector collected_args{x, std::forward(args)...}; + return never_partition(collected_args); + } + + /** Set the loop partition policy to Never for all Vars and RVars of the + * initial definition of the Func. It must be called separately on any + * update definitions. 
*/ + Func &never_partition_all(); + /** Statically declare that the range over which a function should * be evaluated is given by the second and third arguments. This * can let Halide perform some optimizations. E.g. if you know From 933098c67dc4fbd5b69cc033a498ce8f82483f4b Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sat, 11 Nov 2023 09:35:20 -0800 Subject: [PATCH 6/9] Tweak code to better show intent y wasn't being partitioned, but this more clearly says "I'm optimizing for code size" --- apps/interpolate/interpolate_generator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/interpolate/interpolate_generator.cpp b/apps/interpolate/interpolate_generator.cpp index b9792a6ec2d2..ca751bab253f 100644 --- a/apps/interpolate/interpolate_generator.cpp +++ b/apps/interpolate/interpolate_generator.cpp @@ -191,7 +191,7 @@ class Interpolate : public Halide::Generator { interpolated[l] .store_at(normalize, yo) .compute_at(normalize, yi) - .never_partition(x) + .never_partition_all() .vectorize(x, vec); } From 5a2787ee65867c08c96c0af64aec6d538dda68a1 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 14 Nov 2023 13:35:32 -0800 Subject: [PATCH 7/9] Make Partition::Always partition even in outer loop tails. 
--- src/Func.cpp | 30 +++++++++++++++++++++++++++++- src/Func.h | 27 +++++++++++++++++++++++++++ src/PartitionLoops.cpp | 14 ++++++++++++-- test/correctness/likely.cpp | 19 ++++++++----------- 4 files changed, 76 insertions(+), 14 deletions(-) diff --git a/src/Func.cpp b/src/Func.cpp index bd92261f7897..37b64df5af5b 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -1650,7 +1650,7 @@ Stage &Stage::partition(const VarOrRVar &var, Partition policy) { } Stage &Stage::never_partition(const std::vector &vars) { - for (auto v : vars) { + for (const auto &v : vars) { partition(v, Partition::Never); } return *this; @@ -1665,6 +1665,22 @@ Stage &Stage::never_partition_all() { return *this; } +Stage &Stage::always_partition(const std::vector &vars) { + for (const auto &v : vars) { + partition(v, Partition::Always); + } + return *this; +} + +Stage &Stage::always_partition_all() { + definition.schedule().touched() = true; + vector &dims = definition.schedule().dims(); + for (auto &dim : dims) { + dim.partition_policy = Partition::Always; + } + return *this; +} + Stage &Stage::tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, @@ -2370,6 +2386,18 @@ Func &Func::never_partition_all() { return *this; } +Func &Func::always_partition(const std::vector &vars) { + invalidate_cache(); + Stage(func, func.definition(), 0).always_partition(vars); + return *this; +} + +Func &Func::always_partition_all() { + invalidate_cache(); + Stage(func, func.definition(), 0).always_partition_all(); + return *this; +} + Func &Func::bound(const Var &var, Expr min, Expr extent) { user_assert(!min.defined() || Int(32).can_represent(min.type())) << "Can't represent min bound in int32\n"; user_assert(extent.defined()) << "Extent bound of a Func can't be undefined\n"; diff --git a/src/Func.h b/src/Func.h index 958f57b3d413..ccadef338c29 100644 --- a/src/Func.h +++ b/src/Func.h @@ -351,6 +351,9 @@ class Stage { Stage &partition(const 
VarOrRVar &var, Partition partition_policy); Stage &never_partition_all(); Stage &never_partition(const std::vector &vars); + Stage &always_partition_all(); + Stage &always_partition(const std::vector &vars); + Stage &tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, @@ -389,6 +392,13 @@ class Stage { return never_partition(collected_args); } + template + HALIDE_NO_USER_CODE_INLINE typename std::enable_if::value, Stage &>::type + always_partition(const VarOrRVar &x, Args &&...args) { + std::vector collected_args{x, std::forward(args)...}; + return always_partition(collected_args); + } + Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name); Stage specialize(const Expr &condition); void specialize_fail(const std::string &message); @@ -1476,6 +1486,23 @@ class Func { * update definitions. */ Func &never_partition_all(); + /** Set the loop partition policy to Always for a vector of Vars and + * RVars. */ + Func &always_partition(const std::vector &vars); + + /** Set the loop partition policy to Always for some number of Vars and RVars. */ + template + HALIDE_NO_USER_CODE_INLINE typename std::enable_if::value, Func &>::type + always_partition(const VarOrRVar &x, Args &&...args) { + std::vector collected_args{x, std::forward(args)...}; + return always_partition(collected_args); + } + + /** Set the loop partition policy to Always for all Vars and RVars of the + * initial definition of the Func. It must be called separately on any + * update definitions. */ + Func &always_partition_all(); + /** Statically declare that the range over which a function should * be evaluated is given by the second and third arguments. This * can let Halide perform some optimizations. E.g. 
if you know diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index 7e2060d25c49..e2cf610d373a 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -517,10 +517,13 @@ class PartitionLoops : public IRMutator { using IRMutator::visit; bool in_gpu_loop = false; + bool in_tail = false; Stmt visit(const For *op) override { - // Do not partition if the schedule explicitly forbids. - if (op->partition_policy == Partition::Never) { + // Do not partition if the schedule explicitly forbids, or if it's set + // to automatic and we're in a loop tail. + if (op->partition_policy == Partition::Never || + (op->partition_policy == Partition::Auto && in_tail)) { return IRMutator::visit(op); } @@ -687,6 +690,13 @@ class PartitionLoops : public IRMutator { // Recurse on the middle section. simpler_body = mutate(simpler_body); + // Recurse on the prologue and epilogue, just for loops set to Partition::Always + { + ScopedValue s(in_tail, true); + epilogue = mutate(epilogue); + prologue = mutate(prologue); + } + // Construct variables for the bounds of the simplified middle section Expr min_steady = op->min, max_steady = op->extent + op->min; Expr prologue_val, epilogue_val; diff --git a/test/correctness/likely.cpp b/test/correctness/likely.cpp index fe834199f015..1da06bca274c 100644 --- a/test/correctness/likely.cpp +++ b/test/correctness/likely.cpp @@ -127,12 +127,12 @@ int main(int argc, char **argv) { count_partitions(g, 1); } - // The slicing applies to every loop level starting from the - // outermost one, but only recursively simplifies the clean steady - // state. It either splits things three (start, middle, end). So - // adding a boundary condition to a 2D computation will produce 5 - // code paths for the top, bottom, left, right, and center of the - // image. + // The slicing applies to every loop level starting from the outermost one, + // but only recursively simplifies the clean steady state. 
It splits + // things into three (start, middle, end). So adding a boundary condition to a 2D + // computation will produce 5 code paths for the top, bottom, left, right, + // and center of the image. With explicit control over loop partitioning, we + // might produce more or fewer. { Var y; Func g; @@ -144,7 +144,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition y, always partition x:\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Always); h2.partition(y, Partition::Never); count_partitions(h2, 3); // We expect left-center-right @@ -153,7 +152,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition x, always partition y:\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Never); h2.partition(y, Partition::Always); count_partitions(h2, 3); // We expect top-middle-bottom @@ -162,7 +160,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition x and y.\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Never); h2.partition(y, Partition::Never); count_partitions(h2, 1); @@ -171,10 +168,10 @@ int main(int argc, char **argv) { { debug(1) << "Always partition x and y.\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Always); h2.partition(y, Partition::Always); - count_partitions(h2, 5); + // All loops get partitioned, including the tails of outer loops, so we expect 9 zones. 
+ count_partitions(h2, 9); } } From 729248ce7475ad6395ee025badc81a90a8f11d3b Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 14 Nov 2023 13:39:55 -0800 Subject: [PATCH 8/9] Update comment --- src/LoopPartitioningDirective.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/LoopPartitioningDirective.h b/src/LoopPartitioningDirective.h index 3189add52d1a..c4c14de48f2a 100644 --- a/src/LoopPartitioningDirective.h +++ b/src/LoopPartitioningDirective.h @@ -20,8 +20,9 @@ enum class Partition { /** Disallow loop partitioning. */ Never, - /** Force partitioning of the loop. If Halide can't find a way to partition this loop, - * it will raise an error. */ + /** Force partitioning of the loop, even in the tail cases of outer + * partitioned loops. If Halide can't find a way to partition this loop, it + * will raise an error. */ Always }; From e8f1dbe02dc2440b99e49c9e642e1a2a7e7ffbd1 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 14 Nov 2023 15:32:39 -0800 Subject: [PATCH 9/9] Add a little picture of the 9 zones --- test/correctness/likely.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test/correctness/likely.cpp b/test/correctness/likely.cpp index 1da06bca274c..10a46ed94e2e 100644 --- a/test/correctness/likely.cpp +++ b/test/correctness/likely.cpp @@ -170,7 +170,16 @@ int main(int argc, char **argv) { Func h2 = h; h2.partition(x, Partition::Always); h2.partition(y, Partition::Always); - // All loops get partitioned, including the tails of outer loops, so we expect 9 zones. 
+ // All loops get partitioned, including the tails of outer loops, so we expect 9 zones: + /* + ---------------------------------------------- + | top left | top middle | top right | + | ------------------------------------------ | + | left | middle | right | + | ------------------------------------------ | + | bottom left | bottom middle | bottom right | + ---------------------------------------------- + */ count_partitions(h2, 9); } }