Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve code size and compile time for local laplacian app #7927

Merged
merged 9 commits into from
Nov 21, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/interpolate/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
include ../support/Makefile.inc

.PHONY: build clean test
.SECONDARY:

build: $(BIN)/$(HL_TARGET)/filter

Expand Down
12 changes: 11 additions & 1 deletion apps/interpolate/interpolate_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
Var yo, yi, xo, xi, ci, xii, yii;
if (get_target().has_gpu_feature()) {
normalize
.never_partition_all()
.bound(x, 0, input.width())
.bound(y, 0, input.height())
.bound(c, 0, 3)
Expand All @@ -94,6 +95,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
for (int l = 1; l < levels; l++) {
downsampled[l]
.compute_root()
.never_partition_all()
.reorder(c, x, y)
.unroll(c)
.gpu_tile(x, y, xi, yi, 16, 16);
Expand All @@ -102,6 +104,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
for (int l = 3; l < levels; l += 2) {
interpolated[l]
.compute_root()
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp)
.tile(xi, yi, xii, yii, 2, 2)
Expand All @@ -114,6 +117,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

upsampledx[1]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 2, 1)
.unroll(xi)
Expand All @@ -123,6 +127,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

interpolated[1]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 2, 2)
.unroll(xi)
Expand All @@ -132,6 +137,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

interpolated[2]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.unroll(c)
.gpu_threads(x, y);
Expand All @@ -148,6 +154,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
// the local_laplacian app.
downsampled[l]
.compute_root()
.never_partition(x)
.reorder(x, c, y)
.split(y, yo, yi, 8)
.parallel(yo)
Expand All @@ -165,12 +172,14 @@ class Interpolate : public Halide::Generator<Interpolate> {
.compute_at(downsampled[1], yi)
.reorder(c, x, y)
.unroll(c)
.vectorize(x, vec);
.vectorize(x, vec)
.never_partition(y);

normalize
.bound(x, 0, input.width())
.bound(y, 0, input.height())
.bound(c, 0, 3)
.never_partition(y)
.split(x, xo, xi, vec)
.split(y, yo, yi, 32)
.reorder(xi, c, xo, yi, yo)
Expand All @@ -182,6 +191,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
interpolated[l]
.store_at(normalize, yo)
.compute_at(normalize, yi)
.never_partition_all()
.vectorize(x, vec);
}

Expand Down
1 change: 1 addition & 0 deletions apps/local_laplacian/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
include ../support/Makefile.inc

.PHONY: build clean test
.SECONDARY:

build: $(BIN)/$(HL_TARGET)/process

Expand Down
52 changes: 41 additions & 11 deletions apps/local_laplacian/local_laplacian_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
// Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
Func color;
float eps = 0.01f;
color(x, y, c) = outGPyramid[0](x, y) * (floating(x, y, c) + eps) / (gray(x, y) + eps);
color(x, y, c) = input(x, y, c) * (outGPyramid[0](x, y) + eps) / (gray(x, y) + eps);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a bit-exact change? I guess it's very close and doesn't really matter (especially if it helps to avoid the boundary condition), just wanted to double-check.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is not bit exact, but it made more sense to me with the change to the scaling of this term. Before it took the ratio of the input color channel to the input grayscale image, and applied that ratio to the output grayscale. Now it computes the ratio of the output grayscale to the input grayscale, and applies that as a scaling factor to the input.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(The only difference is which term in the numerator gets a +eps)


// Convert back to 16-bit
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f);
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 65535.0f));

/* ESTIMATES */
// (This can be useful in conjunction with RunGen and benchmarks as well
Expand All @@ -102,21 +102,36 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
// Nothing.
} else if (get_target().has_gpu_feature()) {
// GPU schedule.
// 3.19ms on an RTX 2060.
// 2.9ms on an RTX 2060.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting. If loop partitioning did not impact performance, then what did in this PR? Newer LLVM? Newer CUDA driver? Or the non-bit-exact change from above?


// All loop partitioning disabled, which has no effect on runtime,
// but saves 15% compile time and 45% ptx shader code size.
remap.compute_root();
Var xi, yi;
output.compute_root().gpu_tile(x, y, xi, yi, 16, 8);
output.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, 16, 8);
for (int j = 0; j < J; j++) {
int blockw = 16, blockh = 8;
if (j > 3) {
blockw = 2;
blockh = 2;
}
if (j > 0) {
inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh);
gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh);
inGPyramid[j]
.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
gPyramid[j]
.compute_root()
.reorder(k, x, y)
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
}
outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh);
outGPyramid[j]
.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
}
} else {
// CPU schedule.
Expand All @@ -131,8 +146,16 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {

remap.compute_root();
Var yo;
output.reorder(c, x, y).split(y, yo, y, 64).parallel(yo).vectorize(x, 8);
gray.compute_root().parallel(y, 32).vectorize(x, 8);
output
.reorder(c, x, y)
.split(y, yo, y, 64)
.parallel(yo)
.vectorize(x, 8);
gray
.compute_root()
.never_partition(y)
.parallel(y, 32)
.vectorize(x, 8);
for (int j = 1; j < 5; j++) {
inGPyramid[j]
.compute_root()
Expand All @@ -148,12 +171,19 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
.store_at(output, yo)
.compute_at(output, y)
.fold_storage(y, 4)
.vectorize(x, 8);
.vectorize(x, 8, TailStrategy::RoundUp);
if (j > 1) {
// Turn off loop partitioning at higher pyramid levels. This
// shaves about 3% off code size and compile time without
// affecting performance.
inGPyramid[j].never_partition_all();
gPyramid[j].never_partition_all();
}
}
outGPyramid[0]
.compute_at(output, y)
.hoist_storage(output, yo)
.vectorize(x, 8);
.vectorize(x, 8, TailStrategy::RoundUp);
for (int j = 5; j < J; j++) {
inGPyramid[j].compute_root();
gPyramid[j].compute_root().parallel(k);
Expand Down
28 changes: 28 additions & 0 deletions src/Func.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1649,6 +1649,22 @@ Stage &Stage::partition(const VarOrRVar &var, Partition policy) {
return *this;
}

/** Set the loop partition policy to Never for each of the given Vars/RVars
 * on this Stage. Equivalent to calling partition(v, Partition::Never) for
 * each element of vars. Returns *this to allow scheduling-call chaining. */
Stage &Stage::never_partition(const std::vector<VarOrRVar> &vars) {
    // Iterate by const reference: copying each VarOrRVar per iteration is
    // an unnecessary copy (clang-tidy: performance-for-range-copy).
    for (const auto &v : vars) {
        partition(v, Partition::Never);
    }
    return *this;
}

/** Set the loop partition policy to Never for every dimension of this
 * Stage's definition. Marks the schedule as touched. Returns *this to
 * allow scheduling-call chaining. */
Stage &Stage::never_partition_all() {
    definition.schedule().touched() = true;
    // Stamp the Never policy onto each scheduled dimension in place.
    for (Dim &d : definition.schedule().dims()) {
        d.partition_policy = Partition::Never;
    }
    return *this;
}

Stage &Stage::tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xo, const VarOrRVar &yo,
const VarOrRVar &xi, const VarOrRVar &yi,
Expand Down Expand Up @@ -2342,6 +2358,18 @@ Func &Func::partition(const VarOrRVar &var, Partition policy) {
return *this;
}

/** Set the loop partition policy to Never for the given Vars/RVars on the
 * pure definition of this Func. Forwards to the Stage implementation. */
Func &Func::never_partition(const std::vector<VarOrRVar> &vars) {
    invalidate_cache();
    // Delegate to the Stage scheduling primitive for the pure definition.
    Stage stage(func, func.definition(), 0);
    stage.never_partition(vars);
    return *this;
}

/** Set the loop partition policy to Never for all dimensions of the pure
 * definition of this Func. Forwards to the Stage implementation. */
Func &Func::never_partition_all() {
    invalidate_cache();
    // Delegate to the Stage scheduling primitive for the pure definition.
    Stage stage(func, func.definition(), 0);
    stage.never_partition_all();
    return *this;
}

Func &Func::bound(const Var &var, Expr min, Expr extent) {
user_assert(!min.defined() || Int(32).can_represent(min.type())) << "Can't represent min bound in int32\n";
user_assert(extent.defined()) << "Extent bound of a Func can't be undefined\n";
Expand Down
26 changes: 26 additions & 0 deletions src/Func.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,8 @@ class Stage {
Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
Stage &partition(const VarOrRVar &var, Partition partition_policy);
Stage &never_partition_all();
Stage &never_partition(const std::vector<VarOrRVar> &vars);
Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xo, const VarOrRVar &yo,
const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
Expand Down Expand Up @@ -380,6 +382,13 @@ class Stage {
return reorder(collected_args);
}

/** Set the loop partition policy to Never for some number of Vars and
 * RVars, collected into a vector and forwarded to the vector overload. */
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
never_partition(const VarOrRVar &x, Args &&...args) {
    std::vector<VarOrRVar> vars;
    vars.reserve(1 + sizeof...(Args));
    vars.push_back(x);
    // C++17 fold expression: append the remaining args in order.
    (vars.push_back(std::forward<Args>(args)), ...);
    return never_partition(vars);
}

Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
Stage specialize(const Expr &condition);
void specialize_fail(const std::string &message);
Expand Down Expand Up @@ -1450,6 +1459,23 @@ class Func {
* The default policy is Auto. */
Func &partition(const VarOrRVar &var, Partition partition_policy);

/** Set the loop partition policy to Never for a vector of Vars and
* RVars. */
Func &never_partition(const std::vector<VarOrRVar> &vars);

/** Set the loop partition policy to Never for some number of Vars and
 * RVars, collected into a vector and forwarded to the vector overload. */
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
never_partition(const VarOrRVar &x, Args &&...args) {
    std::vector<VarOrRVar> vars;
    vars.reserve(1 + sizeof...(Args));
    vars.push_back(x);
    // C++17 fold expression: append the remaining args in order.
    (vars.push_back(std::forward<Args>(args)), ...);
    return never_partition(vars);
}

/** Set the loop partition policy to Never for all Vars and RVar of the
* initial definition of the Func. It must be called separately on any
* update definitions. */
Func &never_partition_all();

/** Statically declare that the range over which a function should
* be evaluated is given by the second and third arguments. This
* can let Halide perform some optimizations. E.g. if you know
Expand Down
1 change: 1 addition & 0 deletions src/Generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -3052,6 +3052,7 @@ class NamesInterface {
using LoopLevel = Halide::LoopLevel;
using MemoryType = Halide::MemoryType;
using NameMangling = Halide::NameMangling;
using Partition = Halide::Partition;
using Pipeline = Halide::Pipeline;
using PrefetchBoundStrategy = Halide::PrefetchBoundStrategy;
using RDom = Halide::RDom;
Expand Down
Loading