Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve code size and compile time for local laplacian app #7927

Merged
merged 9 commits into from
Nov 21, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/interpolate/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
include ../support/Makefile.inc

.PHONY: build clean test
.SECONDARY:

build: $(BIN)/$(HL_TARGET)/filter

Expand Down
12 changes: 11 additions & 1 deletion apps/interpolate/interpolate_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
Var yo, yi, xo, xi, ci, xii, yii;
if (get_target().has_gpu_feature()) {
normalize
.never_partition_all()
.bound(x, 0, input.width())
.bound(y, 0, input.height())
.bound(c, 0, 3)
Expand All @@ -94,6 +95,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
for (int l = 1; l < levels; l++) {
downsampled[l]
.compute_root()
.never_partition_all()
.reorder(c, x, y)
.unroll(c)
.gpu_tile(x, y, xi, yi, 16, 16);
Expand All @@ -102,6 +104,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
for (int l = 3; l < levels; l += 2) {
interpolated[l]
.compute_root()
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp)
.tile(xi, yi, xii, yii, 2, 2)
Expand All @@ -114,6 +117,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

upsampledx[1]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 2, 1)
.unroll(xi)
Expand All @@ -123,6 +127,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

interpolated[1]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 2, 2)
.unroll(xi)
Expand All @@ -132,6 +137,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

interpolated[2]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.unroll(c)
.gpu_threads(x, y);
Expand All @@ -148,6 +154,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
// the local_laplacian app.
downsampled[l]
.compute_root()
.never_partition(x)
.reorder(x, c, y)
.split(y, yo, yi, 8)
.parallel(yo)
Expand All @@ -165,12 +172,14 @@ class Interpolate : public Halide::Generator<Interpolate> {
.compute_at(downsampled[1], yi)
.reorder(c, x, y)
.unroll(c)
.vectorize(x, vec);
.vectorize(x, vec)
.never_partition(y);

normalize
.bound(x, 0, input.width())
.bound(y, 0, input.height())
.bound(c, 0, 3)
.never_partition(y)
.split(x, xo, xi, vec)
.split(y, yo, yi, 32)
.reorder(xi, c, xo, yi, yo)
Expand All @@ -182,6 +191,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
interpolated[l]
.store_at(normalize, yo)
.compute_at(normalize, yi)
.never_partition_all()
.vectorize(x, vec);
}

Expand Down
1 change: 1 addition & 0 deletions apps/local_laplacian/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
include ../support/Makefile.inc

.PHONY: build clean test
.SECONDARY:

build: $(BIN)/$(HL_TARGET)/process

Expand Down
52 changes: 41 additions & 11 deletions apps/local_laplacian/local_laplacian_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
// Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
Func color;
float eps = 0.01f;
color(x, y, c) = outGPyramid[0](x, y) * (floating(x, y, c) + eps) / (gray(x, y) + eps);
color(x, y, c) = input(x, y, c) * (outGPyramid[0](x, y) + eps) / (gray(x, y) + eps);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a bit-exact change? I guess it's very close and doesn't really matter (especially if it helps to avoid the boundary condition), just wanted to double-check.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is not bit exact, but it made more sense to me with the change to the scaling of this term. Before it took the ratio of the input color channel to the input grayscale image, and applied that ratio to the output grayscale. Now it computes the ratio of the output grayscale to the input grayscale, and applies that as a scaling factor to the input.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(The only difference is which term in the numerator gets a +eps)


// Convert back to 16-bit
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f);
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 65535.0f));

/* ESTIMATES */
// (This can be useful in conjunction with RunGen and benchmarks as well
Expand All @@ -102,21 +102,36 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
// Nothing.
} else if (get_target().has_gpu_feature()) {
// GPU schedule.
// 3.19ms on an RTX 2060.
// 2.9ms on an RTX 2060.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting. If loop partitioning did not impact performance, then what did in this PR? Newer LLVM? Newer CUDA driver? Or the non-bit-exact change from above?


// All loop partitioning disabled, which has no effect on runtime,
// but saves 15% compile time and 45% ptx shader code size.
remap.compute_root();
Var xi, yi;
output.compute_root().gpu_tile(x, y, xi, yi, 16, 8);
output.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, 16, 8);
for (int j = 0; j < J; j++) {
int blockw = 16, blockh = 8;
if (j > 3) {
blockw = 2;
blockh = 2;
}
if (j > 0) {
inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh);
gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh);
inGPyramid[j]
.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
gPyramid[j]
.compute_root()
.reorder(k, x, y)
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
}
outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh);
outGPyramid[j]
.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
}
} else {
// CPU schedule.
Expand All @@ -131,8 +146,16 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {

remap.compute_root();
Var yo;
output.reorder(c, x, y).split(y, yo, y, 64).parallel(yo).vectorize(x, 8);
gray.compute_root().parallel(y, 32).vectorize(x, 8);
output
.reorder(c, x, y)
.split(y, yo, y, 64)
.parallel(yo)
.vectorize(x, 8);
gray
.compute_root()
.never_partition(y)
.parallel(y, 32)
.vectorize(x, 8);
for (int j = 1; j < 5; j++) {
inGPyramid[j]
.compute_root()
Expand All @@ -148,12 +171,19 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
.store_at(output, yo)
.compute_at(output, y)
.fold_storage(y, 4)
.vectorize(x, 8);
.vectorize(x, 8, TailStrategy::RoundUp);
if (j > 1) {
// Turn off loop partitioning at higher pyramid levels. This
// shaves about 3% off code size and compile time without
// affecting performance.
inGPyramid[j].never_partition_all();
gPyramid[j].never_partition_all();
}
}
outGPyramid[0]
.compute_at(output, y)
.hoist_storage(output, yo)
.vectorize(x, 8);
.vectorize(x, 8, TailStrategy::RoundUp);
for (int j = 5; j < J; j++) {
inGPyramid[j].compute_root();
gPyramid[j].compute_root().parallel(k);
Expand Down
28 changes: 28 additions & 0 deletions src/Func.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1649,6 +1649,22 @@ Stage &Stage::partition(const VarOrRVar &var, Partition policy) {
return *this;
}

/** Set the loop partition policy to Never for each of the given Vars/RVars
 * on this Stage. Equivalent to calling partition(v, Partition::Never) for
 * each element of vars. Returns *this to allow scheduling-call chaining. */
Stage &Stage::never_partition(const std::vector<VarOrRVar> &vars) {
    // Iterate by const reference: copying each VarOrRVar per iteration is
    // an unnecessary copy (clang-tidy: performance-for-range-copy).
    for (const auto &v : vars) {
        partition(v, Partition::Never);
    }
    return *this;
}

/** Set the loop partition policy to Never for every dimension of this
 * Stage's definition. Marks the schedule as touched. Returns *this to
 * allow scheduling-call chaining. */
Stage &Stage::never_partition_all() {
    definition.schedule().touched() = true;
    // Stamp the Never policy onto each scheduled dimension in place.
    for (Dim &d : definition.schedule().dims()) {
        d.partition_policy = Partition::Never;
    }
    return *this;
}

Stage &Stage::tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xo, const VarOrRVar &yo,
const VarOrRVar &xi, const VarOrRVar &yi,
Expand Down Expand Up @@ -2342,6 +2358,18 @@ Func &Func::partition(const VarOrRVar &var, Partition policy) {
return *this;
}

/** Set the loop partition policy to Never for the given Vars/RVars on the
 * pure definition of this Func. Forwards to the Stage implementation. */
Func &Func::never_partition(const std::vector<VarOrRVar> &vars) {
    invalidate_cache();
    // Delegate to the Stage scheduling primitive for the pure definition.
    Stage stage(func, func.definition(), 0);
    stage.never_partition(vars);
    return *this;
}

/** Set the loop partition policy to Never for all dimensions of the pure
 * definition of this Func. Forwards to the Stage implementation. */
Func &Func::never_partition_all() {
    invalidate_cache();
    // Delegate to the Stage scheduling primitive for the pure definition.
    Stage stage(func, func.definition(), 0);
    stage.never_partition_all();
    return *this;
}

Func &Func::bound(const Var &var, Expr min, Expr extent) {
user_assert(!min.defined() || Int(32).can_represent(min.type())) << "Can't represent min bound in int32\n";
user_assert(extent.defined()) << "Extent bound of a Func can't be undefined\n";
Expand Down
26 changes: 26 additions & 0 deletions src/Func.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,8 @@ class Stage {
Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
Stage &partition(const VarOrRVar &var, Partition partition_policy);
Stage &never_partition_all();
Stage &never_partition(const std::vector<VarOrRVar> &vars);
Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xo, const VarOrRVar &yo,
const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
Expand Down Expand Up @@ -380,6 +382,13 @@ class Stage {
return reorder(collected_args);
}

/** Set the loop partition policy to Never for some number of Vars and
 * RVars, collected into a vector and forwarded to the vector overload. */
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
never_partition(const VarOrRVar &x, Args &&...args) {
    std::vector<VarOrRVar> vars;
    vars.reserve(1 + sizeof...(Args));
    vars.push_back(x);
    // C++17 fold expression: append the remaining args in order.
    (vars.push_back(std::forward<Args>(args)), ...);
    return never_partition(vars);
}

Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
Stage specialize(const Expr &condition);
void specialize_fail(const std::string &message);
Expand Down Expand Up @@ -1450,6 +1459,23 @@ class Func {
* The default policy is Auto. */
Func &partition(const VarOrRVar &var, Partition partition_policy);

/** Set the loop partition policy to Never for a vector of Vars and
* RVars. */
Func &never_partition(const std::vector<VarOrRVar> &vars);

/** Set the loop partition policy to Never for some number of Vars and
 * RVars, collected into a vector and forwarded to the vector overload. */
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
never_partition(const VarOrRVar &x, Args &&...args) {
    std::vector<VarOrRVar> vars;
    vars.reserve(1 + sizeof...(Args));
    vars.push_back(x);
    // C++17 fold expression: append the remaining args in order.
    (vars.push_back(std::forward<Args>(args)), ...);
    return never_partition(vars);
}

/** Set the loop partition policy to Never for all Vars and RVar of the
* initial definition of the Func. It must be called separately on any
* update definitions. */
Func &never_partition_all();

/** Statically declare that the range over which a function should
* be evaluated is given by the second and third arguments. This
* can let Halide perform some optimizations. E.g. if you know
Expand Down
1 change: 1 addition & 0 deletions src/Generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -3052,6 +3052,7 @@ class NamesInterface {
using LoopLevel = Halide::LoopLevel;
using MemoryType = Halide::MemoryType;
using NameMangling = Halide::NameMangling;
using Partition = Halide::Partition;
using Pipeline = Halide::Pipeline;
using PrefetchBoundStrategy = Halide::PrefetchBoundStrategy;
using RDom = Halide::RDom;
Expand Down
Loading