Improve code size and compile time for local laplacian app #7927

Merged · 9 commits · Nov 21, 2023

1 change: 1 addition & 0 deletions apps/interpolate/Makefile
@@ -1,6 +1,7 @@
include ../support/Makefile.inc

.PHONY: build clean test
.SECONDARY:

build: $(BIN)/$(HL_TARGET)/filter

12 changes: 11 additions & 1 deletion apps/interpolate/interpolate_generator.cpp
@@ -79,6 +79,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
Var yo, yi, xo, xi, ci, xii, yii;
if (get_target().has_gpu_feature()) {
normalize
.never_partition_all()
.bound(x, 0, input.width())
.bound(y, 0, input.height())
.bound(c, 0, 3)
@@ -94,6 +95,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
for (int l = 1; l < levels; l++) {
downsampled[l]
.compute_root()
.never_partition_all()
.reorder(c, x, y)
.unroll(c)
.gpu_tile(x, y, xi, yi, 16, 16);
@@ -102,6 +104,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
for (int l = 3; l < levels; l += 2) {
interpolated[l]
.compute_root()
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp)
.tile(xi, yi, xii, yii, 2, 2)
@@ -114,6 +117,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

upsampledx[1]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 2, 1)
.unroll(xi)
@@ -123,6 +127,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

interpolated[1]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 2, 2)
.unroll(xi)
@@ -132,6 +137,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

interpolated[2]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.unroll(c)
.gpu_threads(x, y);
@@ -148,6 +154,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
// the local_laplacian app.
downsampled[l]
.compute_root()
.never_partition(x)
.reorder(x, c, y)
.split(y, yo, yi, 8)
.parallel(yo)
@@ -165,12 +172,14 @@ class Interpolate : public Halide::Generator<Interpolate> {
.compute_at(downsampled[1], yi)
.reorder(c, x, y)
.unroll(c)
.vectorize(x, vec);
.vectorize(x, vec)
.never_partition(y);

normalize
.bound(x, 0, input.width())
.bound(y, 0, input.height())
.bound(c, 0, 3)
.never_partition(y)
.split(x, xo, xi, vec)
.split(y, yo, yi, 32)
.reorder(xi, c, xo, yi, yo)
@@ -182,6 +191,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
interpolated[l]
.store_at(normalize, yo)
.compute_at(normalize, yi)
.never_partition_all()
.vectorize(x, vec);
}

1 change: 1 addition & 0 deletions apps/local_laplacian/Makefile
@@ -1,6 +1,7 @@
include ../support/Makefile.inc

.PHONY: build clean test
.SECONDARY:

build: $(BIN)/$(HL_TARGET)/process

52 changes: 41 additions & 11 deletions apps/local_laplacian/local_laplacian_generator.cpp
@@ -81,10 +81,10 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
// Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
Func color;
float eps = 0.01f;
color(x, y, c) = outGPyramid[0](x, y) * (floating(x, y, c) + eps) / (gray(x, y) + eps);
color(x, y, c) = input(x, y, c) * (outGPyramid[0](x, y) + eps) / (gray(x, y) + eps);
Review comment (Member): This is not a bit-exact change? I guess it's very close and doesn't really matter (especially if it helps to avoid the boundary condition), just wanted to double-check.

Reply (Member, author): Yes, this is not bit-exact, but it made more sense to me with the change to the scaling of this term. Before, it took the ratio of the input color channel to the input grayscale image and applied that ratio to the output grayscale. Now it computes the ratio of the output grayscale to the input grayscale and applies that as a scaling factor to the input.

Reply (Member, author): (The only difference is which term in the numerator gets a +eps.)
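For reference, the two expressions from the diff above restated side by side (ε is the eps constant, outGPyramid₀ is outGPyramid[0]):

$$
\text{before:}\quad \text{color}(x,y,c) = \text{outGPyramid}_0(x,y)\cdot\frac{\text{floating}(x,y,c)+\varepsilon}{\text{gray}(x,y)+\varepsilon}
$$

$$
\text{after:}\quad \text{color}(x,y,c) = \text{input}(x,y,c)\cdot\frac{\text{outGPyramid}_0(x,y)+\varepsilon}{\text{gray}(x,y)+\varepsilon}
$$

Consistent with the new form scaling the 16-bit input directly rather than the [0, 1] floating-point image, the final cast below changes from scaling a [0, 1] value by 65535 to clamping directly to [0, 65535].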


// Convert back to 16-bit
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f);
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 65535.0f));

/* ESTIMATES */
// (This can be useful in conjunction with RunGen and benchmarks as well
@@ -102,21 +102,36 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
// Nothing.
} else if (get_target().has_gpu_feature()) {
// GPU schedule.
// 3.19ms on an RTX 2060.
// 2.9ms on an RTX 2060.
Review comment (Contributor): Interesting. If loop partitioning did not impact performance, then what did in this PR? Newer LLVM? Newer CUDA driver? Or the non-bit-exact change from above?


// All loop partitioning disabled, which has no effect on runtime,
// but saves 15% compile time and 45% ptx shader code size.
remap.compute_root();
Var xi, yi;
output.compute_root().gpu_tile(x, y, xi, yi, 16, 8);
output.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, 16, 8);
for (int j = 0; j < J; j++) {
int blockw = 16, blockh = 8;
if (j > 3) {
blockw = 2;
blockh = 2;
}
if (j > 0) {
inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh);
gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh);
inGPyramid[j]
.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
gPyramid[j]
.compute_root()
.reorder(k, x, y)
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
}
outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh);
outGPyramid[j]
.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
}
} else {
// CPU schedule.
@@ -131,8 +146,16 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {

remap.compute_root();
Var yo;
output.reorder(c, x, y).split(y, yo, y, 64).parallel(yo).vectorize(x, 8);
gray.compute_root().parallel(y, 32).vectorize(x, 8);
output
.reorder(c, x, y)
.split(y, yo, y, 64)
.parallel(yo)
.vectorize(x, 8);
gray
.compute_root()
.never_partition(y)
.parallel(y, 32)
.vectorize(x, 8);
for (int j = 1; j < 5; j++) {
inGPyramid[j]
.compute_root()
@@ -148,12 +171,19 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
.store_at(output, yo)
.compute_at(output, y)
.fold_storage(y, 4)
.vectorize(x, 8);
.vectorize(x, 8, TailStrategy::RoundUp);
if (j > 1) {
// Turn off loop partitioning at higher pyramid levels. This
// shaves about 3% off code size and compile time without
// affecting performance.
inGPyramid[j].never_partition_all();
gPyramid[j].never_partition_all();
}
}
outGPyramid[0]
.compute_at(output, y)
.hoist_storage(output, yo)
.vectorize(x, 8);
.vectorize(x, 8, TailStrategy::RoundUp);
for (int j = 5; j < J; j++) {
inGPyramid[j].compute_root();
gPyramid[j].compute_root().parallel(k);
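One related scheduling detail from the hunk above: vectorize(x, 8) becomes vectorize(x, 8, TailStrategy::RoundUp). RoundUp rounds the split loop's extent up to a multiple of the factor, so no scalar tail case is emitted at all, which presumably fits the goal of reducing tail code here; it is only legal where computing a few extra points is safe, such as an intermediate Func with its own allocation. A minimal standalone sketch (not from this PR; the pipeline and names are illustrative):

```cpp
#include "Halide.h"
using namespace Halide;

int main() {
    ImageParam in(Float(32), 2, "in");
    Var x("x"), y("y");

    Func blur_x("blur_x"), out("out");
    blur_x(x, y) = (in(x, y) + in(x + 1, y)) * 0.5f;
    out(x, y) = blur_x(x, y);

    // blur_x is an intermediate with its own allocation, so it is safe to
    // compute a few extra columns: RoundUp pads its x extent to a multiple
    // of 8 and no scalar tail loop is generated for it.
    blur_x.compute_at(out, y)
          .vectorize(x, 8, TailStrategy::RoundUp);
    out.vectorize(x, 8);

    out.compile_jit();
    return 0;
}
```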
56 changes: 56 additions & 0 deletions src/Func.cpp
@@ -1649,6 +1649,38 @@ Stage &Stage::partition(const VarOrRVar &var, Partition policy) {
return *this;
}

Stage &Stage::never_partition(const std::vector<VarOrRVar> &vars) {
for (const auto &v : vars) {
partition(v, Partition::Never);
}
return *this;
}

Stage &Stage::never_partition_all() {
definition.schedule().touched() = true;
vector<Dim> &dims = definition.schedule().dims();
for (auto &dim : dims) {
dim.partition_policy = Partition::Never;
}
return *this;
}

Stage &Stage::always_partition(const std::vector<VarOrRVar> &vars) {
for (const auto &v : vars) {
partition(v, Partition::Always);
}
return *this;
}

Stage &Stage::always_partition_all() {
definition.schedule().touched() = true;
vector<Dim> &dims = definition.schedule().dims();
for (auto &dim : dims) {
dim.partition_policy = Partition::Always;
}
return *this;
}

Stage &Stage::tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xo, const VarOrRVar &yo,
const VarOrRVar &xi, const VarOrRVar &yi,
@@ -2342,6 +2374,30 @@ Func &Func::partition(const VarOrRVar &var, Partition policy) {
return *this;
}

Func &Func::never_partition(const std::vector<VarOrRVar> &vars) {
invalidate_cache();
Stage(func, func.definition(), 0).never_partition(vars);
return *this;
}

Func &Func::never_partition_all() {
invalidate_cache();
Stage(func, func.definition(), 0).never_partition_all();
return *this;
}

Func &Func::always_partition(const std::vector<VarOrRVar> &vars) {
invalidate_cache();
Stage(func, func.definition(), 0).always_partition(vars);
return *this;
}

Func &Func::always_partition_all() {
invalidate_cache();
Stage(func, func.definition(), 0).always_partition_all();
return *this;
}

Func &Func::bound(const Var &var, Expr min, Expr extent) {
user_assert(!min.defined() || Int(32).can_represent(min.type())) << "Can't represent min bound in int32\n";
user_assert(extent.defined()) << "Extent bound of a Func can't be undefined\n";
53 changes: 53 additions & 0 deletions src/Func.h
@@ -349,6 +349,11 @@ class Stage {
Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
Stage &partition(const VarOrRVar &var, Partition partition_policy);
Stage &never_partition_all();
Stage &never_partition(const std::vector<VarOrRVar> &vars);
Stage &always_partition_all();
Stage &always_partition(const std::vector<VarOrRVar> &vars);

Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xo, const VarOrRVar &yo,
const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
@@ -380,6 +385,20 @@ class Stage {
return reorder(collected_args);
}

template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
never_partition(const VarOrRVar &x, Args &&...args) {
std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
return never_partition(collected_args);
}

template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
always_partition(const VarOrRVar &x, Args &&...args) {
std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
return always_partition(collected_args);
}

Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
Stage specialize(const Expr &condition);
void specialize_fail(const std::string &message);
@@ -1450,6 +1469,40 @@ class Func {
* The default policy is Auto. */
Func &partition(const VarOrRVar &var, Partition partition_policy);

/** Set the loop partition policy to Never for a vector of Vars and
* RVars. */
Func &never_partition(const std::vector<VarOrRVar> &vars);

/** Set the loop partition policy to Never for some number of Vars and RVars. */
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
never_partition(const VarOrRVar &x, Args &&...args) {
std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
return never_partition(collected_args);
}

/** Set the loop partition policy to Never for all Vars and RVars of the
* initial definition of the Func. It must be called separately on any
* update definitions. */
Func &never_partition_all();

/** Set the loop partition policy to Always for a vector of Vars and
* RVars. */
Func &always_partition(const std::vector<VarOrRVar> &vars);

/** Set the loop partition policy to Always for some number of Vars and RVars. */
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
always_partition(const VarOrRVar &x, Args &&...args) {
std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
return always_partition(collected_args);
}

/** Set the loop partition policy to Always for all Vars and RVars of the
* initial definition of the Func. It must be called separately on any
* update definitions. */
Func &always_partition_all();

/** Statically declare that the range over which a function should
* be evaluated is given by the second and third arguments. This
* can let Halide perform some optimizations. E.g. if you know
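A minimal sketch of how the new directives compose in a schedule (hypothetical pipeline, not code from this PR). As the doc comments above note, the *_all variants apply to the stage they are called on, so update definitions need their own calls:

```cpp
#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), y("y");
    RDom r(0, 16, "r");

    f(x, y) = x + y;  // pure definition
    f(x, y) += r;     // update definition (stage 0 of the updates)

    // Pure definition: disable loop partitioning on every dimension.
    f.vectorize(x, 8).never_partition_all();

    // The update is a separate Stage and needs its own directive; the
    // variadic form accepts any number of Vars/RVars.
    f.update(0).never_partition(x, y);

    // always_partition / always_partition_all force partitioning instead,
    // and raise an error if Halide cannot find a way to partition the loop.

    f.realize({128, 128});
    return 0;
}
```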
1 change: 1 addition & 0 deletions src/Generator.h
@@ -3052,6 +3052,7 @@ class NamesInterface {
using LoopLevel = Halide::LoopLevel;
using MemoryType = Halide::MemoryType;
using NameMangling = Halide::NameMangling;
using Partition = Halide::Partition;
using Pipeline = Halide::Pipeline;
using PrefetchBoundStrategy = Halide::PrefetchBoundStrategy;
using RDom = Halide::RDom;
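The NamesInterface addition simply lets generator code name the enum without the Halide:: prefix. A hypothetical minimal generator illustrating this (not part of this PR):

```cpp
#include "Halide.h"

class Blur1D : public Halide::Generator<Blur1D> {
public:
    Input<Buffer<float, 1>> input{"input"};
    Output<Buffer<float, 1>> output{"output"};

    void generate() {
        Var x("x");
        Func clamped = Halide::BoundaryConditions::repeat_edge(input);
        Func blur("blur");
        blur(x) = (clamped(x - 1) + clamped(x) + clamped(x + 1)) / 3.0f;
        output(x) = blur(x);

        // `Partition` resolves here without qualification thanks to the
        // NamesInterface alias added above.
        blur.compute_root()
            .vectorize(x, 8)
            .partition(x, Partition::Never);
    }
};

HALIDE_REGISTER_GENERATOR(Blur1D, blur1d)
```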
5 changes: 3 additions & 2 deletions src/LoopPartitioningDirective.h
@@ -20,8 +20,9 @@ enum class Partition {
/** Disallow loop partitioning. */
Never,

/** Force partitioning of the loop. If Halide can't find a way to partition this loop,
* it will raise an error. */
/** Force partitioning of the loop, even in the tail cases of outer
* partitioned loops. If Halide can't find a way to partition this loop, it
* will raise an error. */
Always
};

14 changes: 12 additions & 2 deletions src/PartitionLoops.cpp
@@ -517,10 +517,13 @@ class PartitionLoops : public IRMutator {
using IRMutator::visit;

bool in_gpu_loop = false;
bool in_tail = false;

Stmt visit(const For *op) override {
// Do not partition if the schedule explicitly forbids.
if (op->partition_policy == Partition::Never) {
// Do not partition if the schedule explicitly forbids, or if it's set
// to automatic and we're in a loop tail.
if (op->partition_policy == Partition::Never ||
(op->partition_policy == Partition::Auto && in_tail)) {
return IRMutator::visit(op);
}

@@ -687,6 +690,13 @@ class PartitionLoops : public IRMutator {
// Recurse on the middle section.
simpler_body = mutate(simpler_body);

// Recurse on the prologue and epilogue, just for loops set to Partition::Always
{
ScopedValue<bool> s(in_tail, true);
epilogue = mutate(epilogue);
prologue = mutate(prologue);
}

// Construct variables for the bounds of the simplified middle section
Expr min_steady = op->min, max_steady = op->extent + op->min;
Expr prologue_val, epilogue_val;
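For context on the control flow: PartitionLoops splits a partitionable loop into a prologue, a simplified steady-state middle, and an epilogue. The hunk above adds recursion into the prologue and epilogue (with in_tail set) alongside the existing recursion into the middle section; together with the visit(For) change, nested loops left at Partition::Auto stay unpartitioned inside those tails, while Partition::Always loops are still split there. A rough structural sketch in plain C++ (not Halide IR; all names illustrative):

```cpp
// Hypothetical illustration of the three-way split produced by loop
// partitioning for a loop over [min, min + extent). The loop bodies stand in
// for the real Halide IR.
void partitioned_loop(int min, int extent, int min_steady, int max_steady) {
    for (int x = min; x < min_steady; x++) {
        // Prologue (tail): boundary checks kept. Nested loops with the Auto
        // policy are not partitioned here; only Partition::Always loops are.
    }
    for (int x = min_steady; x < max_steady; x++) {
        // Steady state: simplified body; nested loops are partitioned as
        // usual (this recursion existed before the change).
    }
    for (int x = max_steady; x < min + extent; x++) {
        // Epilogue (tail): same policy as the prologue.
    }
}
```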