-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve code size and compile time for local laplacian app #7927
Merged
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
5c275b2
Improve code size and compile time for local laplacian app
abadams 71d92b9
Remove the partition in y on gray
abadams 1baac8a
Also update GPU schedule
abadams 49e5e9e
Selectively disable loop partitioning in interpolate app
abadams 6411c2b
Add some syntactic sugar
abadams 933098c
Tweak code to better show intent
abadams 5a2787e
Make Partition::Always partition even in outer loop tails.
abadams 729248c
Update comment
abadams e8f1dbe
Add a little picture of the 9 zones
abadams File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
include ../support/Makefile.inc | ||
|
||
.PHONY: build clean test | ||
.SECONDARY: | ||
|
||
build: $(BIN)/$(HL_TARGET)/filter | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
include ../support/Makefile.inc | ||
|
||
.PHONY: build clean test | ||
.SECONDARY: | ||
|
||
build: $(BIN)/$(HL_TARGET)/process | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,10 +81,10 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> { | |
// Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input) | ||
Func color; | ||
float eps = 0.01f; | ||
color(x, y, c) = outGPyramid[0](x, y) * (floating(x, y, c) + eps) / (gray(x, y) + eps); | ||
color(x, y, c) = input(x, y, c) * (outGPyramid[0](x, y) + eps) / (gray(x, y) + eps); | ||
|
||
// Convert back to 16-bit | ||
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f); | ||
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 65535.0f)); | ||
|
||
/* ESTIMATES */ | ||
// (This can be useful in conjunction with RunGen and benchmarks as well | ||
|
@@ -102,21 +102,36 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> { | |
// Nothing. | ||
} else if (get_target().has_gpu_feature()) { | ||
// GPU schedule. | ||
// 3.19ms on an RTX 2060. | ||
// 2.9ms on an RTX 2060. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Interesting. If loop partitioning did not impact performance, than what did in this PR? Newer LLVM? Newer CUDA driver? Or the non-bit-exact change from above? |
||
|
||
// All loop partitioning disabled, which has no effect on runtime, | ||
// but saves 15% compile time and 45% ptx shader code size. | ||
remap.compute_root(); | ||
Var xi, yi; | ||
output.compute_root().gpu_tile(x, y, xi, yi, 16, 8); | ||
output.compute_root() | ||
.never_partition_all() | ||
.gpu_tile(x, y, xi, yi, 16, 8); | ||
for (int j = 0; j < J; j++) { | ||
int blockw = 16, blockh = 8; | ||
if (j > 3) { | ||
blockw = 2; | ||
blockh = 2; | ||
} | ||
if (j > 0) { | ||
inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); | ||
gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh); | ||
inGPyramid[j] | ||
.compute_root() | ||
.never_partition_all() | ||
.gpu_tile(x, y, xi, yi, blockw, blockh); | ||
gPyramid[j] | ||
.compute_root() | ||
.reorder(k, x, y) | ||
.never_partition_all() | ||
.gpu_tile(x, y, xi, yi, blockw, blockh); | ||
} | ||
outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); | ||
outGPyramid[j] | ||
.compute_root() | ||
.never_partition_all() | ||
.gpu_tile(x, y, xi, yi, blockw, blockh); | ||
} | ||
} else { | ||
// CPU schedule. | ||
|
@@ -131,8 +146,16 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> { | |
|
||
remap.compute_root(); | ||
Var yo; | ||
output.reorder(c, x, y).split(y, yo, y, 64).parallel(yo).vectorize(x, 8); | ||
gray.compute_root().parallel(y, 32).vectorize(x, 8); | ||
output | ||
.reorder(c, x, y) | ||
.split(y, yo, y, 64) | ||
.parallel(yo) | ||
.vectorize(x, 8); | ||
gray | ||
.compute_root() | ||
.never_partition(y) | ||
.parallel(y, 32) | ||
.vectorize(x, 8); | ||
for (int j = 1; j < 5; j++) { | ||
inGPyramid[j] | ||
.compute_root() | ||
|
@@ -148,12 +171,19 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> { | |
.store_at(output, yo) | ||
.compute_at(output, y) | ||
.fold_storage(y, 4) | ||
.vectorize(x, 8); | ||
.vectorize(x, 8, TailStrategy::RoundUp); | ||
if (j > 1) { | ||
// Turn off loop partitioning at higher pyramid levels. This | ||
// shaves about 3% off code size and compile time without | ||
// affecting performance. | ||
inGPyramid[j].never_partition_all(); | ||
gPyramid[j].never_partition_all(); | ||
} | ||
} | ||
outGPyramid[0] | ||
.compute_at(output, y) | ||
.hoist_storage(output, yo) | ||
.vectorize(x, 8); | ||
.vectorize(x, 8, TailStrategy::RoundUp); | ||
for (int j = 5; j < J; j++) { | ||
inGPyramid[j].compute_root(); | ||
gPyramid[j].compute_root().parallel(k); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not a bit-exact change? I guess it's very close and doesn't really matter (especially if it helps to avoid the boundary condition), just wanted to double-check.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this is not bit exact, but it made more sense to me with the change to the scaling of this term. Before it took the ratio of the input color channel to the input grayscale image, and applied that ratio to the output grayscale. Now it computes the ratio of the output grayscale to the input grayscale, and applies that as a scaling factor to the input.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(The only difference is which term in the numerator gets a +eps)