Skip to content

Commit

Permalink
AArch64: Trim Armv8.0 Neon path of 6-tap and 8-tap MC functions
Browse files Browse the repository at this point in the history
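After the lane load/store patch (ec5c305), there are some instruction
sequences we can merge: each `add` of #2 to the filter pointer followed
by an `ldr` from it becomes a single `ldur` with a #2 immediate offset.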

This change simplifies the loading of filter weights, saving 288
bytes in the Armv8.0 Neon path of the 6-tap and 8-tap MC functions.
  • Loading branch information
Arpad Panyik authored and mstorsjo committed Sep 12, 2024
1 parent f4a0d7c commit 82e9155
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 44 deletions.
33 changes: 11 additions & 22 deletions src/arm/64/mc.S
Original file line number Diff line number Diff line change
Expand Up @@ -1475,8 +1475,7 @@ L(\type\()_\taps\()_h):
20: // 2xN h
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
sub \src, \src, #1
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -1509,8 +1508,7 @@ L(\type\()_\taps\()_h):

40: // 4xN h
AARCH64_VALID_JUMP_TARGET
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
sub \src, \src, #1
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -1741,8 +1739,7 @@ function L(\type\()_\taps\()_v)
b.gt 28f

cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -1821,8 +1818,7 @@ function L(\type\()_\taps\()_v)

// 4x2, 4x4 v
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -1897,8 +1893,7 @@ function L(\type\()_\taps\()_v)

// 8x2, 8x4 v
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -1996,8 +1991,7 @@ function L(\type\()_\taps\()_v)
b.gt 1680b

// 16x2, 16x4 v
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -2064,11 +2058,9 @@ function L(\type\()_\taps\()_hv)
20:
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
b.gt 280f
add \xmy, \xmy, #2
ldr s1, [\xmy]
ldur s1, [\xmy, #2]

// 2x2, 2x4 hv
sub \sr2, \src, #1
Expand Down Expand Up @@ -2202,11 +2194,9 @@ L(\type\()_\taps\()_filter_2):

40:
AARCH64_VALID_JUMP_TARGET
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
b.gt 480f
add \xmy, \xmy, #2
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
sub \sr2, \src, #1
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
Expand Down Expand Up @@ -2396,9 +2386,8 @@ L(\type\()_\taps\()_filter_4):
320:
AARCH64_VALID_JUMP_TARGET
b.gt 880f
add \xmy, \xmy, #2
ld1 {v0.8b}, [\xmx]
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
.ifc \taps, 6tap
sub \src, \src, #2
.else
Expand Down
33 changes: 11 additions & 22 deletions src/arm/64/mc16.S
Original file line number Diff line number Diff line change
Expand Up @@ -1618,8 +1618,7 @@ L(\type\()_\taps\()_h):
20: // 2xN h
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
sub \src, \src, #2
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -1651,8 +1650,7 @@ L(\type\()_\taps\()_h):

40: // 4xN h
AARCH64_VALID_JUMP_TARGET
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
sub \src, \src, #2
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -1859,8 +1857,7 @@ function L(\type\()_\taps\()_v)
b.gt 28f

cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -1937,8 +1934,7 @@ function L(\type\()_\taps\()_v)

// 4x2, 4x4 v
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -2002,8 +1998,7 @@ function L(\type\()_\taps\()_v)

// 8x2, 8x4 v
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
Expand Down Expand Up @@ -2091,8 +2086,7 @@ function L(\type\()_\taps\()_v)
b.gt 1680b

// 16x2, 16x4 v
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
sxtl v0.8h, v0.8b

Expand Down Expand Up @@ -2154,11 +2148,9 @@ function L(\type\()_\taps\()_hv)
20:
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
b.gt 280f
add \xmy, \xmy, #2
ldr s1, [\xmy]
ldur s1, [\xmy, #2]

// 2x2, 2x4 hv
sub \sr2, \src, #2
Expand Down Expand Up @@ -2301,11 +2293,9 @@ L(\type\()_\taps\()_filter_2):

40:
AARCH64_VALID_JUMP_TARGET
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
b.gt 480f
add \xmy, \xmy, #2
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
sub \sr2, \src, #2
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
Expand Down Expand Up @@ -2501,9 +2491,8 @@ L(\type\()_\taps\()_filter_4):
320:
AARCH64_VALID_JUMP_TARGET
b.gt 880f
add \xmy, \xmy, #2
ld1 {v0.8b}, [\xmx]
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
.ifc \taps, 6tap
sub \src, \src, #4
.else
Expand Down

0 comments on commit 82e9155

Please sign in to comment.