Unify register assignment in NEON assembly for get_sad #2607

Merged
merged 1 commit on Dec 4, 2023
139 changes: 61 additions & 78 deletions src/arm/64/sad.S
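For context: every kernel in this file computes a sum of absolute differences (SAD) over a width x height block of 8-bit samples. A minimal scalar sketch of that computation follows; the function name, slice-based signature, and stride handling are illustrative assumptions, not rav1e's actual get_sad interface.

// Scalar reference for the SAD the NEON kernels below compute. Illustrative
// only: rav1e's real get_sad operates on plane regions, not raw slices.
fn sad_scalar(src: &[u8], src_stride: usize,
              dst: &[u8], dst_stride: usize,
              width: usize, height: usize) -> u32 {
    let mut sum = 0u32;
    for row in 0..height {
        let a = &src[row * src_stride..][..width];
        let b = &dst[row * dst_stride..][..width];
        for (&x, &y) in a.iter().zip(b) {
            // u8::abs_diff mirrors the uabd/uabal absolute-difference step.
            sum += u32::from(x.abs_diff(y));
        }
    }
    sum
}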
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
* Copyright (c) 2020-2023, The rav1e contributors. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -14,23 +15,12 @@

.macro sad_rect width, height
function sad\width\()x\height\()_neon, export=1
.if \width == 128
movi v3.4s, #0
.else
movi v0.4s, #0
.endif
sxtw x1, w1
.if \width == 128
movi v18.4s, #0
.endif
sxtw x3, w3
mov w4, \height
.if \width == 128
mov v2.16b, v3.16b
.elseif \width >= 32
.if \width >= 16
mov v1.16b, v0.16b
.elseif \width == 16
mov v3.16b, v0.16b
.endif
b L(sad_w\width\())
endfunc
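Note on the unified scheme: the sad_rect prologue no longer special-cases 128-wide blocks. Every size zeroes v0, and widths of 16 and above copy it into v1, so the same v0/v1 accumulator pair is reused by all the loop bodies below; the former 128-only setup of v2, v3, and v18 goes away.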
@@ -42,12 +32,12 @@ function sad4x4_neon, export=1
sxtw x3, w3
mov w4, #4
L(sad_w4):
ldr d1, [x0]
ldr d2, [x2]
ldr s2, [x0]
ldr s3, [x2]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v0.8h, v1.8b, v2.8b
uabal v0.8h, v2.8b, v3.8b
bne L(sad_w4)
uaddlp v0.2s, v0.4h
uaddlp v0.1d, v0.2s
@@ -86,25 +76,21 @@ function sad64x64_neon, export=1
mov w4, #64
mov v1.16b, v0.16b
L(sad_w64):
ldr q16, [x0]
ldr q17, [x2]
ldr q6, [x0, #16]
ldr q7, [x2, #16]
ldr q4, [x0, #32]
ldr q5, [x2, #32]
ldr q2, [x0, #48]
ldr q3, [x2, #48]
ldp q2, q4, [x0]
ldp q3, q5, [x2]
ldp q6, q16, [x0, #32]
ldp q7, q17, [x2, #32]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v0.8h, v16.8b, v17.8b
uabal2 v1.8h, v16.16b, v17.16b
uabal v0.8h, v6.8b, v7.8b
uabal2 v1.8h, v6.16b, v7.16b
uabal v0.8h, v4.8b, v5.8b
uabal2 v1.8h, v4.16b, v5.16b
uabal v0.8h, v2.8b, v3.8b
uabal2 v1.8h, v2.16b, v3.16b
uabal v0.8h, v4.8b, v5.8b
uabal2 v1.8h, v4.16b, v5.16b
uabal v0.8h, v6.8b, v7.8b
uabal2 v1.8h, v6.16b, v7.16b
uabal v0.8h, v16.8b, v17.8b
uabal2 v1.8h, v16.16b, v17.16b
bne L(sad_w64)
horizontal_long_add_16x8
endfunc
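In the 64-wide loop, the eight single-register ldr q loads become four ldp pairs, so each iteration still reads 64 bytes from each row but with half as many load instructions, and the uabal/uabal2 chain now walks the registers in ascending order (v2/v3, v4/v5, v6/v7, v16/v17).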
@@ -114,48 +100,47 @@ sad_rect 64, 32
sad_rect 64, 128

function sad128x128_neon, export=1
movi v3.4s, #0
movi v0.4s, #0
sxtw x1, w1
movi v18.4s, #0
sxtw x3, w3
mov w4, #128
mov v2.16b, v3.16b
mov v1.16b, v0.16b
L(sad_w128):
ldp q0, q25, [x0]
ldp q28, q26, [x2]
ldp q23, q21, [x0, #32]
ldp q24, q22, [x2, #32]
ldp q19, q16, [x0, #64]
ldp q20, q17, [x2, #64]
ldp q6, q4, [x0, #96]
ldp q7, q5, [x2, #96]
ldp q2, q4, [x0]
ldp q3, q5, [x2]
ldp q6, q16, [x0, #32]
ldp q7, q17, [x2, #32]
ldp q18, q20, [x0, #64]
ldp q19, q21, [x2, #64]
ldp q22, q24, [x0, #96]
ldp q23, q25, [x2, #96]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabdl v18.8h, v0.8b, v28.8b
uabal2 v18.8h, v0.16b, v28.16b
uabal v18.8h, v25.8b, v26.8b
uabal2 v18.8h, v25.16b, v26.16b
uabal v18.8h, v23.8b, v24.8b
uabal2 v18.8h, v23.16b, v24.16b
uabal v18.8h, v21.8b, v22.8b
uabal2 v18.8h, v21.16b, v22.16b
uabal v18.8h, v19.8b, v20.8b
uabal2 v18.8h, v19.16b, v20.16b
uabal v18.8h, v16.8b, v17.8b
uabal2 v18.8h, v16.16b, v17.16b
uabal v18.8h, v6.8b, v7.8b
uabal2 v18.8h, v6.16b, v7.16b
uabal v18.8h, v4.8b, v5.8b
uabal2 v18.8h, v4.16b, v5.16b
uaddw v3.4s, v3.4s, v18.4h
uaddw2 v2.4s, v2.4s, v18.8h
uabdl v26.8h, v2.8b, v3.8b
uabal2 v26.8h, v2.16b, v3.16b
uabal v26.8h, v4.8b, v5.8b
uabal2 v26.8h, v4.16b, v5.16b
uabal v26.8h, v6.8b, v7.8b
uabal2 v26.8h, v6.16b, v7.16b
uabal v26.8h, v16.8b, v17.8b
uabal2 v26.8h, v16.16b, v17.16b
uabal v26.8h, v18.8b, v19.8b
uabal2 v26.8h, v18.16b, v19.16b
uabal v26.8h, v20.8b, v21.8b
uabal2 v26.8h, v20.16b, v21.16b
uabal v26.8h, v22.8b, v23.8b
uabal2 v26.8h, v22.16b, v23.16b
uabal v26.8h, v24.8b, v25.8b
uabal2 v26.8h, v24.16b, v25.16b
uaddw v1.4s, v1.4s, v26.4h
uaddw2 v0.4s, v0.4s, v26.8h
bne L(sad_w128)
add v2.4s, v2.4s, v3.4s
uaddlp v2.2d, v2.4s
dup d0, v2.d[1]
add v2.2s, v0.2s, v2.2s
umov w0, v2.s[0]
add v0.4s, v0.4s, v1.4s
uaddlp v0.2d, v0.4s
dup d3, v0.d[1]
add v0.2s, v0.2s, v3.2s
umov w0, v0.s[0]
ret
endfunc
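The 128-wide kernel keeps its two-stage accumulation, now on renumbered registers: each row's absolute differences are gathered into the 16-bit lanes of v26 (at most 16 x 255 = 4080 per lane per row, so a row cannot overflow), and the row partials are then widened into the 32-bit accumulators v0/v1 with uaddw/uaddw2, because the full block's SAD can reach 128 x 128 x 255 = 4,177,920 and would not fit in 16-bit lanes. The narrower kernels stay within 16-bit accumulators for the whole block and skip this widening step.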

@@ -168,17 +153,15 @@ function sad32x32_neon, export=1
mov w4, #32
mov v1.16b, v0.16b
L(sad_w32):
ldr q4, [x0]
ldr q5, [x2]
ldr q2, [x0, #16]
ldr q3, [x2, #16]
ldp q2, q4, [x0]
ldp q3, q5, [x2]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v1.8h, v4.8b, v5.8b
uabal2 v0.8h, v4.16b, v5.16b
uabal v1.8h, v2.8b, v3.8b
uabal2 v0.8h, v2.16b, v3.16b
uabal v1.8h, v4.8b, v5.8b
uabal2 v0.8h, v4.16b, v5.16b
bne L(sad_w32)
add v0.8h, v0.8h, v1.8h
horizontal_add_16x8
@@ -193,17 +176,17 @@ function sad16x16_neon, export=1
sxtw x1, w1
sxtw x3, w3
mov w4, #16
mov v3.16b, v0.16b
mov v1.16b, v0.16b
L(sad_w16):
ldr q1, [x0]
ldr q2, [x2]
ldr q2, [x0]
ldr q3, [x2]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v0.8h, v1.8b, v2.8b
uabal2 v3.8h, v1.16b, v2.16b
uabal v0.8h, v2.8b, v3.8b
uabal2 v1.8h, v2.16b, v3.16b
bne L(sad_w16)
add v0.8h, v0.8h, v3.8h
add v0.8h, v0.8h, v1.8h
horizontal_add_16x8
endfunc

@@ -218,12 +201,12 @@ function sad8x8_neon, export=1
sxtw x3, w3
mov w4, #8
L(sad_w8):
ldr d1, [x0]
ldr d2, [x2]
ldr d2, [x0]
ldr d3, [x2]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v0.8h, v1.8b, v2.8b
uabal v0.8h, v2.8b, v3.8b
bne L(sad_w8)
horizontal_add_16x8
endfunc