| /* |
| * Copyright © 2021, VideoLAN and dav1d authors |
| * Copyright © 2021, Martin Storsjo |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, this |
| * list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "src/arm/asm.S" |
| #include "util.S" |
| #include "src/arm/asm-offsets.h" |
| |
| #define GRAIN_WIDTH 82 |
| #define GRAIN_HEIGHT 73 |
| |
| #define SUB_GRAIN_WIDTH 44 |
| #define SUB_GRAIN_HEIGHT 38 |
| |
| // Advance the 16-bit LFSR seed state held in w2 by \steps bits, as in
| // AV1 grain synthesis: new bits come from taps 0, 1, 3 and 12 of the
| // state. With shift=1 (default) the state is also shifted down by
| // \steps; with shift=0 the fresh bits are parked above bit 16 and the
| // caller performs the downshift itself later. Clobbers w11-w13.
| .macro increment_seed steps, shift=1 |
| lsr w11, w2, #3 |
| lsr w12, w2, #12 |
| lsr w13, w2, #1 |
| eor w11, w2, w11 // (r >> 0) ^ (r >> 3) |
| eor w12, w12, w13 // (r >> 12) ^ (r >> 1) |
| eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) |
| .if \shift |
| lsr w2, w2, #\steps |
| .endif |
| and w11, w11, #((1 << \steps) - 1) // bit |
| .if \shift |
| orr w2, w2, w11, lsl #(16 - \steps) // *state |
| .else |
| orr w2, w2, w11, lsl #16 // *state |
| .endif |
| .endm |
| |
| // Extract a \bits-bit random value from the seed state in x2 without
| // modifying it; \age is how many of the bits produced by the preceding
| // increment_seed have already been consumed by earlier reads.
| .macro read_rand dest, bits, age |
| ubfx \dest, x2, #16 - \bits - \age, #\bits |
| .endm |
| |
| // Extract a \bits-bit random value from the seed state in x2 and
| // consume one bit of state (shift the state down by 1).
| .macro read_shift_rand dest, bits |
| ubfx \dest, x2, #17 - \bits, #\bits |
| lsr w2, w2, #1 |
| .endm |
| |
| // special calling convention: |
| // w2 holds seed |
| // x3 holds dav1d_gaussian_sequence |
| // clobbers x11-x15 |
| // returns in v0.8h |
| // Fills v0.8h with 8 samples from the gaussian table: two 4-bit seed
| // increments, each yielding four 11-bit table indices. Table entries
| // are 16 bits wide, hence the "lsl #1" when forming addresses.
| function get_gaussian_neon |
| increment_seed 4 |
| read_rand x14, 11, 3 |
| read_rand x15, 11, 2 |
| add x14, x3, x14, lsl #1 |
| add x15, x3, x15, lsl #1 |
| ld1 {v0.h}[0], [x14] |
| read_rand x14, 11, 1 |
| ld1 {v0.h}[1], [x15] |
| add x14, x3, x14, lsl #1 |
| read_rand x15, 11, 0 |
| increment_seed 4 |
| add x15, x3, x15, lsl #1 |
| ld1 {v0.h}[2], [x14] |
| read_rand x14, 11, 3 |
| ld1 {v0.h}[3], [x15] |
| add x14, x3, x14, lsl #1 |
| read_rand x15, 11, 2 |
| ld1 {v0.h}[4], [x14] |
| add x15, x3, x15, lsl #1 |
| read_rand x14, 11, 1 |
| ld1 {v0.h}[5], [x15] |
| read_rand x15, 11, 0 |
| add x14, x3, x14, lsl #1 |
| add x15, x3, x15, lsl #1 |
| ld1 {v0.h}[6], [x14] |
| ld1 {v0.h}[7], [x15] |
| ret |
| endfunc |
| |
| // Store a grain row from six vector registers: four full 16-byte
| // stores, one 16-byte store and a trailing 16-bit element, advancing
| // x0 past all of it. NOTE(review): not referenced in this chunk —
| // presumably used by code outside the visible range.
| .macro store_grain_row r0, r1, r2, r3, r4, r5 |
| st1 {\r0\().16b,\r1\().16b}, [x0], #32 |
| st1 {\r2\().16b,\r3\().16b}, [x0], #32 |
| st1 {\r4\().16b}, [x0], #16 |
| st1 {\r5\().h}[0], [x0], #2 |
| .endm |
| |
| // Produce 2 grain values in v0.h[0..1]: two 11-bit gaussian table
| // lookups, then srshl by v31 (which holds a negated shift amount, so
| // this is a rounding right shift by the grain scale shift).
| // Same register conventions as get_gaussian_neon.
| function get_grain_2_neon |
| increment_seed 2 |
| read_rand x14, 11, 1 |
| read_rand x15, 11, 0 |
| add x14, x3, x14, lsl #1 |
| add x15, x3, x15, lsl #1 |
| ld1 {v0.h}[0], [x14] |
| ld1 {v0.h}[1], [x15] |
| srshl v0.4h, v0.4h, v31.4h |
| ret |
| endfunc |
| |
| // Call get_grain_2_neon and move the result into \dst (skipping the
| // move when \dst already is v0).
| .macro get_grain_2 dst |
| bl get_grain_2_neon |
| .ifnc \dst, v0 |
| mov \dst\().8b, v0.8b |
| .endif |
| .endm |
| |
| // Produce 4 grain values in v0.4h: four 11-bit gaussian table lookups
| // followed by a rounding right shift via the negated shift in v31.
| // Same register conventions as get_gaussian_neon.
| function get_grain_4_neon |
| increment_seed 4 |
| read_rand x14, 11, 3 |
| read_rand x15, 11, 2 |
| add x14, x3, x14, lsl #1 |
| add x15, x3, x15, lsl #1 |
| ld1 {v0.h}[0], [x14] |
| read_rand x14, 11, 1 |
| ld1 {v0.h}[1], [x15] |
| add x14, x3, x14, lsl #1 |
| read_rand x15, 11, 0 |
| add x15, x3, x15, lsl #1 |
| ld1 {v0.h}[2], [x14] |
| ld1 {v0.h}[3], [x15] |
| srshl v0.4h, v0.4h, v31.4h |
| ret |
| endfunc |
| |
| // Call get_grain_4_neon and move the result into \dst (skipping the
| // move when \dst already is v0).
| .macro get_grain_4 dst |
| bl get_grain_4_neon |
| .ifnc \dst, v0 |
| mov \dst\().8b, v0.8b |
| .endif |
| .endm |
| |
| // w15 holds the number of entries to produce |
| // w14, w16 and w17 hold the previous output entries |
| // v0 holds the vector of produced entries |
| // v1 holds the input vector of sums from above |
| // Emits output_lag{1,2,3}_neon: serially computes w15 AR-filtered grain
| // values. Each iteration adds coeff * previous-output terms (coeffs in
| // w4, and w20/w21 for lag3) to the precomputed above-row sum from
| // v1.s[0], rounds by ar_coeff_shift, adds a fresh scaled gaussian
| // sample, clamps to [w6, w5], and shifts the result into v0.h[7].
| .macro output_lag n |
| function output_lag\n\()_neon |
| 1: |
| read_shift_rand x13, 11 |
| mov w11, v1.s[0] |
| ldrsh w12, [x3, x13, lsl #1] |
| ext v0.16b, v0.16b, v0.16b, #2 |
| .if \n == 1 |
| madd w11, w14, w4, w11 // sum (above) + *coeff * prev output |
| .elseif \n == 2 |
| madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 |
| madd w11, w14, w17, w11 // += *coeff * prev output 2 |
| mov w16, w14 |
| .else |
| madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 |
| madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 |
| madd w11, w14, w21, w11 // += *coeff * prev output 3 |
| mov w17, w16 |
| mov w16, w14 |
| .endif |
| add w14, w11, w8 // 1 << (ar_coeff_shift - 1) |
| add w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) |
| asr w14, w14, w7 // >> ar_coeff_shift |
| asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift) |
| add w14, w14, w12 |
| cmp w14, w5 |
| csel w14, w14, w5, le // clamp to grain_max (w5) |
| cmp w14, w6 |
| csel w14, w14, w6, ge // clamp to grain_min (w6) |
| subs w15, w15, #1 |
| ext v1.16b, v1.16b, v1.16b, #4 |
| ins v0.h[7], w14 |
| b.gt 1b |
| ret |
| endfunc |
| .endm |
| |
| // Instantiate output_lag1_neon, output_lag2_neon and output_lag3_neon. |
| output_lag 1 |
| output_lag 2 |
| output_lag 3 |
| |
| |
| // Compute the lag1 above-row contributions for 8 output positions:
| // v4/v5 (.4s) = coeff(v27)*top-left + coeff(v28)*top + coeff(v29)*
| // top-right, with the top row kept as a sliding window in v16/v17.
| // Loads the next top block from [x0 - GRAIN_WIDTH*2 + 16] and slides
| // the window forward for the next call.
| function sum_lag1_above_neon |
| sub x12, x0, #1*GRAIN_WIDTH*2 - 16 |
| ld1 {v18.8h}, [x12] // load top right |
| |
| ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid |
| ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right |
| |
| smull v4.4s, v17.4h, v28.4h |
| smlal v4.4s, v0.4h, v27.4h |
| smlal v4.4s, v1.4h, v29.4h |
| smull2 v5.4s, v17.8h, v28.8h |
| smlal2 v5.4s, v0.8h, v27.8h |
| smlal2 v5.4s, v1.8h, v29.8h |
| |
| mov v16.16b, v17.16b |
| mov v17.16b, v18.16b |
| |
| ret |
| endfunc |
| |
| // Shared tail for all sum_lag{1,2,3}_{type}_{edge} functions. |
| // \lag selects which sum_*_above/output_* helpers to use, \type is |
| // y/uv_444/uv_422/uv_420, \edge is left/mid/right, \elems is the |
| // number of real output lanes (8, 7 or 1). For subsampled chroma the |
| // collocated luma rows at x19 are averaged down first; \uv_coeff (or |
| // v30 when \uv_coeff is empty) scales the luma contribution. Since |
| // several type/elem combinations share the same continuation, some |
| // variants just branch into another variant's *_start label. |
| .macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff |
| bl sum_\lag\()_above_neon |
| .ifc \type, uv_420 |
| add x12, x19, #GRAIN_WIDTH*2 |
| ld1 {v22.8h, v23.8h}, [x19], #32 |
| ld1 {v24.8h, v25.8h}, [x12] |
| addp v22.8h, v22.8h, v23.8h |
| addp v23.8h, v24.8h, v25.8h |
| add v22.8h, v22.8h, v23.8h |
| srshr v0.8h, v22.8h, #2 // average 2x2 luma to one chroma position |
| .endif |
| .ifc \type, uv_422 |
| ld1 {v22.8h, v23.8h}, [x19], #32 |
| addp v22.8h, v22.8h, v23.8h |
| srshr v0.8h, v22.8h, #1 // average 2x1 luma to one chroma position |
| .endif |
| .ifc \type, uv_444 |
| ld1 {v0.8h}, [x19], #16 |
| .endif |
| .if \uv_layout |
| .ifnb \uv_coeff |
| dup v1.8b, \uv_coeff |
| sxtl v1.8h, v1.8b |
| smlal v4.4s, v0.4h, v1.4h |
| smlal2 v5.4s, v0.8h, v1.8h |
| .else |
| smlal v4.4s, v0.4h, v30.4h |
| smlal2 v5.4s, v0.8h, v30.8h |
| .endif |
| .endif |
| .if \uv_layout && \elems == 8 |
| b sum_\lag\()_y_\edge\()_start |
| .elseif \uv_layout == 444 && \elems == 7 |
| b sum_\lag\()_y_\edge\()_start |
| .elseif \uv_layout == 422 && \elems == 1 |
| b sum_\lag\()_uv_420_\edge\()_start |
| .else |
| sum_\lag\()_\type\()_\edge\()_start: |
| .if \elems > 4 |
| .ifc \edge, left |
| // Left edge: only 1 output in the first batch (the leftmost column
| // has no left neighbours), then two full batches of 4.
| increment_seed 4 |
| read_rand x12, 11, 3 |
| read_rand x13, 11, 2 |
| read_rand x14, 11, 1 |
| add x12, x3, x12, lsl #1 |
| add x13, x3, x13, lsl #1 |
| add x14, x3, x14, lsl #1 |
| ld1 {v0.h}[5], [x12] |
| ld1 {v0.h}[6], [x13] |
| ld1 {v0.h}[7], [x14] |
| lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 |
| srshl v0.8h, v0.8h, v31.8h |
| ext v4.16b, v4.16b, v4.16b, #12 |
| .ifc \lag, lag3 |
| smov w17, v0.h[5] |
| .endif |
| .ifnc \lag, lag1 |
| smov w16, v0.h[6] |
| .endif |
| smov w14, v0.h[7] |
| |
| mov v1.16b, v4.16b |
| mov w15, #1 |
| bl output_\lag\()_neon |
| .else |
| increment_seed 4, shift=0 |
| mov v1.16b, v4.16b |
| mov w15, #4 |
| bl output_\lag\()_neon |
| .endif |
| |
| increment_seed 4, shift=0 |
| mov v1.16b, v5.16b |
| .ifc \edge, right |
| mov w15, #3 // 7 outputs total; splice one raw sample into the last lane
| bl output_\lag\()_neon |
| read_shift_rand x15, 11 |
| add x15, x3, x15, lsl #1 |
| ld1 {v1.h}[0], [x15] |
| srshl v1.4h, v1.4h, v31.4h |
| ext v0.16b, v0.16b, v1.16b, #2 |
| .else |
| mov w15, #4 |
| bl output_\lag\()_neon |
| .endif |
| .else |
| // elems == 1 |
| increment_seed 4, shift=0 |
| mov v1.16b, v4.16b |
| mov w15, #1 |
| bl output_\lag\()_neon |
| lsr w2, w2, #3 |
| |
| read_rand x12, 11, 2 |
| read_rand x13, 11, 1 |
| read_rand x14, 11, 0 |
| add x12, x3, x12, lsl #1 |
| add x13, x3, x13, lsl #1 |
| add x14, x3, x14, lsl #1 |
| ld1 {v1.h}[0], [x12] |
| ld1 {v1.h}[1], [x13] |
| ld1 {v1.h}[2], [x14] |
| srshl v1.4h, v1.4h, v31.4h |
| ext v0.16b, v0.16b, v1.16b, #14 |
| .endif |
| st1 {v0.8h}, [x0], #16 |
| ldr x30, [sp], #16 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| .endif |
| .endm |
| |
| // Emit one sum_{type}_lag1_{edge}_neon function; on the left edge the
| // above-row window (v17) is primed from the row directly above x0.
| .macro sum_lag1_func type, uv_layout, edge, elems=8 |
| function sum_\type\()_lag1_\edge\()_neon |
| AARCH64_SIGN_LINK_REGISTER |
| str x30, [sp, #-16]! |
| .ifc \edge, left |
| sub x12, x0, #1*GRAIN_WIDTH*2 |
| ld1 {v17.8h}, [x12] // load the previous block right above |
| .endif |
| sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems |
| endfunc |
| .endm |
| |
| // Instantiate the lag1 left/mid/right functions for each layout; the |
| // right-edge variants produce a narrower tail (7 elements, or 1 for |
| // the horizontally subsampled chroma layouts). |
| sum_lag1_func y, 0, left |
| sum_lag1_func y, 0, mid |
| sum_lag1_func y, 0, right, 7 |
| sum_lag1_func uv_444, 444, left |
| sum_lag1_func uv_444, 444, mid |
| sum_lag1_func uv_444, 444, right, 7 |
| sum_lag1_func uv_422, 422, left |
| sum_lag1_func uv_422, 422, mid |
| sum_lag1_func uv_422, 422, right, 1 |
| sum_lag1_func uv_420, 420, left |
| sum_lag1_func uv_420, 420, mid |
| sum_lag1_func uv_420, 420, right, 1 |
| |
| |
| // Compute the lag2 above-rows contributions for 8 output positions: |
| // two rows above (windows v16/v17 and v19/v20), 5 taps per row with |
| // 8-bit coefficients taken lane-by-lane from v30 and sign-extended. |
| // Accumulates into v4/v5 (.4s) and slides both windows forward. |
| function sum_lag2_above_neon |
| sub x12, x0, #2*GRAIN_WIDTH*2 - 16 |
| sub x13, x0, #1*GRAIN_WIDTH*2 - 16 |
| ld1 {v18.8h}, [x12] // load top right |
| ld1 {v21.8h}, [x13] |
| |
| dup v26.8b, v30.b[0] |
| ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid |
| dup v27.8b, v30.b[1] |
| ext v23.16b, v16.16b, v17.16b, #14 |
| sxtl v26.8h, v26.8b |
| dup v28.8b, v30.b[3] |
| ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right |
| sxtl v27.8h, v27.8b |
| dup v29.8b, v30.b[4] |
| ext v1.16b, v17.16b, v18.16b, #4 |
| sxtl v28.8h, v28.8b |
| sxtl v29.8h, v29.8b |
| |
| smull v4.4s, v22.4h, v26.4h |
| smlal v4.4s, v23.4h, v27.4h |
| smlal v4.4s, v0.4h, v28.4h |
| smlal v4.4s, v1.4h, v29.4h |
| smull2 v5.4s, v22.8h, v26.8h |
| smlal2 v5.4s, v23.8h, v27.8h |
| smlal2 v5.4s, v0.8h, v28.8h |
| smlal2 v5.4s, v1.8h, v29.8h |
| |
| dup v26.16b, v30.b[5] |
| ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid |
| dup v27.16b, v30.b[6] |
| ext v23.16b, v19.16b, v20.16b, #14 |
| sxtl v26.8h, v26.8b |
| dup v28.16b, v30.b[8] |
| ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right |
| sxtl v27.8h, v27.8b |
| dup v29.16b, v30.b[9] |
| ext v1.16b, v20.16b, v21.16b, #4 |
| sxtl v28.8h, v28.8b |
| sxtl v29.8h, v29.8b |
| |
| smlal v4.4s, v22.4h, v26.4h |
| smlal v4.4s, v23.4h, v27.4h |
| smlal v4.4s, v0.4h, v28.4h |
| smlal v4.4s, v1.4h, v29.4h |
| smlal2 v5.4s, v22.8h, v26.8h |
| smlal2 v5.4s, v23.8h, v27.8h |
| smlal2 v5.4s, v0.8h, v28.8h |
| smlal2 v5.4s, v1.8h, v29.8h |
| |
| // Center taps of each row (coefficients 2 and 7). |
| dup v26.16b, v30.b[2] |
| dup v27.16b, v30.b[7] |
| sxtl v26.8h, v26.8b |
| sxtl v27.8h, v27.8b |
| |
| smlal v4.4s, v17.4h, v26.4h |
| smlal v4.4s, v20.4h, v27.4h |
| smlal2 v5.4s, v17.8h, v26.8h |
| smlal2 v5.4s, v20.8h, v27.8h |
| mov v16.16b, v17.16b |
| mov v17.16b, v18.16b |
| |
| mov v19.16b, v20.16b |
| mov v20.16b, v21.16b |
| ret |
| endfunc |
| |
| // Emit one sum_{type}_lag2_{edge}_neon function; on the left edge the
| // two above-row windows (v17, v20) are primed from the rows above x0.
| // v30.b[12] is the chroma-from-luma coefficient (ar_coeffs_uv[12]).
| .macro sum_lag2_func type, uv_layout, edge, elems=8 |
| function sum_\type\()_lag2_\edge\()_neon |
| AARCH64_SIGN_LINK_REGISTER |
| str x30, [sp, #-16]! |
| .ifc \edge, left |
| sub x12, x0, #2*GRAIN_WIDTH*2 |
| sub x13, x0, #1*GRAIN_WIDTH*2 |
| ld1 {v17.8h}, [x12] // load the previous block right above |
| ld1 {v20.8h}, [x13] |
| .endif |
| sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12] |
| endfunc |
| .endm |
| |
| // Instantiate the lag2 left/mid/right functions for each layout. |
| sum_lag2_func y, 0, left |
| sum_lag2_func y, 0, mid |
| sum_lag2_func y, 0, right, 7 |
| sum_lag2_func uv_444, 444, left |
| sum_lag2_func uv_444, 444, mid |
| sum_lag2_func uv_444, 444, right, 7 |
| sum_lag2_func uv_422, 422, left |
| sum_lag2_func uv_422, 422, mid |
| sum_lag2_func uv_422, 422, right, 1 |
| sum_lag2_func uv_420, 420, left |
| sum_lag2_func uv_420, 420, mid |
| sum_lag2_func uv_420, 420, right, 1 |
| |
| |
| // Compute the lag3 above-rows contributions for 8 output positions: |
| // three rows above (windows v13/v14, v16/v17, v19/v20), 7 taps per |
| // row with 8-bit coefficients taken lane-by-lane from v29/v30 and |
| // sign-extended. Accumulates into v4/v5 (.4s) and slides all three |
| // windows forward. Uses v8-v15, which the caller has saved. |
| function sum_lag3_above_neon |
| sub x11, x0, #3*GRAIN_WIDTH*2 - 16 |
| sub x12, x0, #2*GRAIN_WIDTH*2 - 16 |
| sub x13, x0, #1*GRAIN_WIDTH*2 - 16 |
| ld1 {v15.8h}, [x11] // load top right |
| ld1 {v18.8h}, [x12] |
| ld1 {v21.8h}, [x13] |
| |
| dup v22.8b, v29.b[0] |
| ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid |
| dup v23.8b, v29.b[1] |
| ext v9.16b, v13.16b, v14.16b, #12 |
| sxtl v22.8h, v22.8b |
| dup v24.8b, v29.b[2] |
| sxtl v23.8h, v23.8b |
| dup v25.8b, v29.b[3] |
| ext v10.16b, v13.16b, v14.16b, #14 |
| sxtl v24.8h, v24.8b |
| dup v26.8b, v29.b[4] |
| ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right |
| sxtl v25.8h, v25.8b |
| dup v27.8b, v29.b[5] |
| ext v12.16b, v14.16b, v15.16b, #4 |
| sxtl v26.8h, v26.8b |
| dup v28.8b, v29.b[6] |
| ext v13.16b, v14.16b, v15.16b, #6 |
| sxtl v27.8h, v27.8b |
| sxtl v28.8h, v28.8b |
| |
| smull v4.4s, v8.4h, v22.4h |
| smlal v4.4s, v9.4h, v23.4h |
| smlal v4.4s, v10.4h, v24.4h |
| smlal v4.4s, v11.4h, v26.4h |
| smlal v4.4s, v12.4h, v27.4h |
| smlal v4.4s, v13.4h, v28.4h |
| smlal v4.4s, v14.4h, v25.4h // center tap (coefficient 3) |
| smull2 v5.4s, v8.8h, v22.8h |
| smlal2 v5.4s, v9.8h, v23.8h |
| smlal2 v5.4s, v10.8h, v24.8h |
| smlal2 v5.4s, v11.8h, v26.8h |
| smlal2 v5.4s, v12.8h, v27.8h |
| smlal2 v5.4s, v13.8h, v28.8h |
| smlal2 v5.4s, v14.8h, v25.8h |
| |
| dup v22.8b, v29.b[7] |
| ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid |
| dup v23.8b, v29.b[8] |
| ext v9.16b, v16.16b, v17.16b, #12 |
| sxtl v22.8h, v22.8b |
| dup v24.8b, v29.b[9] |
| sxtl v23.8h, v23.8b |
| dup v25.8b, v29.b[10] |
| ext v10.16b, v16.16b, v17.16b, #14 |
| sxtl v24.8h, v24.8b |
| dup v26.8b, v29.b[11] |
| ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right |
| sxtl v25.8h, v25.8b |
| dup v27.8b, v29.b[12] |
| ext v12.16b, v17.16b, v18.16b, #4 |
| sxtl v26.8h, v26.8b |
| dup v28.8b, v29.b[13] |
| ext v13.16b, v17.16b, v18.16b, #6 |
| sxtl v27.8h, v27.8b |
| sxtl v28.8h, v28.8b |
| |
| smlal v4.4s, v8.4h, v22.4h |
| smlal v4.4s, v9.4h, v23.4h |
| smlal v4.4s, v10.4h, v24.4h |
| smlal v4.4s, v11.4h, v26.4h |
| smlal v4.4s, v12.4h, v27.4h |
| smlal v4.4s, v13.4h, v28.4h |
| smlal v4.4s, v17.4h, v25.4h // center tap (coefficient 10) |
| smlal2 v5.4s, v8.8h, v22.8h |
| smlal2 v5.4s, v9.8h, v23.8h |
| smlal2 v5.4s, v10.8h, v24.8h |
| smlal2 v5.4s, v11.8h, v26.8h |
| smlal2 v5.4s, v12.8h, v27.8h |
| smlal2 v5.4s, v13.8h, v28.8h |
| smlal2 v5.4s, v17.8h, v25.8h |
| |
| dup v22.8b, v29.b[14] |
| ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid |
| dup v23.8b, v29.b[15] |
| ext v9.16b, v19.16b, v20.16b, #12 |
| sxtl v22.8h, v22.8b |
| dup v24.8b, v30.b[0] |
| sxtl v23.8h, v23.8b |
| dup v25.8b, v30.b[1] |
| ext v10.16b, v19.16b, v20.16b, #14 |
| sxtl v24.8h, v24.8b |
| dup v26.8b, v30.b[2] |
| ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right |
| sxtl v25.8h, v25.8b |
| dup v27.8b, v30.b[3] |
| ext v12.16b, v20.16b, v21.16b, #4 |
| sxtl v26.8h, v26.8b |
| dup v28.8b, v30.b[4] |
| ext v13.16b, v20.16b, v21.16b, #6 |
| sxtl v27.8h, v27.8b |
| sxtl v28.8h, v28.8b |
| |
| smlal v4.4s, v8.4h, v22.4h |
| smlal v4.4s, v9.4h, v23.4h |
| smlal v4.4s, v10.4h, v24.4h |
| smlal v4.4s, v11.4h, v26.4h |
| smlal v4.4s, v12.4h, v27.4h |
| smlal v4.4s, v13.4h, v28.4h |
| smlal v4.4s, v20.4h, v25.4h // center tap (coefficient 17) |
| mov v16.16b, v17.16b |
| mov v17.16b, v18.16b |
| smlal2 v5.4s, v8.8h, v22.8h |
| smlal2 v5.4s, v9.8h, v23.8h |
| smlal2 v5.4s, v10.8h, v24.8h |
| smlal2 v5.4s, v11.8h, v26.8h |
| smlal2 v5.4s, v12.8h, v27.8h |
| smlal2 v5.4s, v13.8h, v28.8h |
| smlal2 v5.4s, v20.8h, v25.8h |
| |
| mov v13.16b, v14.16b |
| mov v14.16b, v15.16b |
| |
| mov v19.16b, v20.16b |
| mov v20.16b, v21.16b |
| ret |
| endfunc |
| |
| // Emit one sum_{type}_lag3_{edge}_neon function; on the left edge the
| // three above-row windows (v14, v17, v20) are primed from the rows
| // above x0. v30.b[8] is the chroma-from-luma coefficient
| // (ar_coeffs_uv[24]).
| .macro sum_lag3_func type, uv_layout, edge, elems=8 |
| function sum_\type\()_lag3_\edge\()_neon |
| AARCH64_SIGN_LINK_REGISTER |
| str x30, [sp, #-16]! |
| .ifc \edge, left |
| sub x11, x0, #3*GRAIN_WIDTH*2 |
| sub x12, x0, #2*GRAIN_WIDTH*2 |
| sub x13, x0, #1*GRAIN_WIDTH*2 |
| ld1 {v14.8h}, [x11] // load the previous block right above |
| ld1 {v17.8h}, [x12] |
| ld1 {v20.8h}, [x13] |
| .endif |
| sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8] |
| endfunc |
| .endm |
| |
| // Instantiate the lag3 left/mid/right functions for each layout. |
| sum_lag3_func y, 0, left |
| sum_lag3_func y, 0, mid |
| sum_lag3_func y, 0, right, 7 |
| sum_lag3_func uv_444, 444, left |
| sum_lag3_func uv_444, 444, mid |
| sum_lag3_func uv_444, 444, right, 7 |
| sum_lag3_func uv_422, 422, left |
| sum_lag3_func uv_422, 422, mid |
| sum_lag3_func uv_422, 422, right, 1 |
| sum_lag3_func uv_420, 420, left |
| sum_lag3_func uv_420, 420, mid |
| sum_lag3_func uv_420, 420, right, 1 |
| |
| // Generate w1 rows of GRAIN_WIDTH (82) raw grain values at x0: |
| // 80 values per row via get_gaussian_neon (scaled by the negated |
| // shift in v31), plus a 2-value tail stored as a single word. |
| function generate_grain_rows_neon |
| AARCH64_SIGN_LINK_REGISTER |
| str x30, [sp, #-16]! |
| 1: |
| mov w16, #80 |
| 2: |
| bl get_gaussian_neon |
| srshl v0.8h, v0.8h, v31.8h |
| subs w16, w16, #8 |
| st1 {v0.8h}, [x0], #16 |
| b.gt 2b |
| get_grain_2 v0 |
| subs w1, w1, #1 |
| st1 {v0.s}[0], [x0], #4 |
| b.gt 1b |
| ldr x30, [sp], #16 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| endfunc |
| |
| // Generate w1 rows of SUB_GRAIN_WIDTH (44) raw grain values at x0: |
| // 40 values via get_gaussian_neon plus a 4-value tail, then advance |
| // x0 by the full GRAIN_WIDTH*2 row stride (80 bytes already written). |
| function generate_grain_rows_44_neon |
| AARCH64_SIGN_LINK_REGISTER |
| str x30, [sp, #-16]! |
| 1: |
| mov w16, #40 |
| 2: |
| bl get_gaussian_neon |
| srshl v0.8h, v0.8h, v31.8h |
| subs w16, w16, #8 |
| st1 {v0.8h}, [x0], #16 |
| b.gt 2b |
| get_grain_4 v0 |
| subs w1, w1, #1 |
| st1 {v0.4h}, [x0] |
| add x0, x0, #GRAIN_WIDTH*2-80 |
| b.gt 1b |
| ldr x30, [sp], #16 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| endfunc |
| |
| // One 8-sample step of lag0 chroma generation: scale the collocated |
| // luma grain (v4, masked by v1 for the row-edge columns) by the |
| // chroma-from-luma coefficient v27, rounding-shift by ar_coeff_shift |
| // (held negated in v28.4s), add fresh gaussian noise (v0) and clamp |
| // to [grain_min (v26), grain_max (v25)]. The _start entry expects the |
| // luma already loaded in v4; the _add entry also expects noise in v0. |
| function gen_grain_uv_444_lag0_neon |
| AARCH64_SIGN_LINK_REGISTER |
| str x30, [sp, #-16]! |
| ld1 {v4.8h}, [x19], #16 |
| gen_grain_uv_lag0_8_start: |
| bl get_gaussian_neon |
| srshl v0.8h, v0.8h, v31.8h |
| gen_grain_uv_lag0_8_add: |
| and v4.16b, v4.16b, v1.16b |
| smull v2.4s, v4.4h, v27.4h |
| smull2 v3.4s, v4.8h, v27.8h |
| srshl v2.4s, v2.4s, v28.4s |
| srshl v3.4s, v3.4s, v28.4s |
| sqxtn v2.4h, v2.4s |
| sqxtn2 v2.8h, v3.4s |
| sqadd v2.8h, v2.8h, v0.8h |
| smin v2.8h, v2.8h, v25.8h |
| smax v2.8h, v2.8h, v26.8h |
| st1 {v2.8h}, [x0], #16 |
| ldr x30, [sp], #16 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| endfunc |
| |
| // lag0 chroma, 4:2:0: average a 2x2 block of collocated luma (two |
| // rows of 16 samples) down to 8 with rounding, then continue in the |
| // common lag0 path above.
| function gen_grain_uv_420_lag0_8_neon |
| AARCH64_SIGN_LINK_REGISTER |
| add x12, x19, #GRAIN_WIDTH*2 |
| str x30, [sp, #-16]! |
| ld1 {v16.8h, v17.8h}, [x19], #32 |
| ld1 {v18.8h, v19.8h}, [x12] |
| addp v16.8h, v16.8h, v17.8h |
| addp v17.8h, v18.8h, v19.8h |
| add v16.8h, v16.8h, v17.8h |
| srshr v4.8h, v16.8h, #2 |
| b gen_grain_uv_lag0_8_start |
| endfunc |
| |
| // lag0 chroma, 4:2:2: average horizontal pairs of collocated luma |
| // (one row of 16 samples) down to 8 with rounding, then continue in |
| // the common lag0 path above.
| function gen_grain_uv_422_lag0_8_neon |
| AARCH64_SIGN_LINK_REGISTER |
| str x30, [sp, #-16]! |
| ld1 {v16.8h, v17.8h}, [x19], #32 |
| addp v16.8h, v16.8h, v17.8h |
| srshr v4.8h, v16.8h, #1 |
| b gen_grain_uv_lag0_8_start |
| endfunc |
| |
| // lag0 chroma, 4:2:0, 4-sample row tail: downsample a 2x2 luma block |
| // to 4 samples, fetch 4 noise values, and join the common add path |
| // (which still stores 8 halfwords; the caller's mask in v1 and row |
| // stride account for the extra lanes).
| function gen_grain_uv_420_lag0_4_neon |
| add x12, x19, #GRAIN_WIDTH*2 |
| AARCH64_SIGN_LINK_REGISTER |
| str x30, [sp, #-16]! |
| ld1 {v16.4h, v17.4h}, [x19] |
| ld1 {v18.4h, v19.4h}, [x12] |
| add x19, x19, #32 |
| addp v16.4h, v16.4h, v17.4h |
| addp v17.4h, v18.4h, v19.4h |
| add v16.4h, v16.4h, v17.4h |
| srshr v4.4h, v16.4h, #2 |
| get_grain_4 v0 |
| b gen_grain_uv_lag0_8_add |
| endfunc |
| |
| // lag0 chroma, 4:2:2, 4-sample row tail: downsample one luma row |
| // horizontally to 4 samples, fetch 4 noise values, and join the |
| // common add path.
| function gen_grain_uv_422_lag0_4_neon |
| AARCH64_SIGN_LINK_REGISTER |
| str x30, [sp, #-16]! |
| ld1 {v16.4h, v17.4h}, [x19] |
| add x19, x19, #32 |
| addp v16.4h, v16.4h, v17.4h |
| srshr v4.4h, v16.4h, #1 |
| get_grain_4 v0 |
| b gen_grain_uv_lag0_8_add |
| endfunc |
| |
| // Emit generate_grain_{y,uv_444}_16bpc_neon: fills an 82-wide grain |
| // buffer at x0 from the film grain parameters at x1 (x2/x3/x4 shuffled |
| // per type below). Sets up the shared register conventions used by all |
| // helpers — w2 seed, x3 gaussian table, w5/w6 grain max/min, w7-w10 |
| // shift amounts and rounding biases, v31 negated scale shift — then |
| // dispatches on ar_coeff_lag via the relative-offset jump table. |
| // For uv_444, x19 tracks the collocated luma grain and the seed is |
| // xored with a per-plane constant selected by the uv index (w13). |
| .macro gen_grain_82 type |
| function generate_grain_\type\()_16bpc_neon, export=1 |
| AARCH64_SIGN_LINK_REGISTER |
| stp x30, x19, [sp, #-96]! |
| |
| .ifc \type, uv_444 |
| mov w13, w3 |
| mov w14, #28 |
| add x19, x1, #3*GRAIN_WIDTH*2 |
| mov x1, x2 |
| mul w13, w13, w14 |
| clz w15, w4 |
| .else |
| clz w15, w2 |
| .endif |
| movrel x3, X(gaussian_sequence) |
| sub w15, w15, #24 // -bitdepth_min_8 |
| ldr w2, [x1, #FGD_SEED] |
| ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] |
| .ifc \type, y |
| add x4, x1, #FGD_AR_COEFFS_Y |
| .else |
| add x4, x1, #FGD_AR_COEFFS_UV |
| .endif |
| add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 |
| adr x16, L(gen_grain_\type\()_tbl) |
| ldr w17, [x1, #FGD_AR_COEFF_LAG] |
| add w9, w9, #4 |
| ldrh w17, [x16, w17, uxtw #1] |
| dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift |
| sub x16, x16, w17, uxtw |
| neg v31.8h, v31.8h |
| |
| .ifc \type, uv_444 |
| cmp w13, #0 |
| mov w11, #0x49d8 |
| mov w14, #0xb524 |
| add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] |
| csel w11, w11, w14, ne |
| .endif |
| |
| ldr w7, [x1, #FGD_AR_COEFF_SHIFT] |
| neg w15, w15 // bitdepth_min_8 |
| mov w8, #1 |
| mov w10, #1 |
| lsl w8, w8, w7 // 1 << ar_coeff_shift |
| lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) |
| lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) |
| lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) |
| mov w5, #128 |
| lsl w5, w5, w15 // 128 << bitdepth_min_8 |
| neg w6, w5 // -(128 << bitdepth_min_8) |
| sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 |
| |
| .ifc \type, uv_444 |
| eor w2, w2, w11 |
| .endif |
| |
| br x16 |
| |
| L(generate_grain_\type\()_lag0): |
| AARCH64_VALID_JUMP_TARGET |
| .ifc \type, y |
| mov w1, #GRAIN_HEIGHT |
| bl generate_grain_rows_neon |
| .else |
| // uv_444 lag0: v29/v30 are edge masks (first/last columns of a row), |
| // v25/v26 the clamp bounds, v27 the luma coefficient, v28 the negated |
| // ar_coeff_shift. |
| dup v28.4s, w7 |
| ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] |
| movi v0.16b, #0 |
| movi v1.16b, #255 |
| dup v25.8h, w5 |
| dup v26.8h, w6 |
| ext v29.16b, v0.16b, v1.16b, #10 |
| ext v30.16b, v1.16b, v0.16b, #2 |
| neg v28.4s, v28.4s |
| sxtl v27.8h, v27.8b |
| |
| mov w1, #3 |
| bl generate_grain_rows_neon |
| mov w1, #GRAIN_HEIGHT-3 |
| 1: |
| mov v1.16b, v29.16b |
| bl gen_grain_uv_444_lag0_neon // 8 |
| movi v1.16b, #255 |
| bl gen_grain_uv_444_lag0_neon // 16 |
| bl gen_grain_uv_444_lag0_neon // 24 |
| bl gen_grain_uv_444_lag0_neon // 32 |
| bl gen_grain_uv_444_lag0_neon // 40 |
| bl gen_grain_uv_444_lag0_neon // 48 |
| bl gen_grain_uv_444_lag0_neon // 56 |
| bl gen_grain_uv_444_lag0_neon // 64 |
| bl gen_grain_uv_444_lag0_neon // 72 |
| mov v1.16b, v30.16b |
| bl gen_grain_uv_444_lag0_neon // 80 |
| get_grain_2 v16 |
| subs w1, w1, #1 |
| add x19, x19, #4 |
| st1 {v16.s}[0], [x0], #4 |
| b.gt 1b |
| .endif |
| ldp x30, x19, [sp], #96 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(generate_grain_\type\()_lag1): |
| AARCH64_VALID_JUMP_TARGET |
| ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0] |
| ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1] |
| ld1r {v29.8b}, [x4] // ar_coeffs_y[2] |
| .ifc \type, y |
| ldrsb w4, [x4, #1] // ar_coeffs_y[3] |
| .else |
| add x4, x4, #2 |
| .endif |
| |
| mov w1, #3 |
| .ifc \type, uv_444 |
| ld1r {v30.8b}, [x4] // ar_coeffs_uv[4] |
| ldursb w4, [x4, #-1] // ar_coeffs_uv[3] |
| .endif |
| bl generate_grain_rows_neon |
| sxtl v27.8h, v27.8b |
| sxtl v28.8h, v28.8b |
| sxtl v29.8h, v29.8b |
| .ifc \type, uv_444 |
| sxtl v30.8h, v30.8b |
| .endif |
| |
| mov w1, #GRAIN_HEIGHT - 3 |
| 1: |
| bl sum_\type\()_lag1_left_neon // 8 |
| bl sum_\type\()_lag1_mid_neon // 16 |
| bl sum_\type\()_lag1_mid_neon // 24 |
| bl sum_\type\()_lag1_mid_neon // 32 |
| bl sum_\type\()_lag1_mid_neon // 40 |
| bl sum_\type\()_lag1_mid_neon // 48 |
| bl sum_\type\()_lag1_mid_neon // 56 |
| bl sum_\type\()_lag1_mid_neon // 64 |
| bl sum_\type\()_lag1_mid_neon // 72 |
| bl sum_\type\()_lag1_right_neon // 80 |
| get_grain_2 v16 |
| subs w1, w1, #1 |
| .ifc \type, uv_444 |
| add x19, x19, #4 |
| .endif |
| st1 {v16.s}[0], [x0], #4 |
| b.gt 1b |
| |
| ldp x30, x19, [sp], #96 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(generate_grain_\type\()_lag2): |
| AARCH64_VALID_JUMP_TARGET |
| ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] |
| |
| smov w4, v30.b[10] |
| smov w17, v30.b[11] |
| |
| mov w1, #3 |
| bl generate_grain_rows_neon |
| |
| mov w1, #GRAIN_HEIGHT - 3 |
| 1: |
| bl sum_\type\()_lag2_left_neon // 8 |
| bl sum_\type\()_lag2_mid_neon // 16 |
| bl sum_\type\()_lag2_mid_neon // 24 |
| bl sum_\type\()_lag2_mid_neon // 32 |
| bl sum_\type\()_lag2_mid_neon // 40 |
| bl sum_\type\()_lag2_mid_neon // 48 |
| bl sum_\type\()_lag2_mid_neon // 56 |
| bl sum_\type\()_lag2_mid_neon // 64 |
| bl sum_\type\()_lag2_mid_neon // 72 |
| bl sum_\type\()_lag2_right_neon // 80 |
| get_grain_2 v16 |
| subs w1, w1, #1 |
| .ifc \type, uv_444 |
| add x19, x19, #4 |
| .endif |
| st1 {v16.s}[0], [x0], #4 |
| b.gt 1b |
| |
| ldp x30, x19, [sp], #96 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(generate_grain_\type\()_lag3): |
| AARCH64_VALID_JUMP_TARGET |
| ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] |
| // lag3 needs v8-v15 (callee-saved low halves) and x20/x21 for the |
| // extra feedback coefficients; save them in the pre-reserved frame. |
| stp d8, d9, [sp, #16] |
| stp d10, d11, [sp, #32] |
| stp d12, d13, [sp, #48] |
| stp d14, d15, [sp, #64] |
| stp x20, x21, [sp, #80] |
| |
| smov w4, v30.b[5] |
| smov w20, v30.b[6] |
| smov w21, v30.b[7] |
| |
| mov w1, #3 |
| bl generate_grain_rows_neon |
| |
| mov w1, #GRAIN_HEIGHT - 3 |
| 1: |
| bl sum_\type\()_lag3_left_neon // 8 |
| bl sum_\type\()_lag3_mid_neon // 16 |
| bl sum_\type\()_lag3_mid_neon // 24 |
| bl sum_\type\()_lag3_mid_neon // 32 |
| bl sum_\type\()_lag3_mid_neon // 40 |
| bl sum_\type\()_lag3_mid_neon // 48 |
| bl sum_\type\()_lag3_mid_neon // 56 |
| bl sum_\type\()_lag3_mid_neon // 64 |
| bl sum_\type\()_lag3_mid_neon // 72 |
| bl sum_\type\()_lag3_right_neon // 80 |
| get_grain_2 v16 |
| subs w1, w1, #1 |
| .ifc \type, uv_444 |
| add x19, x19, #4 |
| .endif |
| st1 {v16.s}[0], [x0], #4 |
| b.gt 1b |
| |
| ldp x20, x21, [sp, #80] |
| ldp d14, d15, [sp, #64] |
| ldp d12, d13, [sp, #48] |
| ldp d10, d11, [sp, #32] |
| ldp d8, d9, [sp, #16] |
| ldp x30, x19, [sp], #96 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(gen_grain_\type\()_tbl): |
| .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) |
| .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) |
| .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) |
| .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) |
| endfunc |
| .endm |
| |
| // Instantiate the full-width (82-entry) generators for luma and 4:4:4 chroma. |
| gen_grain_82 y |
| gen_grain_82 uv_444 |
| |
| // Set \dst to the number of AR-filtered rows for \type: 4:2:0 is |
| // vertically subsampled (SUB_GRAIN_HEIGHT), other layouts are not.
| .macro set_height dst, type |
| .ifc \type, uv_420 |
| mov \dst, #SUB_GRAIN_HEIGHT-3 |
| .else |
| mov \dst, #GRAIN_HEIGHT-3 |
| .endif |
| .endm |
| |
| // Advance the luma grain pointer past the 6*32 bytes consumed per row |
| // to the next source row: two luma rows for vertically subsampled |
| // 4:2:0, one row otherwise.
| .macro increment_y_ptr reg, type |
| .ifc \type, uv_420 |
| add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) |
| .else |
| sub \reg, \reg, #6*32-GRAIN_WIDTH*2 |
| .endif |
| .endm |
| |
| // Emit generate_grain_{uv_420,uv_422}_16bpc_neon: fills a 44-wide |
| // (horizontally subsampled) chroma grain buffer at x0. Same register |
| // conventions and jump-table dispatch as gen_grain_82; x19 tracks the |
| // collocated luma grain, and the seed is xored with a per-plane |
| // constant selected by the uv index (w13). |
| .macro gen_grain_44 type |
| function generate_grain_\type\()_16bpc_neon, export=1 |
| AARCH64_SIGN_LINK_REGISTER |
| stp x30, x19, [sp, #-96]! |
| |
| mov w13, w3 |
| mov w14, #28 |
| add x19, x1, #(3*GRAIN_WIDTH-3)*2 |
| mov x1, x2 |
| mul w13, w13, w14 |
| clz w15, w4 |
| |
| movrel x3, X(gaussian_sequence) |
| sub w15, w15, #24 // -bitdepth_min_8 |
| ldr w2, [x1, #FGD_SEED] |
| ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] |
| add x4, x1, #FGD_AR_COEFFS_UV |
| add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 |
| adr x16, L(gen_grain_\type\()_tbl) |
| ldr w17, [x1, #FGD_AR_COEFF_LAG] |
| add w9, w9, #4 |
| ldrh w17, [x16, w17, uxtw #1] |
| dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift |
| sub x16, x16, w17, uxtw |
| neg v31.8h, v31.8h |
| |
| cmp w13, #0 |
| mov w11, #0x49d8 |
| mov w14, #0xb524 |
| add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] |
| csel w11, w11, w14, ne |
| |
| ldr w7, [x1, #FGD_AR_COEFF_SHIFT] |
| neg w15, w15 // bitdepth_min_8 |
| mov w8, #1 |
| mov w10, #1 |
| lsl w8, w8, w7 // 1 << ar_coeff_shift |
| lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) |
| lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) |
| lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) |
| mov w5, #128 |
| lsl w5, w5, w15 // 128 << bitdepth_min_8 |
| neg w6, w5 // -(128 << bitdepth_min_8) |
| sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 |
| |
| eor w2, w2, w11 |
| |
| br x16 |
| |
| L(generate_grain_\type\()_lag0): |
| AARCH64_VALID_JUMP_TARGET |
| // v29/v30 are edge masks for the first/last columns of a row. |
| dup v28.4s, w7 |
| ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] |
| movi v0.16b, #0 |
| movi v1.16b, #255 |
| dup v25.8h, w5 |
| dup v26.8h, w6 |
| ext v29.16b, v0.16b, v1.16b, #10 |
| ext v30.16b, v1.16b, v0.16b, #14 |
| neg v28.4s, v28.4s |
| sxtl v27.8h, v27.8b |
| |
| mov w1, #3 |
| bl generate_grain_rows_44_neon |
| set_height w1, \type |
| 1: |
| mov v1.16b, v29.16b |
| bl gen_grain_\type\()_lag0_8_neon // 8 |
| movi v1.16b, #255 |
| bl gen_grain_\type\()_lag0_8_neon // 16 |
| bl gen_grain_\type\()_lag0_8_neon // 24 |
| bl gen_grain_\type\()_lag0_8_neon // 32 |
| bl gen_grain_\type\()_lag0_8_neon // 40 |
| mov v1.16b, v30.16b |
| bl gen_grain_\type\()_lag0_4_neon // 44 |
| subs w1, w1, #1 |
| increment_y_ptr x19, \type |
| add x0, x0, #GRAIN_WIDTH*2-6*16 |
| b.gt 1b |
| |
| ldp x30, x19, [sp], #96 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(generate_grain_\type\()_lag1): |
| AARCH64_VALID_JUMP_TARGET |
| ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0] |
| ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1] |
| ld1r {v29.8b}, [x4] // ar_coeffs_uv[2] |
| add x4, x4, #2 |
| |
| mov w1, #3 |
| ld1r {v30.8b}, [x4] // ar_coeffs_uv[4] |
| ldursb w4, [x4, #-1] // ar_coeffs_uv[3] |
| bl generate_grain_rows_44_neon |
| |
| sxtl v27.8h, v27.8b |
| sxtl v28.8h, v28.8b |
| sxtl v29.8h, v29.8b |
| sxtl v30.8h, v30.8b |
| set_height w1, \type |
| 1: |
| bl sum_\type\()_lag1_left_neon // 8 |
| bl sum_\type\()_lag1_mid_neon // 16 |
| bl sum_\type\()_lag1_mid_neon // 24 |
| bl sum_\type\()_lag1_mid_neon // 32 |
| bl sum_\type\()_lag1_mid_neon // 40 |
| bl sum_\type\()_lag1_right_neon // 44 |
| subs w1, w1, #1 |
| increment_y_ptr x19, \type |
| add x0, x0, #GRAIN_WIDTH*2-6*16 |
| b.gt 1b |
| |
| ldp x30, x19, [sp], #96 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(generate_grain_\type\()_lag2): |
| AARCH64_VALID_JUMP_TARGET |
| ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] |
| |
| smov w4, v30.b[10] |
| smov w17, v30.b[11] |
| |
| mov w1, #3 |
| bl generate_grain_rows_44_neon |
| |
| set_height w1, \type |
| 1: |
| bl sum_\type\()_lag2_left_neon // 8 |
| bl sum_\type\()_lag2_mid_neon // 16 |
| bl sum_\type\()_lag2_mid_neon // 24 |
| bl sum_\type\()_lag2_mid_neon // 32 |
| bl sum_\type\()_lag2_mid_neon // 40 |
| bl sum_\type\()_lag2_right_neon // 44 |
| subs w1, w1, #1 |
| increment_y_ptr x19, \type |
| add x0, x0, #GRAIN_WIDTH*2-6*16 |
| b.gt 1b |
| |
| ldp x30, x19, [sp], #96 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(generate_grain_\type\()_lag3): |
| AARCH64_VALID_JUMP_TARGET |
| ldr q29, [x4] // ar_coeffs_uv[0-15] |
| ldr q30, [x4, #16] // ar_coeffs_uv[16-24] |
| // lag3 needs v8-v15 (callee-saved low halves) and x20/x21 for the |
| // extra feedback coefficients; save them in the pre-reserved frame. |
| stp d8, d9, [sp, #16] |
| stp d10, d11, [sp, #32] |
| stp d12, d13, [sp, #48] |
| stp d14, d15, [sp, #64] |
| stp x20, x21, [sp, #80] |
| |
| smov w4, v30.b[5] |
| smov w20, v30.b[6] |
| smov w21, v30.b[7] |
| |
| mov w1, #3 |
| bl generate_grain_rows_44_neon |
| |
| set_height w1, \type |
| 1: |
| bl sum_\type\()_lag3_left_neon // 8 |
| bl sum_\type\()_lag3_mid_neon // 16 |
| bl sum_\type\()_lag3_mid_neon // 24 |
| bl sum_\type\()_lag3_mid_neon // 32 |
| bl sum_\type\()_lag3_mid_neon // 40 |
| bl sum_\type\()_lag3_right_neon // 44 |
| subs w1, w1, #1 |
| increment_y_ptr x19, \type |
| add x0, x0, #GRAIN_WIDTH*2-6*16 |
| b.gt 1b |
| |
| ldp x20, x21, [sp, #80] |
| ldp d14, d15, [sp, #64] |
| ldp d12, d13, [sp, #48] |
| ldp d10, d11, [sp, #32] |
| ldp d8, d9, [sp, #16] |
| ldp x30, x19, [sp], #96 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(gen_grain_\type\()_tbl): |
| .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) |
| .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) |
| .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) |
| .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) |
| endfunc |
| .endm |
| |
// Instantiate the 44-wide chroma grain generators for the two
// vertically-differing layouts handled by this macro.
gen_grain_44 uv_420
gen_grain_44 uv_422
| |
// Gather 8 single bytes from the scaling LUT (base pointer in x3), using
// 16-bit pixel values as indices: even lanes come from \src1, odd lanes
// from \src2, and land in byte lanes [\off+0 .. \off+7] of \dst1/\dst2
// (alternating between the two destinations per lane).
// Address computation is interleaved with the single-lane loads so that
// index extraction overlaps load latency.
// Clobbers: w14-w17.
.macro gather_interleaved dst1, dst2, src1, src2, off
        umov            w14, \src1[0]
        umov            w15, \src2[1]
        umov            w16, \src1[2]
        add             x14, x14, x3            // &scaling[src1[0]]
        umov            w17, \src2[3]
        add             x15, x15, x3
        ld1             {\dst1}[0+\off], [x14]
        umov            w14, \src1[4]
        add             x16, x16, x3
        ld1             {\dst2}[1+\off], [x15]
        umov            w15, \src2[5]
        add             x17, x17, x3
        ld1             {\dst1}[2+\off], [x16]
        umov            w16, \src1[6]
        add             x14, x14, x3
        ld1             {\dst2}[3+\off], [x17]
        umov            w17, \src2[7]
        add             x15, x15, x3
        ld1             {\dst1}[4+\off], [x14]
        add             x16, x16, x3
        ld1             {\dst2}[5+\off], [x15]
        add             x17, x17, x3
        ld1             {\dst1}[6+\off], [x16]
        ld1             {\dst2}[7+\off], [x17]
.endm
| |
// Gather 32 scaling bytes: look up the 32 pixel values held in
// \src1-\src4 (8 halfwords each) and fill all 16 byte lanes of both
// \dst1 and \dst2. Each gather_interleaved call fills the even lanes of
// one destination and the odd lanes of the other; the swapped second
// call completes the remaining lanes.
// Clobbers: w14-w17 (via gather_interleaved).
.macro gather dst1, dst2, src1, src2, src3, src4
        gather_interleaved \dst1, \dst2, \src1, \src3, 0
        gather_interleaved \dst2, \dst1, \src3, \src1, 0
        gather_interleaved \dst1, \dst2, \src2, \src4, 8
        gather_interleaved \dst2, \dst1, \src4, \src2, 8
.endm
| |
// Look up scaling[] bytes for the 32 pixel values in v0.8h-v3.8h.
// In:  x3 = scaling LUT base; v0-v3 = pixel values (halfwords).
// Out: v6.16b, v7.16b = 32 gathered scaling bytes.
// Clobbers: w14-w17.
function gather32_neon
        gather          v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
        ret
endfunc
| |
// Look up scaling[] bytes for the 16 pixel values in v0.8h-v1.8h.
// In:  x3 = scaling LUT base; v0, v1 = pixel values (halfwords).
// Out: v6.16b = 16 gathered scaling bytes (low half for v0, high for v1).
// Clobbers: w14-w17, v7.
function gather16_neon
        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
        ins             v6.d[1], v7.d[0]        // pack both halves into v6
        ret
endfunc
| |
// Block-overlap blend coefficients for non-subsampled dimensions (sx/sy = 0).
// Loaded as a v27.4h/v28.4h pair: first row weights the old (left/top)
// grain, second row the new grain, per overlap line.
const overlap_coeffs_0, align=4
        .short 27, 17, 0, 0
        .short 17, 27, 32, 32
endconst
| |
// Block-overlap blend coefficients for subsampled dimensions (sx/sy = 1),
// where only one line/column overlaps. Same v27/v28 layout as
// overlap_coeffs_0.
const overlap_coeffs_1, align=4
        .short 23, 0, 0, 0
        .short 22, 32, 32, 32
endconst
| |
// Split a random value into grain x/y offsets:
//   \offx = randval >> 4, \offy = randval & 0xF,
// each doubled when the corresponding dimension is not subsampled.
.macro calc_offset offx, offy, src, sx, sy
        lsr             \offx, \src,  #4        // randval >> 4
        and             \offy, \src,  #0xF      // randval & 0xF
.if \sx == 0
        lsl             \offx, \offx, #1        // 2 * (randval >> 4)
.endif
.if \sy == 0
        lsl             \offy, \offy, #1        // 2 * (randval & 0xF)
.endif
.endm
| |
// Compute a grain_lut pointer from x/y offsets:
//   \dst = \src + \stride * \offy + 2 * \offx   (entries are 2 bytes wide)
.macro add_offset dst, offx, offy, src, stride
        madd            \dst, \stride, \offy, \src      // grain_lut += grain_stride * offy
        add             \dst, \dst, \offx, uxtw #1      // grain_lut += offx
.endm
| |
| // void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, |
| // const ptrdiff_t stride, |
| // const uint8_t scaling[SCALING_SIZE], |
| // const int scaling_shift, |
| // const entry grain_lut[][GRAIN_WIDTH], |
| // const int offsets[][2], |
| // const int h, const ptrdiff_t clip, |
| // const ptrdiff_t type, |
| // const int bitdepth_max); |
// Luma film-grain application; see the C signature comment above.
// Args: x0 dst, x1 src, x2 stride, x3 scaling LUT, w4 scaling_shift,
//       x5 grain_lut, x6 offsets, w7 h; stack: clip [sp+80] (after the
//       80-byte push), type [sp+88], bitdepth_max [sp+96].
// Sets up clamp limits, grain min/max, overlap coefficients and the four
// grain_lut pointers, then tail-jumps into the matching (ox, oy) loop in
// fgy_loop_neon. Saves d8-d14 because the loops use v8-v14
// (callee-saved low halves per AAPCS64).
function fgy_32x32_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30,      [sp, #-80]!
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        str             d14,      [sp, #64]
        eor             w4,  w4,  #15           // 15 - scaling_shift
        ldr             w11, [x6, #8]           // offsets[1][0]
        ldr             w13, [x6, #4]           // offsets[0][1]
        ldr             w15, [x6, #12]          // offsets[1][1]
        ldr             w10, [sp, #96]          // bitdepth_max
        ldr             w6,  [x6]               // offsets[0][0]
        dup             v26.8h, w10             // bitdepth_max
        clz             w10, w10
        ldr             w8,  [sp, #80]          // clip
        sub             w10, w10, #24           // -bitdepth_min_8
        mov             x9,  #GRAIN_WIDTH*2     // grain_lut stride
        neg             w10, w10                // bitdepth_min_8

        dup             v29.8h, w4              // 15 - scaling_shift
        dup             v27.8h, w10             // bitdepth_min_8

        movrel          x16, overlap_coeffs_0

        cbz             w8,  1f
        // clip to restricted (studio) range, scaled to the bitdepth
        movi            v30.8h, #16
        movi            v31.8h, #235
        sshl            v30.8h, v30.8h, v27.8h
        sshl            v31.8h, v31.8h, v27.8h
        b               2f
1:
        // no clip: full range [0, bitdepth_max]
        movi            v30.8h, #0
        mov             v31.16b, v26.16b        // bitdepth_max
2:

        ushr            v26.8h, v26.8h, #1      // grain_max
        not             v25.16b, v26.16b        // grain_min = ~grain_max

        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs

        add             x5,  x5,  #18           // grain_lut += 9 (entries)
        add             x5,  x5,  x9, lsl #3    // grain_lut += 8 * grain_stride
        add             x5,  x5,  x9            // grain_lut += grain_stride

        // Derive the four grain_lut pointers from the random offsets.
        calc_offset     w11, w12, w11, 0, 0
        calc_offset     w13, w14, w13, 0, 0
        calc_offset     w15, w16, w15, 0, 0
        calc_offset     w6,  w10, w6,  0, 0

        add_offset      x12, w11, x12, x5, x9
        add_offset      x14, w13, x14, x5, x9
        add_offset      x16, w15, x16, x5, x9
        add_offset      x5,  w6,  x10, x5, x9

        ldr             w11, [sp, #88]          // type
        adr             x13, L(fgy_loop_tbl)

        add             x4,  x12, #32*2         // grain_lut += BLOCK_SIZE * bx
        add             x6,  x14, x9, lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by

        tst             w11, #1                 // low bit of type = y-overlap flag
        ldrh            w11, [x13, w11, uxtw #1]

        add             x8,  x16, x9, lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by
        add             x8,  x8,  #32*2         // grain_lut += BLOCK_SIZE * bx

        sub             x11, x13, w11, uxtw     // resolve jump-table entry

        b.eq            1f
        // y overlap: first process 2 blended rows, then the rest plain
        dup             v8.8h,  v27.h[0]
        dup             v9.8h,  v27.h[1]
        mov             w10, w7                 // backup actual h
        mov             w7,  #2
1:
        br              x11
endfunc
| |
// Inner loops for fgy_32x32_16bpc_neon, one per (ox, oy) overlap
// combination, reached through L(fgy_loop_tbl).
// Register/vector roles (established by the entry function above):
//   x0 dst, x1 src, x2 stride, x3 scaling LUT, x9 grain stride,
//   x5 grain_lut, x6 top grain, x4 left old grain, x8 top-left old grain,
//   w7 rows to process, w10 backup of total h (only with \oy),
//   v25/v26 grain min/max, v27/v28 x-overlap coeffs,
//   v8/v9 current y-overlap coeffs, v29 = 15 - scaling_shift,
//   v30/v31 output clamp min/max.
// Scalar/vector ops are interleaved deliberately for scheduling; the
// statement order matters.
function fgy_loop_neon
.macro fgy ox, oy
L(loop_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src
.if \ox
        ld1             {v20.4h}, [x4], x9      // grain_lut old
.endif
.if \oy
        ld1             {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v14.4h}, [x8], x9      // grain_lut top old
.endif
        mvni            v4.8h, #0xf0, lsl #8    // 0x0fff
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        and             v0.16b, v0.16b, v4.16b
        and             v1.16b, v1.16b, v4.16b
        and             v2.16b, v2.16b, v4.16b
        and             v3.16b, v3.16b, v4.16b
        bl              gather32_neon

.if \ox
        // Blend left-edge grain: old*coeff0 + new*coeff1
        smull           v20.4s, v20.4h, v27.4h
        smlal           v20.4s, v16.4h, v28.4h
.endif

.if \oy
.if \ox
        // Blend the top-left corner the same way, then clamp both.
        smull           v14.4s, v14.4h, v27.4h
        smlal           v14.4s, v21.4h, v28.4h
        sqrshrn         v20.4h, v20.4s, #5
        sqrshrn         v14.4h, v14.4s, #5
        smin            v20.4h, v20.4h, v26.4h
        smin            v14.4h, v14.4h, v26.4h
        smax            v20.4h, v20.4h, v25.4h
        smax            v14.4h, v14.4h, v25.4h
.endif

        // Vertical blend: cur*v9 + top*v8, then narrow and clamp.
.if \ox
        smull           v10.4s, v20.4h, v9.4h
.else
        smull           v10.4s, v16.4h, v9.4h
.endif
        smull2          v11.4s, v16.8h, v9.8h
        smull           v12.4s, v17.4h, v9.4h
        smull2          v13.4s, v17.8h, v9.8h
        smull           v16.4s, v18.4h, v9.4h
        smull2          v17.4s, v18.8h, v9.8h
        smull           v18.4s, v19.4h, v9.4h
        smull2          v19.4s, v19.8h, v9.8h
.if \ox
        smlal           v10.4s, v14.4h, v8.4h
.else
        smlal           v10.4s, v21.4h, v8.4h
.endif
        smlal2          v11.4s, v21.8h, v8.8h
        smlal           v12.4s, v22.4h, v8.4h
        smlal2          v13.4s, v22.8h, v8.8h
        smlal           v16.4s, v23.4h, v8.4h
        smlal2          v17.4s, v23.8h, v8.8h
        smlal           v18.4s, v24.4h, v8.4h
        smlal2          v19.4s, v24.8h, v8.8h
        sqrshrn         v10.4h, v10.4s, #5
        sqrshrn2        v10.8h, v11.4s, #5
        sqrshrn         v11.4h, v12.4s, #5
        sqrshrn2        v11.8h, v13.4s, #5
        sqrshrn         v12.4h, v16.4s, #5
        sqrshrn2        v12.8h, v17.4s, #5
        sqrshrn         v13.4h, v18.4s, #5
        sqrshrn2        v13.8h, v19.4s, #5
        smin            v16.8h, v10.8h, v26.8h
        smin            v17.8h, v11.8h, v26.8h
        smin            v18.8h, v12.8h, v26.8h
        smin            v19.8h, v13.8h, v26.8h
        smax            v16.8h, v16.8h, v25.8h
        smax            v17.8h, v17.8h, v25.8h
        smax            v18.8h, v18.8h, v25.8h
        smax            v19.8h, v19.8h, v25.8h
.endif

        uxtl            v4.8h,  v6.8b           // scaling
.if \ox && !\oy
        // Without y-overlap, finish the left-edge blend here (narrow,
        // clamp, insert), interleaved with the scaling widening.
        sqrshrn         v20.4h, v20.4s, #5
.endif
        uxtl2           v5.8h,  v6.16b
.if \ox && !\oy
        smin            v20.4h, v20.4h, v26.4h
.endif
        uxtl            v6.8h,  v7.8b
.if \ox && !\oy
        smax            v20.4h, v20.4h, v25.4h
.endif
        uxtl2           v7.8h,  v7.16b
.if \ox && !\oy
        ins             v16.d[0], v20.d[0]
.endif
        ushl            v4.8h,  v4.8h,  v29.8h  // scaling << (15 - scaling_shift)
        ushl            v5.8h,  v5.8h,  v29.8h
        ushl            v6.8h,  v6.8h,  v29.8h
        ushl            v7.8h,  v7.8h,  v29.8h

        sqrdmulh        v20.8h, v16.8h, v4.8h   // round2(scaling * grain, scaling_shift)
        sqrdmulh        v21.8h, v17.8h, v5.8h
        sqrdmulh        v22.8h, v18.8h, v6.8h
        sqrdmulh        v23.8h, v19.8h, v7.8h

        usqadd          v0.8h,  v20.8h          // *src + noise
        usqadd          v1.8h,  v21.8h
        usqadd          v2.8h,  v22.8h
        usqadd          v3.8h,  v23.8h

        umax            v0.8h,  v0.8h,  v30.8h
        umax            v1.8h,  v1.8h,  v30.8h
        umax            v2.8h,  v2.8h,  v30.8h
        umax            v3.8h,  v3.8h,  v30.8h
        umin            v0.8h,  v0.8h,  v31.8h
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v2.8h,  v2.8h,  v31.8h
        umin            v3.8h,  v3.8h,  v31.8h

        subs            w7,  w7,  #1
.if \oy
        // Switch to the second overlap row's y coefficients.
        dup             v8.8h,  v28.h[0]
        dup             v9.8h,  v28.h[1]
.endif
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
        b.gt            1b

.if \oy
        // After the 2 blended rows, continue without y overlap.
        cmp             w10, #2
        sub             w7,  w10, #2            // restore actual remaining h
        b.gt            L(loop_\ox\()0)
.endif
        ldr             d14,      [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldr             x30, [sp], #80
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endm

fgy             0, 0
fgy             0, 1
fgy             1, 0
fgy             1, 1

L(fgy_loop_tbl):
        .hword L(fgy_loop_tbl) - L(loop_00)
        .hword L(fgy_loop_tbl) - L(loop_01)
        .hword L(fgy_loop_tbl) - L(loop_10)
        .hword L(fgy_loop_tbl) - L(loop_11)
endfunc
| |
| // void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, |
| // const pixel *const src, |
| // const ptrdiff_t stride, |
| // const uint8_t scaling[SCALING_SIZE], |
| // const Dav1dFilmGrainData *const data, |
| // const entry grain_lut[][GRAIN_WIDTH], |
| // const pixel *const luma_row, |
| // const ptrdiff_t luma_stride, |
| // const int offsets[][2], |
| // const ptrdiff_t h, const ptrdiff_t uv, |
| // const ptrdiff_t is_id, |
| // const ptrdiff_t type, |
| // const int bitdepth_max); |
// Chroma film-grain entry point generator; see the C signature comment
// above. \sx/\sy are the horizontal/vertical chroma subsampling flags.
// Args: x0 dst, x1 src, x2 stride, x3 scaling LUT, x4 data, x5 grain_lut,
//       x6 luma_row, x7 luma_stride; stack (after the 80-byte push):
//       offsets/h [sp+80], uv/is_id [sp+96], type [sp+112],
//       bitdepth_max [sp+120].
// Loads the per-plane multipliers/offset, builds clamp limits and
// grain_lut pointers, then tail-jumps into the matching
// (csfl, ox, oy) loop via the sx-specific jump table.
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30,      [sp, #-80]!
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]

        ldp             x8,  x9,  [sp, #80]     // offsets, h
        ldp             x10, x11, [sp, #96]     // uv, is_id
        ldr             w16, [sp, #120]         // bitdepth_max

        ldr             w13, [x4, #FGD_SCALING_SHIFT]
        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        dup             v23.8h, w16             // bitdepth_max
        clz             w16, w16
        eor             w13, w13, #15           // 15 - scaling_shift
        sub             w16, w16, #24           // -bitdepth_min_8

        // !csfl: load the uv_luma_mult/uv_mult/uv_offset for this plane
        add             x10, x4,  x10, lsl #2   // + 4*uv
        add             x14, x10, #FGD_UV_LUMA_MULT
        add             x15, x10, #FGD_UV_MULT
        add             x10, x10, #FGD_UV_OFFSET
        neg             w16, w16                // bitdepth_min_8
        ld1r            {v8.8h},  [x14]         // uv_luma_mult
        ld1r            {v24.8h}, [x10]         // uv_offset
        ld1r            {v9.8h},  [x15]         // uv_mult

        dup             v29.8h, w13             // 15 - scaling_shift
        dup             v27.8h, w16             // bitdepth_min_8

        cbz             w12, 1f
        // clip to restricted (studio) range, scaled to the bitdepth
        movi            v30.8h, #16
        movi            v31.8h, #240
        sshl            v30.8h, v30.8h, v27.8h
        sshl            v31.8h, v31.8h, v27.8h
        cbz             w11, 2f
        // is_id: tighter upper bound for identity matrix coefficients
        movi            v31.8h, #235
        sshl            v31.8h, v31.8h, v27.8h
        b               2f
1:
        // no clip: full range [0, bitdepth_max]
        movi            v30.8h, #0
        mov             v31.16b, v23.16b        // bitdepth_max
2:

        ushr            v15.8h, v23.8h, #1      // grain_max
        sshl            v24.8h, v24.8h, v27.8h  // uv_offset << bitdepth_min_8
        not             v14.16b, v15.16b        // grain_min = ~grain_max

        ldr             w12, [x8, #8]           // offsets[1][0]
        ldr             w14, [x8, #4]           // offsets[0][1]
        ldr             w16, [x8, #12]          // offsets[1][1]
        ldr             w8,  [x8]               // offsets[0][0]

        mov             x10, #GRAIN_WIDTH*2     // grain_lut stride

        add             x5,  x5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
.if \sy
        add             x5,  x5,  x10, lsl #2   // grain_lut += 4 * grain_stride
        add             x5,  x5,  x10, lsl #1   // grain_lut += 2 * grain_stride
.else
        add             x5,  x5,  x10, lsl #3   // grain_lut += 8 * grain_stride
        add             x5,  x5,  x10           // grain_lut += grain_stride
.endif

        // Derive the four grain_lut pointers from the random offsets.
        calc_offset     w12, w13, w12, \sx, \sy
        calc_offset     w14, w15, w14, \sx, \sy
        calc_offset     w16, w17, w16, \sx, \sy
        calc_offset     w8,  w11, w8,  \sx, \sy

        add_offset      x13, w12, x13, x5, x10
        add_offset      x15, w14, x15, x5, x10
        add_offset      x17, w16, x17, x5, x10
        add_offset      x5,  w8,  x11, x5, x10

        add             x4,  x13, #2*(32 >> \sx)        // grain_lut += BLOCK_SIZE * bx
        add             x8,  x15, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
        add             x11, x17, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
        add             x11, x11, #2*(32 >> \sx)        // grain_lut += BLOCK_SIZE * bx

        ldr             w13, [sp, #112]         // type

        movrel          x16, overlap_coeffs_\sx
        adr             x14, L(fguv_loop_sx\sx\()_tbl)

        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
        tst             w13, #1                 // low bit of type = y-overlap flag
        ldrh            w13, [x14, w13, uxtw #1]

        b.eq            1f
        // y overlap: first process the blended row(s), then the rest plain
        sub             w12, w9,  #(2 >> \sy)   // backup remaining h
        mov             w9,  #(2 >> \sy)

1:
        sub             x13, x14, w13, uxtw     // resolve jump-table entry

        // Initial y-overlap coefficients (one row for \sy, two otherwise).
.if \sy
        movi            v25.8h, #23
        movi            v26.8h, #22
.else
        movi            v25.8h, #27
        movi            v26.8h, #17
.endif

.if \sy
        add             x7,  x7,  x7            // luma_stride *= 2
.endif

        br              x13
endfunc
.endm
| |
// Instantiate the chroma entry points: layout, sx, sy (subsampling flags).
fguv            420, 1, 1
fguv            422, 1, 0
fguv            444, 0, 0
| |
// Inner loops for the non-horizontally-subsampled (444) chroma case,
// one per (csfl, ox, oy) combination, reached via L(fguv_loop_sx0_tbl).
// csfl = chroma scaling from luma: when set, the luma pixels themselves
// index the scaling LUT; otherwise a (luma*uv_luma_mult + src*uv_mult)
// combination is used.
// Register/vector roles (established by the fguv entry function):
//   x0 dst, x1 src, x2 stride, x3 scaling LUT, x10 grain stride,
//   x5 grain_lut, x6 luma row, x7 luma stride, x8 top grain,
//   x4 left old grain, x11 top-left old grain, w9 rows,
//   w12 backup of remaining h (only with \oy),
//   v8 uv_luma_mult, v9 uv_mult, v24 uv_offset << bitdepth_min_8,
//   v23 bitdepth_max, v14/v15 grain min/max, v25/v26 y-overlap coeffs,
//   v27/v28 x-overlap coeffs, v29 = 15 - scaling_shift,
//   v30/v31 output clamp min/max.
function fguv_loop_sx0_neon
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
.if \ox
        ld1             {v4.4h}, [x4], x10      // grain_lut old
.endif
.if \oy
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v5.4h}, [x11], x10     // grain_lut top old
.endif
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut

.if \ox
        // Blend left-edge grain: old*coeff0 + new*coeff1
        smull           v4.4s,  v4.4h,  v27.4h
        smlal           v4.4s,  v16.4h, v28.4h
.endif

.if \oy
.if \ox
        // Blend the top-left corner, clamp, and insert both results.
        smull           v5.4s,  v5.4h,  v27.4h
        smlal           v5.4s,  v0.4h,  v28.4h
        sqrshrn         v4.4h,  v4.4s,  #5
        sqrshrn         v5.4h,  v5.4s,  #5
        smin            v4.4h,  v4.4h,  v15.4h
        smin            v5.4h,  v5.4h,  v15.4h
        smax            v4.4h,  v4.4h,  v14.4h
        smax            v5.4h,  v5.4h,  v14.4h
        ins             v16.d[0], v4.d[0]
        ins             v0.d[0],  v5.d[0]
.endif

        // Vertical blend: cur*v26 + top*v25, then narrow.
        smull           v6.4s,  v16.4h, v26.4h
        smull2          v7.4s,  v16.8h, v26.8h
        smull           v10.4s, v17.4h, v26.4h
        smull2          v11.4s, v17.8h, v26.8h
        smull           v16.4s, v18.4h, v26.4h
        smull2          v17.4s, v18.8h, v26.8h
        smull           v18.4s, v19.4h, v26.4h
        smull2          v19.4s, v19.8h, v26.8h
        smlal           v6.4s,  v0.4h,  v25.4h
        smlal2          v7.4s,  v0.8h,  v25.8h
        smlal           v10.4s, v1.4h,  v25.4h
        smlal2          v11.4s, v1.8h,  v25.8h
        smlal           v16.4s, v2.4h,  v25.4h
        smlal2          v17.4s, v2.8h,  v25.8h
        smlal           v18.4s, v3.4h,  v25.4h
        smlal2          v19.4s, v3.8h,  v25.8h
        sqrshrn         v6.4h,  v6.4s,  #5
        sqrshrn2        v6.8h,  v7.4s,  #5
        sqrshrn         v7.4h,  v10.4s, #5
        sqrshrn2        v7.8h,  v11.4s, #5
        sqrshrn         v10.4h, v16.4s, #5
        sqrshrn2        v10.8h, v17.4s, #5
        sqrshrn         v11.4h, v18.4s, #5
        sqrshrn2        v11.8h, v19.4s, #5
.endif

.if \ox && !\oy
        // Without y-overlap: finish the left-edge blend, interleaved
        // with the luma load below.
        sqrshrn         v4.4h,  v4.4s,  #5
        smin            v4.4h,  v4.4h,  v15.4h
.endif
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
.if \oy
        smin            v16.8h, v6.8h,  v15.8h
        smin            v17.8h, v7.8h,  v15.8h
        smin            v18.8h, v10.8h, v15.8h
        smin            v19.8h, v11.8h, v15.8h
        smax            v16.8h, v16.8h, v14.8h
        smax            v17.8h, v17.8h, v14.8h
        smax            v18.8h, v18.8h, v14.8h
        smax            v19.8h, v19.8h, v14.8h
.endif

.if \ox && !\oy
        smax            v4.4h,  v4.4h,  v14.4h
.endif
        ld1             {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src
.if \ox && !\oy
        ins             v16.d[0], v4.d[0]
.endif

.if !\csfl
        // Scaling index = clip((luma*uv_luma_mult + src*uv_mult) >> 6
        //                      + uv_offset, 0, bitdepth_max)
        smull           v4.4s,  v0.4h,  v8.4h
        smull2          v5.4s,  v0.8h,  v8.8h
        smull           v6.4s,  v1.4h,  v8.4h
        smull2          v7.4s,  v1.8h,  v8.8h
        smull           v0.4s,  v2.4h,  v8.4h
        smull2          v1.4s,  v2.8h,  v8.8h
        smull           v2.4s,  v3.4h,  v8.4h
        smull2          v3.4s,  v3.8h,  v8.8h
        smlal           v4.4s,  v10.4h, v9.4h
        smlal2          v5.4s,  v10.8h, v9.8h
        smlal           v6.4s,  v11.4h, v9.4h
        smlal2          v7.4s,  v11.8h, v9.8h
        smlal           v0.4s,  v12.4h, v9.4h
        smlal2          v1.4s,  v12.8h, v9.8h
        smlal           v2.4s,  v13.4h, v9.4h
        smlal2          v3.4s,  v13.8h, v9.8h
        shrn            v4.4h,  v4.4s,  #6
        shrn2           v4.8h,  v5.4s,  #6
        shrn            v5.4h,  v6.4s,  #6
        shrn2           v5.8h,  v7.4s,  #6
        shrn            v6.4h,  v0.4s,  #6
        shrn2           v6.8h,  v1.4s,  #6
        shrn            v7.4h,  v2.4s,  #6
        shrn2           v7.8h,  v3.4s,  #6
        add             v0.8h,  v4.8h,  v24.8h
        add             v1.8h,  v5.8h,  v24.8h
        add             v2.8h,  v6.8h,  v24.8h
        add             v3.8h,  v7.8h,  v24.8h
        movi            v20.8h, #0
        smin            v0.8h,  v0.8h,  v23.8h
        smin            v1.8h,  v1.8h,  v23.8h
        smin            v2.8h,  v2.8h,  v23.8h
        smin            v3.8h,  v3.8h,  v23.8h
        smax            v0.8h,  v0.8h,  v20.8h
        smax            v1.8h,  v1.8h,  v20.8h
        smax            v2.8h,  v2.8h,  v20.8h
        smax            v3.8h,  v3.8h,  v20.8h
.else
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        and             v0.16b, v0.16b, v23.16b
        and             v1.16b, v1.16b, v23.16b
        and             v2.16b, v2.16b, v23.16b
        and             v3.16b, v3.16b, v23.16b
.endif

        bl              gather32_neon

        uxtl            v4.8h,  v6.8b           // scaling
        uxtl2           v5.8h,  v6.16b
        uxtl            v6.8h,  v7.8b
        uxtl2           v7.8h,  v7.16b

        ushl            v4.8h,  v4.8h,  v29.8h  // scaling << (15 - scaling_shift)
        ushl            v5.8h,  v5.8h,  v29.8h
        ushl            v6.8h,  v6.8h,  v29.8h
        ushl            v7.8h,  v7.8h,  v29.8h

        sqrdmulh        v16.8h, v16.8h, v4.8h   // round2(scaling * grain, scaling_shift)
        sqrdmulh        v17.8h, v17.8h, v5.8h
        sqrdmulh        v18.8h, v18.8h, v6.8h
        sqrdmulh        v19.8h, v19.8h, v7.8h

        usqadd          v10.8h, v16.8h          // *src + noise
        usqadd          v11.8h, v17.8h
        usqadd          v12.8h, v18.8h
        usqadd          v13.8h, v19.8h

        umax            v0.8h,  v10.8h, v30.8h
        umax            v1.8h,  v11.8h, v30.8h
        umax            v2.8h,  v12.8h, v30.8h
        umax            v3.8h,  v13.8h, v30.8h
        umin            v0.8h,  v0.8h,  v31.8h
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v2.8h,  v2.8h,  v31.8h
        umin            v3.8h,  v3.8h,  v31.8h

        subs            w9,  w9,  #1
.if \oy
        // Switch to the second overlap row's y coefficients.
        dup             v25.8h, v28.h[0]
        dup             v26.8h, v28.h[1]
.endif
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
        b.gt            1b

.if \oy
        // After the blended rows, continue without y overlap.
        cmp             w12, #0
        mov             w9,  w12                // restore actual remaining h
        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
.endif
        b               9f
.endm
fguv_loop_sx0   0, 0, 0
fguv_loop_sx0   0, 0, 1
fguv_loop_sx0   0, 1, 0
fguv_loop_sx0   0, 1, 1
fguv_loop_sx0   1, 0, 0
fguv_loop_sx0   1, 0, 1
fguv_loop_sx0   1, 1, 0
fguv_loop_sx0   1, 1, 1

9:
        // Shared epilogue for all variants above.
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldr             x30, [sp], #80
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(fguv_loop_sx0_tbl):
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
endfunc
| |
// Inner loops for the horizontally-subsampled (420/422) chroma cases,
// one per (csfl, ox, oy) combination, reached via L(fguv_loop_sx1_tbl).
// Processes 16 chroma pixels per row; each pair of luma pixels is
// averaged (addp + urshr) to match the chroma width.
// Register/vector roles are the same as in fguv_loop_sx0_neon.
function fguv_loop_sx1_neon
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
.if \ox
        ld1             {v18.4h}, [x4], x10     // grain_lut old
.endif
.if \oy
        ld1             {v20.8h, v21.8h}, [x8], x10 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v19.4h}, [x11], x10    // grain_lut top old
.endif
        ld1             {v16.8h, v17.8h}, [x5], x10 // grain_lut

.if \ox
        // Blend left-edge grain: old*coeff0 + new*coeff1
        smull           v18.4s, v18.4h, v27.4h
        smlal           v18.4s, v16.4h, v28.4h
.endif

.if \oy
.if \ox
        // Blend the top-left corner, clamp, and insert both results.
        smull           v19.4s, v19.4h, v27.4h
        smlal           v19.4s, v20.4h, v28.4h
        sqrshrn         v18.4h, v18.4s, #5
        sqrshrn         v19.4h, v19.4s, #5
        smin            v18.4h, v18.4h, v15.4h
        smin            v19.4h, v19.4h, v15.4h
        smax            v18.4h, v18.4h, v14.4h
        smax            v19.4h, v19.4h, v14.4h
        ins             v16.d[0], v18.d[0]
        ins             v20.d[0], v19.d[0]
.endif

        // Vertical blend: cur*v26 + top*v25, then narrow.
        smull           v0.4s,  v16.4h, v26.4h
        smull2          v1.4s,  v16.8h, v26.8h
        smull           v2.4s,  v17.4h, v26.4h
        smull2          v3.4s,  v17.8h, v26.8h
        smlal           v0.4s,  v20.4h, v25.4h
        smlal2          v1.4s,  v20.8h, v25.8h
        smlal           v2.4s,  v21.4h, v25.4h
        smlal2          v3.4s,  v21.8h, v25.8h
        sqrshrn         v16.4h, v0.4s,  #5
        sqrshrn2        v16.8h, v1.4s,  #5
        sqrshrn         v17.4h, v2.4s,  #5
        sqrshrn2        v17.8h, v3.4s,  #5
.endif

.if \ox && !\oy
        // Without y-overlap: finish the left-edge blend, interleaved
        // with the luma load below.
        sqrshrn         v18.4h, v18.4s, #5
        smin            v18.4h, v18.4h, v15.4h
.endif
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
.if \oy
        smin            v16.8h, v16.8h, v15.8h
        smin            v17.8h, v17.8h, v15.8h
        smax            v16.8h, v16.8h, v14.8h
        smax            v17.8h, v17.8h, v14.8h
.endif

.if \ox && !\oy
        smax            v18.4h, v18.4h, v14.4h
.endif
        ld1             {v10.8h, v11.8h}, [x1], x2 // src
.if \ox && !\oy
        ins             v16.d[0], v18.d[0]
.endif
        // Average each horizontal pair of luma pixels.
        addp            v0.8h,  v0.8h,  v1.8h
        addp            v1.8h,  v2.8h,  v3.8h
        urshr           v0.8h,  v0.8h,  #1
        urshr           v1.8h,  v1.8h,  #1
.if !\csfl
        // Scaling index = clip((luma*uv_luma_mult + src*uv_mult) >> 6
        //                      + uv_offset, 0, bitdepth_max)
        smull           v2.4s,  v0.4h,  v8.4h
        smull2          v3.4s,  v0.8h,  v8.8h
        smull           v0.4s,  v1.4h,  v8.4h
        smull2          v1.4s,  v1.8h,  v8.8h
        smlal           v2.4s,  v10.4h, v9.4h
        smlal2          v3.4s,  v10.8h, v9.8h
        smlal           v0.4s,  v11.4h, v9.4h
        smlal2          v1.4s,  v11.8h, v9.8h
        shrn            v2.4h,  v2.4s,  #6
        shrn2           v2.8h,  v3.4s,  #6
        shrn            v3.4h,  v0.4s,  #6
        shrn2           v3.8h,  v1.4s,  #6
        add             v0.8h,  v2.8h,  v24.8h
        add             v1.8h,  v3.8h,  v24.8h
        movi            v2.8h,  #0
        smin            v0.8h,  v0.8h,  v23.8h
        smin            v1.8h,  v1.8h,  v23.8h
        smax            v0.8h,  v0.8h,  v2.8h
        smax            v1.8h,  v1.8h,  v2.8h
.else
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        and             v0.16b, v0.16b, v23.16b
        and             v1.16b, v1.16b, v23.16b
.endif

        bl              gather16_neon

        uxtl            v4.8h,  v6.8b           // scaling
        uxtl2           v5.8h,  v6.16b

        ushl            v4.8h,  v4.8h,  v29.8h  // scaling << (15 - scaling_shift)
        ushl            v5.8h,  v5.8h,  v29.8h

        sqrdmulh        v16.8h, v16.8h, v4.8h   // round2(scaling * grain, scaling_shift)
        sqrdmulh        v17.8h, v17.8h, v5.8h

        usqadd          v10.8h, v16.8h          // *src + noise
        usqadd          v11.8h, v17.8h

        umax            v0.8h,  v10.8h, v30.8h
        umax            v1.8h,  v11.8h, v30.8h
        umin            v0.8h,  v0.8h,  v31.8h
        umin            v1.8h,  v1.8h,  v31.8h

.if \oy
        // Swap the y-overlap coefficient pair for the next row.
        mov             v16.16b, v25.16b
.endif
        subs            w9,  w9,  #1
.if \oy
        mov             v25.16b, v26.16b
        mov             v26.16b, v16.16b
.endif
        st1             {v0.8h, v1.8h}, [x0], x2 // dst
        b.gt            1b

.if \oy
        // After the blended row(s), continue without y overlap.
        cmp             w12, #0
        mov             w9,  w12                // restore actual remaining h
        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm
fguv_loop_sx1   0, 0, 0
fguv_loop_sx1   0, 0, 1
fguv_loop_sx1   0, 1, 0
fguv_loop_sx1   0, 1, 1
fguv_loop_sx1   1, 0, 0
fguv_loop_sx1   1, 0, 1
fguv_loop_sx1   1, 1, 0
fguv_loop_sx1   1, 1, 1

9:
        // Shared epilogue for all variants above.
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldr             x30, [sp], #80
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(fguv_loop_sx1_tbl):
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
endfunc