| /* |
| * Copyright © 2019, VideoLAN and dav1d authors |
| * Copyright © 2019, Martin Storsjo |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, this |
| * list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "src/arm/asm.S" |
| #include "util.S" |
| |
| #define BUF_POS 0 |
| #define BUF_END 8 |
| #define DIF 16 |
| #define RNG 24 |
| #define CNT 28 |
| #define ALLOW_UPDATE_CDF 32 |
| |
| const coeffs |
| .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 |
| .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| endconst |
| |
| const bits |
| .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 |
| .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 |
| endconst |
| |
| .macro ld1_n d0, d1, src, sz, n |
| .if \n <= 8 |
| ld1 {\d0\sz}, [\src] |
| .else |
| ld1 {\d0\sz, \d1\sz}, [\src] |
| .endif |
| .endm |
| |
| .macro st1_n s0, s1, dst, sz, n |
| .if \n <= 8 |
| st1 {\s0\sz}, [\dst] |
| .else |
| st1 {\s0\sz, \s1\sz}, [\dst] |
| .endif |
| .endm |
| |
| .macro ushr_n d0, d1, s0, s1, shift, sz, n |
| ushr \d0\sz, \s0\sz, \shift |
| .if \n == 16 |
| ushr \d1\sz, \s1\sz, \shift |
| .endif |
| .endm |
| |
| .macro add_n d0, d1, s0, s1, s2, s3, sz, n |
| add \d0\sz, \s0\sz, \s2\sz |
| .if \n == 16 |
| add \d1\sz, \s1\sz, \s3\sz |
| .endif |
| .endm |
| |
| .macro sub_n d0, d1, s0, s1, s2, s3, sz, n |
| sub \d0\sz, \s0\sz, \s2\sz |
| .if \n == 16 |
| sub \d1\sz, \s1\sz, \s3\sz |
| .endif |
| .endm |
| |
| .macro and_n d0, d1, s0, s1, s2, s3, sz, n |
| and \d0\sz, \s0\sz, \s2\sz |
| .if \n == 16 |
| and \d1\sz, \s1\sz, \s3\sz |
| .endif |
| .endm |
| |
| .macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n |
| cmhs \d0\sz, \s0\sz, \s2\sz |
| .if \n == 16 |
| cmhs \d1\sz, \s1\sz, \s3\sz |
| .endif |
| .endm |
| |
| .macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n |
| urhadd \d0\sz, \s0\sz, \s2\sz |
| .if \n == 16 |
| urhadd \d1\sz, \s1\sz, \s3\sz |
| .endif |
| .endm |
| |
| .macro sshl_n d0, d1, s0, s1, s2, s3, sz, n |
| sshl \d0\sz, \s0\sz, \s2\sz |
| .if \n == 16 |
| sshl \d1\sz, \s1\sz, \s3\sz |
| .endif |
| .endm |
| |
| .macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n |
| sqdmulh \d0\sz, \s0\sz, \s2\sz |
| .if \n == 16 |
| sqdmulh \d1\sz, \s1\sz, \s3\sz |
| .endif |
| .endm |
| |
| .macro str_n idx0, idx1, dstreg, dstoff, n |
| str \idx0, [\dstreg, \dstoff] |
| .if \n == 16 |
| str \idx1, [\dstreg, \dstoff + 16] |
| .endif |
| .endm |
| |
| // unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, |
| // size_t n_symbols); |
| |
| function msac_decode_symbol_adapt4_neon, export=1 |
| .macro decode_update sz, szb, n |
| sub sp, sp, #48 |
| add x8, x0, #RNG |
| ld1_n v0, v1, x1, \sz, \n // cdf |
| ld1r {v4\sz}, [x8] // rng |
| movrel x9, coeffs, 30 |
| movi v31\sz, #0x7f, lsl #8 // 0x7f00 |
| sub x9, x9, x2, lsl #1 |
| mvni v30\sz, #0x3f // 0xffc0 |
| and v7\szb, v4\szb, v31\szb // rng & 0x7f00 |
| str h4, [sp, #14] // store original u = s->rng |
| and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0 |
| |
| ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret) |
| sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 |
| add x8, x0, #DIF + 6 |
| |
| add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) |
| add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) |
| |
| ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16) |
| movrel x8, bits |
| str_n q4, q5, sp, #16, \n // store v values to allow indexed access |
| |
| ld1_n v16, v17, x8, .8h, \n |
| |
| cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n // c >= v |
| |
| and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask |
| .if \n == 16 |
| add v6.8h, v6.8h, v7.8h |
| .endif |
| addv h6, v6.8h // Aggregate mask bits |
| ldr w4, [x0, #ALLOW_UPDATE_CDF] |
| umov w3, v6.h[0] |
| rbit w3, w3 |
| clz w15, w3 // ret |
| |
| cbz w4, L(renorm) |
| // update_cdf |
| ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols] |
| movi v5\szb, #0xff |
| .if \n == 16 |
| mov w4, #-5 |
| .else |
| mvn w14, w2 |
| mov w4, #-4 |
| cmn w14, #3 // set C if n_symbols <= 2 |
| .endif |
| urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768 |
| .if \n == 16 |
| sub w4, w4, w3, lsr #4 // -((count >> 4) + 5) |
| .else |
| lsr w14, w3, #4 // count >> 4 |
| sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4) |
| .endif |
| sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i]) |
| dup v6\sz, w4 // -rate |
| |
| sub w3, w3, w3, lsr #5 // count - (count == 32) |
| sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0) |
| sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate |
| add w3, w3, #1 // count + (count < 32) |
| add_n v0, v1, v0, v1, v4, v5, \sz, \n // cdf + (32768 - cdf[i]) >> rate |
| st1_n v0, v1, x1, \sz, \n |
| strh w3, [x1, x2, lsl #1] |
| .endm |
| |
| decode_update .4h, .8b, 4 |
| |
| L(renorm): |
| add x8, sp, #16 |
| add x8, x8, w15, uxtw #1 |
| ldrh w3, [x8] // v |
| ldurh w4, [x8, #-2] // u |
| ldr w6, [x0, #CNT] |
| ldr x7, [x0, #DIF] |
| sub w4, w4, w3 // rng = u - v |
| clz w5, w4 // clz(rng) |
| eor w5, w5, #16 // d = clz(rng) ^ 16 |
| mvn x7, x7 // ~dif |
| add x7, x7, x3, lsl #48 // ~dif + (v << 48) |
| L(renorm2): |
| lsl w4, w4, w5 // rng << d |
| subs w6, w6, w5 // cnt -= d |
| lsl x7, x7, x5 // (~dif + (v << 48)) << d |
| str w4, [x0, #RNG] |
| mvn x7, x7 // ~dif |
| b.hs 9f |
| |
| // refill |
| ldp x3, x4, [x0] // BUF_POS, BUF_END |
| add x5, x3, #8 |
| cmp x5, x4 |
| b.gt 2f |
| |
| ldr x3, [x3] // next_bits |
| add w8, w6, #23 // shift_bits = cnt + 23 |
| add w6, w6, #16 // cnt += 16 |
| rev x3, x3 // next_bits = bswap(next_bits) |
| sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 |
| and w8, w8, #24 // shift_bits &= 24 |
| lsr x3, x3, x8 // next_bits >>= shift_bits |
| sub w8, w8, w6 // shift_bits -= 16 + cnt |
| str x5, [x0, #BUF_POS] |
| lsl x3, x3, x8 // next_bits <<= shift_bits |
| mov w4, #48 |
| sub w6, w4, w8 // cnt = cnt + 64 - shift_bits |
| eor x7, x7, x3 // dif ^= next_bits |
| b 9f |
| |
| 2: // refill_eob |
| mov w14, #40 |
| sub w5, w14, w6 // c = 40 - cnt |
| 3: |
| cmp x3, x4 |
| b.ge 4f |
| ldrb w8, [x3], #1 |
| lsl x8, x8, x5 |
| eor x7, x7, x8 |
| subs w5, w5, #8 |
| b.ge 3b |
| |
| 4: // refill_eob_end |
| str x3, [x0, #BUF_POS] |
| sub w6, w14, w5 // cnt = 40 - c |
| |
| 9: |
| str w6, [x0, #CNT] |
| str x7, [x0, #DIF] |
| |
| mov w0, w15 |
| add sp, sp, #48 |
| ret |
| endfunc |
| |
| function msac_decode_symbol_adapt8_neon, export=1 |
| decode_update .8h, .16b, 8 |
| b L(renorm) |
| endfunc |
| |
| function msac_decode_symbol_adapt16_neon, export=1 |
| decode_update .8h, .16b, 16 |
| b L(renorm) |
| endfunc |
| |
| function msac_decode_hi_tok_neon, export=1 |
| ld1 {v0.4h}, [x1] // cdf |
| add x16, x0, #RNG |
| movi v31.4h, #0x7f, lsl #8 // 0x7f00 |
| movrel x17, coeffs, 30-2*3 |
| mvni v30.4h, #0x3f // 0xffc0 |
| ldrh w9, [x1, #6] // count = cdf[n_symbols] |
| ld1r {v3.4h}, [x16] // rng |
| movrel x16, bits |
| ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret) |
| add x17, x0, #DIF + 6 |
| ld1 {v16.8h}, [x16] |
| mov w13, #-24 |
| and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 |
| ldr w10, [x0, #ALLOW_UPDATE_CDF] |
| ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16) |
| sub sp, sp, #48 |
| ldr w6, [x0, #CNT] |
| ldr x7, [x0, #DIF] |
| 1: |
| and v7.8b, v3.8b, v31.8b // rng & 0x7f00 |
| sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 |
| add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret) |
| add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) |
| str h3, [sp, #14] // store original u = s->rng |
| cmhs v2.8h, v1.8h, v4.8h // c >= v |
| str q4, [sp, #16] // store v values to allow indexed access |
| and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask |
| addv h6, v6.8h // Aggregate mask bits |
| umov w3, v6.h[0] |
| add w13, w13, #5 |
| rbit w3, w3 |
| add x8, sp, #16 |
| clz w15, w3 // ret |
| |
| cbz w10, 2f |
| // update_cdf |
| movi v5.8b, #0xff |
| mov w4, #-5 |
| urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768 |
| sub w4, w4, w9, lsr #4 // -((count >> 4) + 5) |
| sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) |
| dup v6.4h, w4 // -rate |
| |
| sub w9, w9, w9, lsr #5 // count - (count == 32) |
| sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0) |
| sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate |
| add w9, w9, #1 // count + (count < 32) |
| add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate |
| st1 {v0.4h}, [x1] |
| and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 |
| strh w9, [x1, #6] |
| |
| 2: |
| add x8, x8, w15, uxtw #1 |
| ldrh w3, [x8] // v |
| ldurh w4, [x8, #-2] // u |
| sub w4, w4, w3 // rng = u - v |
| clz w5, w4 // clz(rng) |
| eor w5, w5, #16 // d = clz(rng) ^ 16 |
| mvn x7, x7 // ~dif |
| add x7, x7, x3, lsl #48 // ~dif + (v << 48) |
| lsl w4, w4, w5 // rng << d |
| subs w6, w6, w5 // cnt -= d |
| lsl x7, x7, x5 // (~dif + (v << 48)) << d |
| str w4, [x0, #RNG] |
| dup v3.4h, w4 |
| mvn x7, x7 // ~dif |
| b.hs 9f |
| |
| // refill |
| ldp x3, x4, [x0] // BUF_POS, BUF_END |
| add x5, x3, #8 |
| cmp x5, x4 |
| b.gt 2f |
| |
| ldr x3, [x3] // next_bits |
| add w8, w6, #23 // shift_bits = cnt + 23 |
| add w6, w6, #16 // cnt += 16 |
| rev x3, x3 // next_bits = bswap(next_bits) |
| sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 |
| and w8, w8, #24 // shift_bits &= 24 |
| lsr x3, x3, x8 // next_bits >>= shift_bits |
| sub w8, w8, w6 // shift_bits -= 16 + cnt |
| str x5, [x0, #BUF_POS] |
| lsl x3, x3, x8 // next_bits <<= shift_bits |
| mov w4, #48 |
| sub w6, w4, w8 // cnt = cnt + 64 - shift_bits |
| eor x7, x7, x3 // dif ^= next_bits |
| b 9f |
| |
| 2: // refill_eob |
| mov w14, #40 |
| sub w5, w14, w6 // c = 40 - cnt |
| 3: |
| cmp x3, x4 |
| b.ge 4f |
| ldrb w8, [x3], #1 |
| lsl x8, x8, x5 |
| eor x7, x7, x8 |
| subs w5, w5, #8 |
| b.ge 3b |
| |
| 4: // refill_eob_end |
| str x3, [x0, #BUF_POS] |
| sub w6, w14, w5 // cnt = 40 - c |
| |
| 9: |
| lsl w15, w15, #1 |
| sub w15, w15, #5 |
| lsr x12, x7, #48 |
| adds w13, w13, w15 // carry = tok_br < 3 || tok == 15 |
| dup v1.8h, w12 |
| b.cc 1b // loop if !carry |
| add w13, w13, #30 |
| str w6, [x0, #CNT] |
| add sp, sp, #48 |
| str x7, [x0, #DIF] |
| lsr w0, w13, #1 |
| ret |
| endfunc |
| |
| function msac_decode_bool_equi_neon, export=1 |
| ldp w5, w6, [x0, #RNG] // + CNT |
| sub sp, sp, #48 |
| ldr x7, [x0, #DIF] |
| bic w4, w5, #0xff // r &= 0xff00 |
| add w4, w4, #8 |
| subs x8, x7, x4, lsl #47 // dif - vw |
| lsr w4, w4, #1 // v |
| sub w5, w5, w4 // r - v |
| cset w15, lo |
| csel w4, w5, w4, hs // if (ret) v = r - v; |
| csel x7, x8, x7, hs // if (ret) dif = dif - vw; |
| |
| clz w5, w4 // clz(rng) |
| mvn x7, x7 // ~dif |
| eor w5, w5, #16 // d = clz(rng) ^ 16 |
| b L(renorm2) |
| endfunc |
| |
| function msac_decode_bool_neon, export=1 |
| ldp w5, w6, [x0, #RNG] // + CNT |
| sub sp, sp, #48 |
| ldr x7, [x0, #DIF] |
| lsr w4, w5, #8 // r >> 8 |
| bic w1, w1, #0x3f // f &= ~63 |
| mul w4, w4, w1 |
| lsr w4, w4, #7 |
| add w4, w4, #4 // v |
| subs x8, x7, x4, lsl #48 // dif - vw |
| sub w5, w5, w4 // r - v |
| cset w15, lo |
| csel w4, w5, w4, hs // if (ret) v = r - v; |
| csel x7, x8, x7, hs // if (ret) dif = dif - vw; |
| |
| clz w5, w4 // clz(rng) |
| mvn x7, x7 // ~dif |
| eor w5, w5, #16 // d = clz(rng) ^ 16 |
| b L(renorm2) |
| endfunc |
| |
| function msac_decode_bool_adapt_neon, export=1 |
| ldr w9, [x1] // cdf[0-1] |
| ldp w5, w6, [x0, #RNG] // + CNT |
| sub sp, sp, #48 |
| ldr x7, [x0, #DIF] |
| lsr w4, w5, #8 // r >> 8 |
| and w2, w9, #0xffc0 // f &= ~63 |
| mul w4, w4, w2 |
| lsr w4, w4, #7 |
| add w4, w4, #4 // v |
| subs x8, x7, x4, lsl #48 // dif - vw |
| sub w5, w5, w4 // r - v |
| cset w15, lo |
| csel w4, w5, w4, hs // if (ret) v = r - v; |
| csel x7, x8, x7, hs // if (ret) dif = dif - vw; |
| |
| ldr w10, [x0, #ALLOW_UPDATE_CDF] |
| |
| clz w5, w4 // clz(rng) |
| mvn x7, x7 // ~dif |
| eor w5, w5, #16 // d = clz(rng) ^ 16 |
| |
| cbz w10, L(renorm2) |
| |
| lsr w2, w9, #16 // count = cdf[1] |
| and w9, w9, #0xffff // cdf[0] |
| |
| sub w3, w2, w2, lsr #5 // count - (count >= 32) |
| lsr w2, w2, #4 // count >> 4 |
| add w10, w3, #1 // count + (count < 32) |
| add w2, w2, #4 // rate = (count >> 4) | 4 |
| |
| sub w9, w9, w15 // cdf[0] -= bit |
| sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769} |
| asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate |
| sub w9, w9, w11 // cdf[0] |
| |
| strh w9, [x1] |
| strh w10, [x1, #2] |
| |
| b L(renorm2) |
| endfunc |