third_party/libdav1d/src/arm/64/msac.S - cobalt - Git at Google

 /*
  * Copyright © 2019, VideoLAN and dav1d authors
  * Copyright © 2019, Martin Storsjo
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice, this
  *    list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  *    this list of conditions and the following disclaimer in the documentation
  *    and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "src/arm/asm.S"
 #include "util.S"

 // Byte offsets of the context fields that this file reaches through x0.
 // NOTE(review): these have to mirror the C-side MsacContext layout, which is
 // not visible from here - confirm against the struct definition.
 #define BUF_POS 0
 #define BUF_END 8
 #define DIF 16
 #define RNG 24
 #define CNT 28
 #define ALLOW_UPDATE_CDF 32

 // Multiples of 4 counting down from 60 to 0, followed by 16 zero halfwords.
 // decode_update loads from byte offset 30 - 2 * n_symbols in this table, so
 // the loaded lanes hold the "EC_MIN_PROB * (n_symbols - ret)" term and the
 // lanes past the last symbol read as zero.
 const coeffs
         .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
         .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
 endconst

 // One distinct bit per halfword lane (lane i holds 1 << i). The decoders AND
 // this with the per-lane compare mask and sum the lanes, which turns the
 // vector comparison result into a scalar bitmask.
 const bits
         .short   0x1,   0x2,   0x4,   0x8,   0x10,   0x20,   0x40,   0x80
         .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
 endconst

 // Load vector data from [src] using arrangement suffix sz: the register
 // pair d0, d1 when n is above 8, otherwise d0 alone.
 .macro ld1_n d0, d1, src, sz, n
 .if \n > 8
         ld1             {\d0\sz, \d1\sz},  [\src]
 .else
         ld1             {\d0\sz},  [\src]
 .endif
 .endm

 // Store vector data to [dst] using arrangement suffix sz: the register
 // pair s0, s1 when n is above 8, otherwise s0 alone.
 .macro st1_n s0, s1, dst, sz, n
 .if \n > 8
         st1             {\s0\sz, \s1\sz},  [\dst]
 .else
         st1             {\s0\sz},  [\dst]
 .endif
 .endm

 // Unsigned right shift by the immediate in shift: d0 = s0 >> shift, plus
 // d1 = s1 >> shift when n == 16. No invocation appears in the code shown here.
 .macro ushr_n d0, d1, s0, s1, shift, sz, n
         ushr            \d0\sz,  \s0\sz,  \shift
 .if \n == 16
         ushr            \d1\sz,  \s1\sz,  \shift
 .endif
 .endm

 // Lane-wise add: d0 = s0 + s2, plus d1 = s1 + s3 when n == 16.
 // sz is the arrangement suffix applied to every operand.
 .macro add_n d0, d1, s0, s1, s2, s3, sz, n
         add             \d0\sz,  \s0\sz,  \s2\sz
 .if \n == 16
         add             \d1\sz,  \s1\sz,  \s3\sz
 .endif
 .endm

 // Lane-wise subtract: d0 = s0 - s2, plus d1 = s1 - s3 when n == 16.
 .macro sub_n d0, d1, s0, s1, s2, s3, sz, n
         sub             \d0\sz,  \s0\sz,  \s2\sz
 .if \n == 16
         sub             \d1\sz,  \s1\sz,  \s3\sz
 .endif
 .endm

 // Bitwise AND: d0 = s0 & s2, plus d1 = s1 & s3 when n == 16.
 // Callers pass a byte arrangement (.8b/.16b) as sz.
 .macro and_n d0, d1, s0, s1, s2, s3, sz, n
         and             \d0\sz,  \s0\sz,  \s2\sz
 .if \n == 16
         and             \d1\sz,  \s1\sz,  \s3\sz
 .endif
 .endm

 // Unsigned "higher or same" compare: each lane of d0 becomes all ones where
 // s0 >= s2 and zero otherwise; d1 gets the same for s1 >= s3 when n == 16.
 .macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
         cmhs            \d0\sz,  \s0\sz,  \s2\sz
 .if \n == 16
         cmhs            \d1\sz,  \s1\sz,  \s3\sz
 .endif
 .endm

 // Unsigned rounding halving add: d0 = (s0 + s2 + 1) >> 1 per lane, plus the
 // same for d1 from s1, s3 when n == 16.
 .macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n
         urhadd          \d0\sz,  \s0\sz,  \s2\sz
 .if \n == 16
         urhadd          \d1\sz,  \s1\sz,  \s3\sz
 .endif
 .endm

 // Signed shift by a per-lane register amount: d0 = s0 shifted by s2, plus
 // d1 = s1 shifted by s3 when n == 16. The caller passes a negative amount
 // (-rate) to get an arithmetic right shift.
 .macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
         sshl            \d0\sz,  \s0\sz,  \s2\sz
 .if \n == 16
         sshl            \d1\sz,  \s1\sz,  \s3\sz
 .endif
 .endm

 // Signed saturating doubling multiply, high half: d0 = (2 * s0 * s2) >> 16
 // per halfword lane, plus the same for d1 from s1, s3 when n == 16.
 .macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
         sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
 .if \n == 16
         sqdmulh         \d1\sz,  \s1\sz,  \s3\sz
 .endif
 .endm

 // Store register idx0 at [dstreg, dstoff]; when n == 16 also store idx1
 // 16 bytes further on.
 .macro str_n            idx0, idx1, dstreg, dstoff, n
         str             \idx0,  [\dstreg, \dstoff]
 .if \n == 16
         str             \idx1,  [\dstreg, \dstoff + 16]
 .endif
 .endm

 // unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
 //                                               size_t n_symbols);
 //
 // In:    x0 = s, x1 = cdf, x2 = n_symbols
 // Out:   w0 = decoded symbol index (w15, copied at label 9)
 // Stack: 48 bytes; the incoming rng is kept at [sp, #14] and the candidate v
 //        values from [sp, #16] on, so that renorm can fetch u = v[ret - 1]
 //        and v = v[ret] through one indexed address.
 // Scratch registers written here: x3-x9, x14, x15, v0-v7, v16, v17, v30, v31.
 //
 // decode_update is defined inside this function because the adapt8/adapt16
 // entry points expand it as well and then branch to L(renorm) below.
 // L(renorm2) is the tail shared with the bool decoders, which arrive with
 // w4 = new rng, w5 = d, w6 = cnt, x7 = ~dif, w15 = return value and sp
 // already lowered by 48.

 function msac_decode_symbol_adapt4_neon, export=1
 .macro decode_update sz, szb, n
         sub             sp,  sp,  #48
         add             x8,  x0,  #RNG
         ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
         ld1r            {v4\sz},  [x8]                            // rng
         movrel          x9,  coeffs, 30
         movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
         sub             x9,  x9,  x2, lsl #1                      // x9 = coeffs + 30 - 2 * n_symbols
         mvni            v30\sz, #0x3f                             // 0xffc0
         and             v7\szb, v4\szb, v31\szb                   // rng & 0x7f00
         str             h4,  [sp, #14]                            // store original u = s->rng
         and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0

         ld1_n           v4,  v5,  x9,  \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
         sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
         add             x8,  x0,  #DIF + 6                        // address of the top halfword of dif (little-endian)

         add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = cdf + EC_MIN_PROB * (n_symbols - ret)
         add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

         ld1r            {v6.8h},  [x8]                            // dif >> (EC_WIN_SIZE - 16)
         movrel          x8,  bits
         str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access

         ld1_n           v16, v17, x8,  .8h, \n

         cmhs_n          v2,  v3,  v6,  v6,  v4,  v5,  .8h,  \n    // c >= v

         and_n           v6,  v7,  v2,  v3,  v16, v17, .16b, \n    // One bit per halfword set in the mask
         // With 16 lanes, merge the second mask register before the horizontal add.
 .if \n == 16
         add             v6.8h,  v6.8h,  v7.8h
 .endif
         addv            h6,  v6.8h                                // Aggregate mask bits
         ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
         umov            w3,  v6.h[0]
         rbit            w3,  w3                                   // bit-reverse so that clz counts trailing zeros
         clz             w15, w3                                   // ret

         cbz             w4,  L(renorm)                            // no adaptation when allow_update_cdf is 0
         // update_cdf
         ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]
         movi            v5\szb, #0xff
 .if \n == 16
         mov             w4,  #-5
 .else
         mvn             w14, w2
         mov             w4,  #-4
         cmn             w14, #3                                   // set C if n_symbols <= 2
 .endif
         urhadd_n        v4,  v5,  v5,  v5,  v2,  v3,  \sz, \n     // i >= val ? -1 : 32768
 .if \n == 16
         sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + 5)
 .else
         // The carry from cmn above is still live here: the vector op and lsr
         // in between do not write the flags.
         lsr             w14, w3,  #4                              // count >> 4
         sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
 .endif
         sub_n           v4,  v5,  v4,  v5,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
         dup             v6\sz,    w4                              // -rate

         sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
         sub_n           v0,  v1,  v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
         sshl_n          v4,  v5,  v4,  v5,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
         add             w3,  w3,  #1                              // count + (count < 32)
         add_n           v0,  v1,  v0,  v1,  v4,  v5,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
         st1_n           v0,  v1,  x1,  \sz, \n
         strh            w3,  [x1, x2, lsl #1]
 .endm

         decode_update   .4h, .8b, 4

 // Renormalisation for the symbol decoders: w15 = ret indexes the v values
 // spilled at [sp, #16]; the halfword just below v[0] is the saved rng.
 L(renorm):
         add             x8,  sp,  #16
         add             x8,  x8,  w15, uxtw #1
         ldrh            w3,  [x8]              // v
         ldurh           w4,  [x8, #-2]         // u
         ldr             w6,  [x0, #CNT]
         ldr             x7,  [x0, #DIF]
         sub             w4,  w4,  w3           // rng = u - v
         clz             w5,  w4                // clz(rng)
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
         mvn             x7,  x7                // ~dif
         add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
 L(renorm2):
         lsl             w4,  w4,  w5           // rng << d
         subs            w6,  w6,  w5           // cnt -= d
         lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
         str             w4,  [x0, #RNG]
         mvn             x7,  x7                // ~dif
         b.hs            9f                     // no borrow from cnt -= d: skip the refill

         // refill
         ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
         add             x5,  x3,  #8
         cmp             x5,  x4
         b.gt            2f                     // buf_pos + 8 > buf_end: use the byte-wise path

         ldr             x3,  [x3]              // next_bits
         add             w8,  w6,  #23          // shift_bits = cnt + 23
         add             w6,  w6,  #16          // cnt += 16
         rev             x3,  x3                // next_bits = bswap(next_bits)
         sub             x5,  x5,  x8, lsr #3   // buf_pos -= shift_bits >> 3
         and             w8,  w8,  #24          // shift_bits &= 24
         lsr             x3,  x3,  x8           // next_bits >>= shift_bits
         sub             w8,  w8,  w6           // shift_bits -= 16 + cnt
         str             x5,  [x0, #BUF_POS]
         lsl             x3,  x3,  x8           // next_bits <<= shift_bits
         mov             w4,  #48
         sub             w6,  w4,  w8           // cnt = cnt + 64 - shift_bits
         eor             x7,  x7,  x3           // dif ^= next_bits
         b               9f

 2:      // refill_eob
         // Near the end of the buffer: consume one byte per iteration until
         // buf_pos reaches buf_end or the shift count c drops below zero.
         mov             w14, #40
         sub             w5,  w14, w6           // c = 40 - cnt
 3:
         cmp             x3,  x4
         b.ge            4f
         ldrb            w8,  [x3], #1
         lsl             x8,  x8,  x5
         eor             x7,  x7,  x8
         subs            w5,  w5,  #8
         b.ge            3b

 4:      // refill_eob_end
         str             x3,  [x0, #BUF_POS]
         sub             w6,  w14, w5           // cnt = 40 - c

 9:
         // Common exit: write back cnt and dif, return ret and release the stack.
         str             w6,  [x0, #CNT]
         str             x7,  [x0, #DIF]

         mov             w0,  w15
         add             sp,  sp,  #48
         ret
 endfunc

 // 8-lane variant of the adaptive symbol decoder. Takes the same registers as
 // the adapt4 entry point (x0 = context, x1 = cdf, x2 = n_symbols), expands
 // decode_update on one .8h register, then continues at L(renorm) inside
 // msac_decode_symbol_adapt4_neon, which also performs the return.
 function msac_decode_symbol_adapt8_neon, export=1
         decode_update   .8h, .16b, 8
         b               L(renorm)
 endfunc

 // 16-lane variant of the adaptive symbol decoder. Takes the same registers as
 // the adapt4 entry point (x0 = context, x1 = cdf, x2 = n_symbols), expands
 // decode_update on register pairs, then continues at L(renorm) inside
 // msac_decode_symbol_adapt4_neon, which also performs the return.
 function msac_decode_symbol_adapt16_neon, export=1
         decode_update   .8h, .16b, 16
         b               L(renorm)
 endfunc

 // msac_decode_hi_tok_neon
 //
 // In:    x0 = context (accessed through the RNG/CNT/DIF/... offsets),
 //        x1 = cdf: four halfwords, with the adaptation count at byte offset 6
 // Out:   w0 = (w13 + 30) >> 1, where w13 starts at -24 and gains 2 * ret per
 //        decoded symbol (the +5 at the top of the loop cancels the -5 at
 //        label 9)
 // Stack: 48 bytes, laid out as in the adapt decoders (rng at [sp, #14],
 //        v values at [sp, #16]).
 //
 // Runs the 4-lane symbol decode in a loop, keeping rng (v3), the top 16 bits
 // of dif (v1), the masked cdf (v17), cnt (w6) and dif (x7) in registers
 // between iterations; cnt and dif are written back only on exit.
 // NOTE(review): the C prototype is not shown in this file - confirm the
 // argument list against the caller.
 function msac_decode_hi_tok_neon, export=1
         ld1             {v0.4h},  [x1]            // cdf
         add             x16, x0,  #RNG
         movi            v31.4h, #0x7f, lsl #8     // 0x7f00
         movrel          x17, coeffs, 30-2*3
         mvni            v30.4h, #0x3f             // 0xffc0
         ldrh            w9,  [x1, #6]             // count = cdf[n_symbols]
         ld1r            {v3.4h},  [x16]           // rng
         movrel          x16, bits
         ld1             {v29.4h}, [x17]           // EC_MIN_PROB * (n_symbols - ret)
         add             x17, x0,  #DIF + 6
         ld1             {v16.8h}, [x16]
         mov             w13, #-24
         and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
         ldr             w10, [x0, #ALLOW_UPDATE_CDF]
         ld1r            {v1.8h},  [x17]           // dif >> (EC_WIN_SIZE - 16)
         sub             sp,  sp,  #48
         ldr             w6,  [x0, #CNT]
         ldr             x7,  [x0, #DIF]
 1:
         and             v7.8b,   v3.8b,   v31.8b  // rng & 0x7f00
         sqdmulh         v6.4h,   v17.4h,  v7.4h   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
         add             v4.4h,   v17.4h,  v29.4h  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
         add             v4.4h,   v6.4h,   v4.4h   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
         str             h3,  [sp, #14]            // store original u = s->rng
         cmhs            v2.8h,   v1.8h,   v4.8h   // c >= v
         str             q4,  [sp, #16]            // store v values to allow indexed access
         and             v6.16b,  v2.16b,  v16.16b // One bit per halfword set in the mask
         addv            h6,  v6.8h                // Aggregate mask bits
         umov            w3,  v6.h[0]
         add             w13, w13, #5
         rbit            w3,  w3
         add             x8,  sp,  #16
         clz             w15, w3                   // ret

         cbz             w10, 2f                   // targets the next 2: (renorm), not refill_eob
         // update_cdf
         movi            v5.8b, #0xff
         mov             w4,  #-5
         urhadd          v4.4h,   v5.4h,   v2.4h   // i >= val ? -1 : 32768
         sub             w4,  w4,  w9, lsr #4      // -((count >> 4) + 5)
         sub             v4.4h,   v4.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
         dup             v6.4h,    w4              // -rate

         sub             w9,  w9,  w9, lsr #5      // count - (count == 32)
         sub             v0.4h,   v0.4h,   v2.4h   // cdf + (i >= val ? 1 : 0)
         sshl            v4.4h,   v4.4h,   v6.4h   // ({32768,-1} - cdf[i]) >> rate
         add             w9,  w9,  #1              // count + (count < 32)
         add             v0.4h,   v0.4h,   v4.4h   // cdf + (32768 - cdf[i]) >> rate
         st1             {v0.4h},  [x1]
         and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
         strh            w9,  [x1, #6]

 2:
         // Renormalise; x8 = sp + 16 was set up before the cbz above.
         add             x8,  x8,  w15, uxtw #1
         ldrh            w3,  [x8]              // v
         ldurh           w4,  [x8, #-2]         // u
         sub             w4,  w4,  w3           // rng = u - v
         clz             w5,  w4                // clz(rng)
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
         mvn             x7,  x7                // ~dif
         add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
         lsl             w4,  w4,  w5           // rng << d
         subs            w6,  w6,  w5           // cnt -= d
         lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
         str             w4,  [x0, #RNG]
         dup             v3.4h,   w4            // refresh the rng lanes for the next iteration
         mvn             x7,  x7                // ~dif
         b.hs            9f                     // no borrow from cnt -= d: skip the refill

         // refill
         ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
         add             x5,  x3,  #8
         cmp             x5,  x4
         b.gt            2f                     // buf_pos + 8 > buf_end: use the byte-wise path

         ldr             x3,  [x3]              // next_bits
         add             w8,  w6,  #23          // shift_bits = cnt + 23
         add             w6,  w6,  #16          // cnt += 16
         rev             x3,  x3                // next_bits = bswap(next_bits)
         sub             x5,  x5,  x8, lsr #3   // buf_pos -= shift_bits >> 3
         and             w8,  w8,  #24          // shift_bits &= 24
         lsr             x3,  x3,  x8           // next_bits >>= shift_bits
         sub             w8,  w8,  w6           // shift_bits -= 16 + cnt
         str             x5,  [x0, #BUF_POS]
         lsl             x3,  x3,  x8           // next_bits <<= shift_bits
         mov             w4,  #48
         sub             w6,  w4,  w8           // cnt = cnt + 64 - shift_bits
         eor             x7,  x7,  x3           // dif ^= next_bits
         b               9f

 2:      // refill_eob
         mov             w14, #40
         sub             w5,  w14, w6           // c = 40 - cnt
 3:
         cmp             x3,  x4
         b.ge            4f
         ldrb            w8,  [x3], #1
         lsl             x8,  x8,  x5
         eor             x7,  x7,  x8
         subs            w5,  w5,  #8
         b.ge            3b

 4:      // refill_eob_end
         str             x3,  [x0, #BUF_POS]
         sub             w6,  w14, w5           // cnt = 40 - c

 9:
         lsl             w15, w15, #1
         sub             w15, w15, #5
         lsr             x12, x7,  #48          // top 16 bits of the updated dif
         adds            w13, w13, w15          // carry = tok_br < 3 || tok == 15
         dup             v1.8h,   w12
         b.cc            1b                     // loop if !carry
         add             w13, w13, #30
         str             w6,  [x0, #CNT]
         add             sp,  sp,  #48
         str             x7,  [x0, #DIF]
         lsr             w0,  w13, #1
         ret
 endfunc

 // msac_decode_bool_equi_neon
 //
 // In:  x0 = context
 // Out: w0 = 1 when dif < (v << 48), else 0 (w15, returned by the shared tail)
 //
 // Decodes one bit with a split point derived from rng alone:
 // v = ((rng & 0xff00) + 8) >> 1. Sets up w4 = new rng, w5 = d, w6 = cnt and
 // x7 = ~dif, then continues at L(renorm2) in msac_decode_symbol_adapt4_neon,
 // which stores the state, frees the 48 stack bytes reserved here and returns.
 function msac_decode_bool_equi_neon, export=1
         ldp             w5,  w6,  [x0, #RNG]   // + CNT
         sub             sp,  sp,  #48
         ldr             x7,  [x0, #DIF]
         bic             w4,  w5,  #0xff        // r &= 0xff00
         add             w4,  w4,  #8
         subs            x8,  x7,  x4, lsl #47  // dif - vw
         // lsr/sub below leave the flags from subs intact for cset/csel.
         lsr             w4,  w4,  #1           // v
         sub             w5,  w5,  w4           // r - v
         cset            w15, lo
         csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
         csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

         clz             w5,  w4                // clz(rng)
         mvn             x7,  x7                // ~dif
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
         b               L(renorm2)
 endfunc

 // msac_decode_bool_neon
 //
 // In:  x0 = context, w1 = f (its low 6 bits are cleared before use)
 // Out: w0 = 1 when dif < (v << 48), else 0 (w15, returned by the shared tail)
 //
 // Decodes one bit with v = (((rng >> 8) * (f & ~63)) >> 7) + 4. Sets up
 // w4 = new rng, w5 = d, w6 = cnt and x7 = ~dif, then continues at L(renorm2)
 // in msac_decode_symbol_adapt4_neon, which stores the state, frees the
 // 48 stack bytes reserved here and returns.
 function msac_decode_bool_neon, export=1
         ldp             w5,  w6,  [x0, #RNG]   // + CNT
         sub             sp,  sp,  #48
         ldr             x7,  [x0, #DIF]
         lsr             w4,  w5,  #8           // r >> 8
         bic             w1,  w1,  #0x3f        // f &= ~63
         mul             w4,  w4,  w1
         lsr             w4,  w4,  #7
         add             w4,  w4,  #4           // v
         subs            x8,  x7,  x4, lsl #48  // dif - vw
         sub             w5,  w5,  w4           // r - v
         cset            w15, lo
         csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
         csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

         clz             w5,  w4                // clz(rng)
         mvn             x7,  x7                // ~dif
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
         b               L(renorm2)
 endfunc

 // msac_decode_bool_adapt_neon
 //
 // In:  x0 = context, x1 = cdf (halfword 0 = probability, halfword 1 = count)
 // Out: w0 = 1 when dif < (v << 48), else 0 (w15, returned by the shared tail)
 //
 // Same bit decode as msac_decode_bool_neon with f taken from cdf[0]. When the
 // ALLOW_UPDATE_CDF field is non-zero, cdf[0] and the count in cdf[1] are
 // adapted before the shared renormalisation at L(renorm2), which also frees
 // the 48 stack bytes reserved here and returns.
 function msac_decode_bool_adapt_neon, export=1
         ldr             w9,  [x1]              // cdf[0-1]
         ldp             w5,  w6,  [x0, #RNG]   // + CNT
         sub             sp,  sp,  #48
         ldr             x7,  [x0, #DIF]
         lsr             w4,  w5,  #8           // r >> 8
         and             w2,  w9,  #0xffc0      // f &= ~63
         mul             w4,  w4,  w2
         lsr             w4,  w4,  #7
         add             w4,  w4,  #4           // v
         subs            x8,  x7,  x4, lsl #48  // dif - vw
         sub             w5,  w5,  w4           // r - v
         cset            w15, lo
         csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
         csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

         ldr             w10, [x0, #ALLOW_UPDATE_CDF]

         clz             w5,  w4                // clz(rng)
         mvn             x7,  x7                // ~dif
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16

         cbz             w10, L(renorm2)        // adaptation disabled: go straight to the tail

         lsr             w2,  w9,  #16          // count = cdf[1]
         and             w9,  w9,  #0xffff      // cdf[0]

         sub             w3,  w2,  w2, lsr #5   // count - (count >= 32)
         lsr             w2,  w2,  #4           // count >> 4
         add             w10, w3,  #1           // count + (count < 32)
         add             w2,  w2,  #4           // rate = (count >> 4) | 4

         sub             w9,  w9,  w15          // cdf[0] -= bit
         sub             w11, w9,  w15, lsl #15 // {cdf[0], cdf[0] - 32769}
         asr             w11, w11, w2           // {cdf[0], cdf[0] - 32769} >> rate
         sub             w9,  w9,  w11          // cdf[0]

         strh            w9,  [x1]
         strh            w10, [x1, #2]

         b               L(renorm2)
 endfunc
	/*
	* Copyright © 2019, VideoLAN and dav1d authors
	* Copyright © 2019, Martin Storsjo
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	*
	* 1. Redistributions of source code must retain the above copyright notice, this
	* list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
	* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "src/arm/asm.S"
	#include "util.S"

	// Byte offsets of the context fields that this file reaches through x0.
	// NOTE(review): these have to mirror the C-side MsacContext layout, which is
	// not visible from here - confirm against the struct definition.
	#define BUF_POS 0
	#define BUF_END 8
	#define DIF 16
	#define RNG 24
	#define CNT 28
	#define ALLOW_UPDATE_CDF 32

	// Multiples of 4 counting down from 60 to 0, followed by 16 zero halfwords.
	// decode_update loads from byte offset 30 - 2 * n_symbols in this table, so
	// the loaded lanes hold the "EC_MIN_PROB * (n_symbols - ret)" term and the
	// lanes past the last symbol read as zero.
	const coeffs
	.short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
	.short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	endconst

	// One distinct bit per halfword lane (lane i holds 1 << i). The decoders AND
	// this with the per-lane compare mask and sum the lanes, which turns the
	// vector comparison result into a scalar bitmask.
	const bits
	.short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
	.short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
	endconst

	// Load vector data from [src] using arrangement suffix sz: the register
	// pair d0, d1 when n is above 8, otherwise d0 alone.
	.macro ld1_n d0, d1, src, sz, n
	.if \n > 8
	ld1 {\d0\sz, \d1\sz}, [\src]
	.else
	ld1 {\d0\sz}, [\src]
	.endif
	.endm

	// Store vector data to [dst] using arrangement suffix sz: the register
	// pair s0, s1 when n is above 8, otherwise s0 alone.
	.macro st1_n s0, s1, dst, sz, n
	.if \n > 8
	st1 {\s0\sz, \s1\sz}, [\dst]
	.else
	st1 {\s0\sz}, [\dst]
	.endif
	.endm

	// Unsigned right shift by the immediate in shift: d0 = s0 >> shift, plus
	// d1 = s1 >> shift when n == 16. No invocation appears in the code shown here.
	.macro ushr_n d0, d1, s0, s1, shift, sz, n
	ushr \d0\sz, \s0\sz, \shift
	.if \n == 16
	ushr \d1\sz, \s1\sz, \shift
	.endif
	.endm

	// Lane-wise add: d0 = s0 + s2, plus d1 = s1 + s3 when n == 16.
	// sz is the arrangement suffix applied to every operand.
	.macro add_n d0, d1, s0, s1, s2, s3, sz, n
	add \d0\sz, \s0\sz, \s2\sz
	.if \n == 16
	add \d1\sz, \s1\sz, \s3\sz
	.endif
	.endm

	// Lane-wise subtract: d0 = s0 - s2, plus d1 = s1 - s3 when n == 16.
	.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
	sub \d0\sz, \s0\sz, \s2\sz
	.if \n == 16
	sub \d1\sz, \s1\sz, \s3\sz
	.endif
	.endm

	// Bitwise AND: d0 = s0 & s2, plus d1 = s1 & s3 when n == 16.
	// Callers pass a byte arrangement (.8b/.16b) as sz.
	.macro and_n d0, d1, s0, s1, s2, s3, sz, n
	and \d0\sz, \s0\sz, \s2\sz
	.if \n == 16
	and \d1\sz, \s1\sz, \s3\sz
	.endif
	.endm

	// Unsigned "higher or same" compare: each lane of d0 becomes all ones where
	// s0 >= s2 and zero otherwise; d1 gets the same for s1 >= s3 when n == 16.
	.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
	cmhs \d0\sz, \s0\sz, \s2\sz
	.if \n == 16
	cmhs \d1\sz, \s1\sz, \s3\sz
	.endif
	.endm

	// Unsigned rounding halving add: d0 = (s0 + s2 + 1) >> 1 per lane, plus the
	// same for d1 from s1, s3 when n == 16.
	.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n
	urhadd \d0\sz, \s0\sz, \s2\sz
	.if \n == 16
	urhadd \d1\sz, \s1\sz, \s3\sz
	.endif
	.endm

	// Signed shift by a per-lane register amount: d0 = s0 shifted by s2, plus
	// d1 = s1 shifted by s3 when n == 16. The caller passes a negative amount
	// (-rate) to get an arithmetic right shift.
	.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
	sshl \d0\sz, \s0\sz, \s2\sz
	.if \n == 16
	sshl \d1\sz, \s1\sz, \s3\sz
	.endif
	.endm

	// Signed saturating doubling multiply, high half: d0 = (2 * s0 * s2) >> 16
	// per halfword lane, plus the same for d1 from s1, s3 when n == 16.
	.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
	sqdmulh \d0\sz, \s0\sz, \s2\sz
	.if \n == 16
	sqdmulh \d1\sz, \s1\sz, \s3\sz
	.endif
	.endm

	// Store register idx0 at [dstreg, dstoff]; when n == 16 also store idx1
	// 16 bytes further on.
	.macro str_n idx0, idx1, dstreg, dstoff, n
	str \idx0, [\dstreg, \dstoff]
	.if \n == 16
	str \idx1, [\dstreg, \dstoff + 16]
	.endif
	.endm

	// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
	// size_t n_symbols);
	//
	// In: x0 = s, x1 = cdf, x2 = n_symbols
	// Out: w0 = decoded symbol index (w15, copied at label 9)
	// Stack: 48 bytes; the incoming rng is kept at [sp, #14] and the candidate v
	// values from [sp, #16] on, so that renorm can fetch u = v[ret - 1] and
	// v = v[ret] through one indexed address.
	// Scratch registers written here: x3-x9, x14, x15, v0-v7, v16, v17, v30, v31.
	//
	// decode_update is defined inside this function because the adapt8/adapt16
	// entry points expand it as well and then branch to L(renorm) below.
	// L(renorm2) is the tail shared with the bool decoders, which arrive with
	// w4 = new rng, w5 = d, w6 = cnt, x7 = ~dif, w15 = return value and sp
	// already lowered by 48.

	function msac_decode_symbol_adapt4_neon, export=1
	.macro decode_update sz, szb, n
	sub sp, sp, #48
	add x8, x0, #RNG
	ld1_n v0, v1, x1, \sz, \n // cdf
	ld1r {v4\sz}, [x8] // rng
	movrel x9, coeffs, 30
	movi v31\sz, #0x7f, lsl #8 // 0x7f00
	sub x9, x9, x2, lsl #1 // x9 = coeffs + 30 - 2 * n_symbols
	mvni v30\sz, #0x3f // 0xffc0
	and v7\szb, v4\szb, v31\szb // rng & 0x7f00
	str h4, [sp, #14] // store original u = s->rng
	and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0

	ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
	sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
	add x8, x0, #DIF + 6 // address of the top halfword of dif (little-endian)

	add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
	add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

	ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
	movrel x8, bits
	str_n q4, q5, sp, #16, \n // store v values to allow indexed access

	ld1_n v16, v17, x8, .8h, \n

	cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n // c >= v

	and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask
	// With 16 lanes, merge the second mask register before the horizontal add.
	.if \n == 16
	add v6.8h, v6.8h, v7.8h
	.endif
	addv h6, v6.8h // Aggregate mask bits
	ldr w4, [x0, #ALLOW_UPDATE_CDF]
	umov w3, v6.h[0]
	rbit w3, w3 // bit-reverse so that clz counts trailing zeros
	clz w15, w3 // ret

	cbz w4, L(renorm) // no adaptation when allow_update_cdf is 0
	// update_cdf
	ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols]
	movi v5\szb, #0xff
	.if \n == 16
	mov w4, #-5
	.else
	mvn w14, w2
	mov w4, #-4
	cmn w14, #3 // set C if n_symbols <= 2
	.endif
	urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768
	.if \n == 16
	sub w4, w4, w3, lsr #4 // -((count >> 4) + 5)
	.else
	// The carry from cmn above is still live here: the vector op and lsr
	// in between do not write the flags.
	lsr w14, w3, #4 // count >> 4
	sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
	.endif
	sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
	dup v6\sz, w4 // -rate

	sub w3, w3, w3, lsr #5 // count - (count == 32)
	sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
	sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
	add w3, w3, #1 // count + (count < 32)
	add_n v0, v1, v0, v1, v4, v5, \sz, \n // cdf + (32768 - cdf[i]) >> rate
	st1_n v0, v1, x1, \sz, \n
	strh w3, [x1, x2, lsl #1]
	.endm

	decode_update .4h, .8b, 4

	// Renormalisation for the symbol decoders: w15 = ret indexes the v values
	// spilled at [sp, #16]; the halfword just below v[0] is the saved rng.
	L(renorm):
	add x8, sp, #16
	add x8, x8, w15, uxtw #1
	ldrh w3, [x8] // v
	ldurh w4, [x8, #-2] // u
	ldr w6, [x0, #CNT]
	ldr x7, [x0, #DIF]
	sub w4, w4, w3 // rng = u - v
	clz w5, w4 // clz(rng)
	eor w5, w5, #16 // d = clz(rng) ^ 16
	mvn x7, x7 // ~dif
	add x7, x7, x3, lsl #48 // ~dif + (v << 48)
	L(renorm2):
	lsl w4, w4, w5 // rng << d
	subs w6, w6, w5 // cnt -= d
	lsl x7, x7, x5 // (~dif + (v << 48)) << d
	str w4, [x0, #RNG]
	mvn x7, x7 // ~dif
	b.hs 9f // no borrow from cnt -= d: skip the refill

	// refill
	ldp x3, x4, [x0] // BUF_POS, BUF_END
	add x5, x3, #8
	cmp x5, x4
	b.gt 2f // buf_pos + 8 > buf_end: use the byte-wise path

	ldr x3, [x3] // next_bits
	add w8, w6, #23 // shift_bits = cnt + 23
	add w6, w6, #16 // cnt += 16
	rev x3, x3 // next_bits = bswap(next_bits)
	sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
	and w8, w8, #24 // shift_bits &= 24
	lsr x3, x3, x8 // next_bits >>= shift_bits
	sub w8, w8, w6 // shift_bits -= 16 + cnt
	str x5, [x0, #BUF_POS]
	lsl x3, x3, x8 // next_bits <<= shift_bits
	mov w4, #48
	sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
	eor x7, x7, x3 // dif ^= next_bits
	b 9f

	2: // refill_eob
	// Near the end of the buffer: consume one byte per iteration until
	// buf_pos reaches buf_end or the shift count c drops below zero.
	mov w14, #40
	sub w5, w14, w6 // c = 40 - cnt
	3:
	cmp x3, x4
	b.ge 4f
	ldrb w8, [x3], #1
	lsl x8, x8, x5
	eor x7, x7, x8
	subs w5, w5, #8
	b.ge 3b

	4: // refill_eob_end
	str x3, [x0, #BUF_POS]
	sub w6, w14, w5 // cnt = 40 - c

	9:
	// Common exit: write back cnt and dif, return ret and release the stack.
	str w6, [x0, #CNT]
	str x7, [x0, #DIF]

	mov w0, w15
	add sp, sp, #48
	ret
	endfunc

	// 8-lane variant of the adaptive symbol decoder. Takes the same registers as
	// the adapt4 entry point (x0 = context, x1 = cdf, x2 = n_symbols), expands
	// decode_update on one .8h register, then continues at L(renorm) inside
	// msac_decode_symbol_adapt4_neon, which also performs the return.
	function msac_decode_symbol_adapt8_neon, export=1
	decode_update .8h, .16b, 8
	b L(renorm)
	endfunc

	// 16-lane variant of the adaptive symbol decoder. Takes the same registers as
	// the adapt4 entry point (x0 = context, x1 = cdf, x2 = n_symbols), expands
	// decode_update on register pairs, then continues at L(renorm) inside
	// msac_decode_symbol_adapt4_neon, which also performs the return.
	function msac_decode_symbol_adapt16_neon, export=1
	decode_update .8h, .16b, 16
	b L(renorm)
	endfunc

	// msac_decode_hi_tok_neon
	//
	// In: x0 = context (accessed through the RNG/CNT/DIF/... offsets),
	// x1 = cdf: four halfwords, with the adaptation count at byte offset 6
	// Out: w0 = (w13 + 30) >> 1, where w13 starts at -24 and gains 2 * ret per
	// decoded symbol (the +5 at the top of the loop cancels the -5 at label 9)
	// Stack: 48 bytes, laid out as in the adapt decoders (rng at [sp, #14],
	// v values at [sp, #16]).
	//
	// Runs the 4-lane symbol decode in a loop, keeping rng (v3), the top 16 bits
	// of dif (v1), the masked cdf (v17), cnt (w6) and dif (x7) in registers
	// between iterations; cnt and dif are written back only on exit.
	// NOTE(review): the C prototype is not shown in this file - confirm the
	// argument list against the caller.
	function msac_decode_hi_tok_neon, export=1
	ld1 {v0.4h}, [x1] // cdf
	add x16, x0, #RNG
	movi v31.4h, #0x7f, lsl #8 // 0x7f00
	movrel x17, coeffs, 30-2*3
	mvni v30.4h, #0x3f // 0xffc0
	ldrh w9, [x1, #6] // count = cdf[n_symbols]
	ld1r {v3.4h}, [x16] // rng
	movrel x16, bits
	ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret)
	add x17, x0, #DIF + 6
	ld1 {v16.8h}, [x16]
	mov w13, #-24
	and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
	ldr w10, [x0, #ALLOW_UPDATE_CDF]
	ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16)
	sub sp, sp, #48
	ldr w6, [x0, #CNT]
	ldr x7, [x0, #DIF]
	1:
	and v7.8b, v3.8b, v31.8b // rng & 0x7f00
	sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
	add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret)
	add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
	str h3, [sp, #14] // store original u = s->rng
	cmhs v2.8h, v1.8h, v4.8h // c >= v
	str q4, [sp, #16] // store v values to allow indexed access
	and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask
	addv h6, v6.8h // Aggregate mask bits
	umov w3, v6.h[0]
	add w13, w13, #5
	rbit w3, w3
	add x8, sp, #16
	clz w15, w3 // ret

	cbz w10, 2f // targets the next 2: (renorm), not refill_eob
	// update_cdf
	movi v5.8b, #0xff
	mov w4, #-5
	urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768
	sub w4, w4, w9, lsr #4 // -((count >> 4) + 5)
	sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
	dup v6.4h, w4 // -rate

	sub w9, w9, w9, lsr #5 // count - (count == 32)
	sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0)
	sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate
	add w9, w9, #1 // count + (count < 32)
	add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate
	st1 {v0.4h}, [x1]
	and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
	strh w9, [x1, #6]

	2:
	// Renormalise; x8 = sp + 16 was set up before the cbz above.
	add x8, x8, w15, uxtw #1
	ldrh w3, [x8] // v
	ldurh w4, [x8, #-2] // u
	sub w4, w4, w3 // rng = u - v
	clz w5, w4 // clz(rng)
	eor w5, w5, #16 // d = clz(rng) ^ 16
	mvn x7, x7 // ~dif
	add x7, x7, x3, lsl #48 // ~dif + (v << 48)
	lsl w4, w4, w5 // rng << d
	subs w6, w6, w5 // cnt -= d
	lsl x7, x7, x5 // (~dif + (v << 48)) << d
	str w4, [x0, #RNG]
	dup v3.4h, w4 // refresh the rng lanes for the next iteration
	mvn x7, x7 // ~dif
	b.hs 9f // no borrow from cnt -= d: skip the refill

	// refill
	ldp x3, x4, [x0] // BUF_POS, BUF_END
	add x5, x3, #8
	cmp x5, x4
	b.gt 2f // buf_pos + 8 > buf_end: use the byte-wise path

	ldr x3, [x3] // next_bits
	add w8, w6, #23 // shift_bits = cnt + 23
	add w6, w6, #16 // cnt += 16
	rev x3, x3 // next_bits = bswap(next_bits)
	sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
	and w8, w8, #24 // shift_bits &= 24
	lsr x3, x3, x8 // next_bits >>= shift_bits
	sub w8, w8, w6 // shift_bits -= 16 + cnt
	str x5, [x0, #BUF_POS]
	lsl x3, x3, x8 // next_bits <<= shift_bits
	mov w4, #48
	sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
	eor x7, x7, x3 // dif ^= next_bits
	b 9f

	2: // refill_eob
	mov w14, #40
	sub w5, w14, w6 // c = 40 - cnt
	3:
	cmp x3, x4
	b.ge 4f
	ldrb w8, [x3], #1
	lsl x8, x8, x5
	eor x7, x7, x8
	subs w5, w5, #8
	b.ge 3b

	4: // refill_eob_end
	str x3, [x0, #BUF_POS]
	sub w6, w14, w5 // cnt = 40 - c

	9:
	lsl w15, w15, #1
	sub w15, w15, #5
	lsr x12, x7, #48 // top 16 bits of the updated dif
	adds w13, w13, w15 // carry = tok_br < 3 || tok == 15
	dup v1.8h, w12
	b.cc 1b // loop if !carry
	add w13, w13, #30
	str w6, [x0, #CNT]
	add sp, sp, #48
	str x7, [x0, #DIF]
	lsr w0, w13, #1
	ret
	endfunc

	// msac_decode_bool_equi_neon
	//
	// In: x0 = context
	// Out: w0 = 1 when dif < (v << 48), else 0 (w15, returned by the shared tail)
	//
	// Decodes one bit with a split point derived from rng alone:
	// v = ((rng & 0xff00) + 8) >> 1. Sets up w4 = new rng, w5 = d, w6 = cnt and
	// x7 = ~dif, then continues at L(renorm2) in msac_decode_symbol_adapt4_neon,
	// which stores the state, frees the 48 stack bytes reserved here and returns.
	function msac_decode_bool_equi_neon, export=1
	ldp w5, w6, [x0, #RNG] // + CNT
	sub sp, sp, #48
	ldr x7, [x0, #DIF]
	bic w4, w5, #0xff // r &= 0xff00
	add w4, w4, #8
	subs x8, x7, x4, lsl #47 // dif - vw
	// lsr/sub below leave the flags from subs intact for cset/csel.
	lsr w4, w4, #1 // v
	sub w5, w5, w4 // r - v
	cset w15, lo
	csel w4, w5, w4, hs // if (ret) v = r - v;
	csel x7, x8, x7, hs // if (ret) dif = dif - vw;

	clz w5, w4 // clz(rng)
	mvn x7, x7 // ~dif
	eor w5, w5, #16 // d = clz(rng) ^ 16
	b L(renorm2)
	endfunc

	// msac_decode_bool_neon
	//
	// In: x0 = context, w1 = f (its low 6 bits are cleared before use)
	// Out: w0 = 1 when dif < (v << 48), else 0 (w15, returned by the shared tail)
	//
	// Decodes one bit with v = (((rng >> 8) * (f & ~63)) >> 7) + 4. Sets up
	// w4 = new rng, w5 = d, w6 = cnt and x7 = ~dif, then continues at L(renorm2)
	// in msac_decode_symbol_adapt4_neon, which stores the state, frees the
	// 48 stack bytes reserved here and returns.
	function msac_decode_bool_neon, export=1
	ldp w5, w6, [x0, #RNG] // + CNT
	sub sp, sp, #48
	ldr x7, [x0, #DIF]
	lsr w4, w5, #8 // r >> 8
	bic w1, w1, #0x3f // f &= ~63
	mul w4, w4, w1
	lsr w4, w4, #7
	add w4, w4, #4 // v
	subs x8, x7, x4, lsl #48 // dif - vw
	sub w5, w5, w4 // r - v
	cset w15, lo
	csel w4, w5, w4, hs // if (ret) v = r - v;
	csel x7, x8, x7, hs // if (ret) dif = dif - vw;

	clz w5, w4 // clz(rng)
	mvn x7, x7 // ~dif
	eor w5, w5, #16 // d = clz(rng) ^ 16
	b L(renorm2)
	endfunc

	// msac_decode_bool_adapt_neon
	//
	// In: x0 = context, x1 = cdf (halfword 0 = probability, halfword 1 = count)
	// Out: w0 = 1 when dif < (v << 48), else 0 (w15, returned by the shared tail)
	//
	// Same bit decode as msac_decode_bool_neon with f taken from cdf[0]. When the
	// ALLOW_UPDATE_CDF field is non-zero, cdf[0] and the count in cdf[1] are
	// adapted before the shared renormalisation at L(renorm2), which also frees
	// the 48 stack bytes reserved here and returns.
	function msac_decode_bool_adapt_neon, export=1
	ldr w9, [x1] // cdf[0-1]
	ldp w5, w6, [x0, #RNG] // + CNT
	sub sp, sp, #48
	ldr x7, [x0, #DIF]
	lsr w4, w5, #8 // r >> 8
	and w2, w9, #0xffc0 // f &= ~63
	mul w4, w4, w2
	lsr w4, w4, #7
	add w4, w4, #4 // v
	subs x8, x7, x4, lsl #48 // dif - vw
	sub w5, w5, w4 // r - v
	cset w15, lo
	csel w4, w5, w4, hs // if (ret) v = r - v;
	csel x7, x8, x7, hs // if (ret) dif = dif - vw;

	ldr w10, [x0, #ALLOW_UPDATE_CDF]

	clz w5, w4 // clz(rng)
	mvn x7, x7 // ~dif
	eor w5, w5, #16 // d = clz(rng) ^ 16

	cbz w10, L(renorm2) // adaptation disabled: go straight to the tail

	lsr w2, w9, #16 // count = cdf[1]
	and w9, w9, #0xffff // cdf[0]

	sub w3, w2, w2, lsr #5 // count - (count >= 32)
	lsr w2, w2, #4 // count >> 4
	add w10, w3, #1 // count + (count < 32)
	add w2, w2, #4 // rate = (count >> 4) | 4

	sub w9, w9, w15 // cdf[0] -= bit
	sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769}
	asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate
	sub w9, w9, w11 // cdf[0]

	strh w9, [x1]
	strh w10, [x1, #2]

	b L(renorm2)
	endfunc