// third_party/dav1d/libdav1d/src/arm/64/cdef_tmpl.S - cobalt - Git at Google

 /*
  * Copyright © 2018, VideoLAN and dav1d authors
  * Copyright © 2020, Martin Storsjo
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice, this
  *    list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  *    this list of conditions and the following disclaimer in the documentation
  *    and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "src/arm/asm.S"
 #include "util.S"

 // Emits the table directions\w. Each row belongs to one direction and holds
 // the signed element offsets of the taps at distance 1 and distance 2,
 // i.e. dy * \stride + dx, where \stride is the row pitch of the tmp buffer
 // in elements. The (dy,dx) pairs are listed next to every row.
 .macro dir_table w, stride
 const directions\w
         .byte           1 - \stride,      2 - 2 * \stride   // dir 0: (-1,+1) (-2,+2)
         .byte           1,                2 - \stride       // dir 1: ( 0,+1) (-1,+2)
         .byte           1,                2                 // dir 2: ( 0,+1) ( 0,+2)
         .byte           1,                2 + \stride       // dir 3: ( 0,+1) (+1,+2)
         .byte           1 + \stride,      2 + 2 * \stride   // dir 4: (+1,+1) (+2,+2)
         .byte           \stride,          1 + 2 * \stride   // dir 5: (+1, 0) (+2,+1)
         .byte           \stride,          2 * \stride       // dir 6: (+1, 0) (+2, 0)
         .byte           \stride,          2 * \stride - 1   // dir 7: (+1, 0) (+2,-1)
 // Rows 0-5 once more: the filter reads rows dir + 2 and dir + 6, and the
 // repetition lets it do so without wrapping the index with & 7.
         .byte           1 - \stride,      2 - 2 * \stride   // dir 0
         .byte           1,                2 - \stride       // dir 1
         .byte           1,                2                 // dir 2
         .byte           1,                2 + \stride       // dir 3
         .byte           1 + \stride,      2 + 2 * \stride   // dir 4
         .byte           \stride,          1 + 2 * \stride   // dir 5
 endconst
 .endm

 // Instantiates the constant tables referenced by the filter functions:
 // pri_taps, directions8 (row pitch 16) and directions4 (row pitch 8).
 .macro tables
 const pri_taps
         .byte           4, 2        // taps used when the low bit of the (scaled) pri_strength is 0
         .byte           3, 3        // taps used when that bit is 1
 endconst

 dir_table 8, 16
 dir_table 4, 8
 .endm

 // Loads the pair of taps located at tmp +/- off into \d1 (plus side) and
 // \d2 (minus side). x2 points at the current pixel in tmp and w9 carries
 // the signed byte offset in elements. Clobbers x6 and x9.
 .macro load_px d1, d2, w
         add             x6,  x2,  w9, sxtb #1       // x6 = &tmp[+off]
         sub             x9,  x2,  w9, sxtb #1       // x9 = &tmp[-off]
 .if \w == 8
         // One row of eight 16-bit elements per register.
         ld1             {\d1\().8h}, [x6]           // p0
         ld1             {\d2\().8h}, [x9]           // p1
 .else
         // Two rows of four elements per register; the second row lies
         // 8 elements (16 bytes) further on.
         ld1             {\d1\().4h}, [x6]           // p0, first row
         add             x6,  x6,  #2*8
         ld1             {\d2\().4h}, [x9]           // p1, first row
         add             x9,  x9,  #2*8
         ld1             {\d1\().d}[1], [x6]         // p0, second row
         ld1             {\d2\().d}[1], [x9]         // p1, second row
 .endif
 .endm
 // Accumulates one tap pair into the running sum:
 //   v1 += \tap * (constrain(\s1 - px) + constrain(\s2 - px))
 // with px in v0, the threshold in \thresh_vec and the negated shift amount
 // in \shift (ushl by a negative count shifts right). When \min is set,
 // v2/v3 are also updated with the running minimum/maximum of the taps.
 // Clobbers v16-v22.
 .macro handle_pixel s1, s2, thresh_vec, shift, tap, min
         dup             v19.8h, \tap                // taps[k] in every lane
 .if \min
         // NOTE(review): the minimum uses an unsigned compare and the maximum
         // a signed one - presumably so that a fill value for unavailable
         // pixels is ignored by both; confirm against the padding code.
         umin            v2.8h,   v2.8h,  \s1\().8h
         umin            v2.8h,   v2.8h,  \s2\().8h
         smax            v3.8h,   v3.8h,  \s1\().8h
         smax            v3.8h,   v3.8h,  \s2\().8h
 .endif
         sub             v18.8h, \s1\().8h,  v0.8h   // d0 = p0 - px
         sub             v22.8h, \s2\().8h,  v0.8h   // d1 = p1 - px
         uabd            v16.8h, v0.8h,  \s1\().8h   // |d0|
         uabd            v20.8h, v0.8h,  \s2\().8h   // |d1|
         ushl            v17.8h, v16.8h, \shift      // |d0| >> shift
         ushl            v21.8h, v20.8h, \shift      // |d1| >> shift
         uqsub           v17.8h, \thresh_vec, v17.8h // c0 = imax(0, threshold - (|d0| >> shift))
         uqsub           v21.8h, \thresh_vec, v21.8h // c1 = imax(0, threshold - (|d1| >> shift))
         neg             v16.8h, v17.8h              // -c0
         neg             v20.8h, v21.8h              // -c1
         smin            v18.8h, v18.8h, v17.8h      // imin(d0, c0)
         smin            v22.8h, v22.8h, v21.8h      // imin(d1, c1)
         smax            v18.8h, v18.8h, v16.8h      // constrain(d0) = imax(imin(d0, c0), -c0)
         smax            v22.8h, v22.8h, v20.8h      // constrain(d1) = imax(imin(d1, c1), -c1)
         mla             v1.8h,  v18.8h, v19.8h      // sum += taps[k] * constrain(d0)
         mla             v1.8h,  v22.8h, v19.8h      // sum += taps[k] * constrain(d1)
 .endm

 // void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
 //                                   const uint16_t *tmp, int pri_strength,
 //                                   int sec_strength, int dir, int damping,
 //                                   int h, size_t edges);
 //
 // filter_func emits one specialisation named cdef_filter\w\suffix\()_\bpc\()bpc_neon.
 //   \w     block width, 8 or 4 (4-wide blocks are handled two rows at a time)
 //   \pri   include the primary tap pair
 //   \sec   include the two secondary tap pairs
 //   \min   track min/max of px and all taps and clamp the result to them
 // Registers as used by the code below:
 //   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
 //   w4 = sec_strength, w5 = dir, w6 = damping, w7 = h
 //   [sp] = edges (read for 8 bpc only), [sp, #8] = bitdepth_max (16 bpc only)
 //   x5 = cursor into directions\w, x8 = cursor into pri_taps
 //   w11 = secondary tap weight, doubling as the inner loop counter
 //   v25/v27 = pri/sec threshold, v24/v26 = negated pri/sec shift
 .macro filter_func w, bpc, pri, sec, min, suffix
 function cdef_filter\w\suffix\()_\bpc\()bpc_neon
 .if \bpc == 8
         // edges == 0xf is handed over to the separate _edged_8bpc function.
         ldr             w8,  [sp]                   // edges
         cmp             w8,  #0xf
         b.eq            cdef_filter\w\suffix\()_edged_8bpc_neon
 .endif
 .if \pri
 .if \bpc == 16
         ldr             w9,  [sp, #8]               // bitdepth_max
         clz             w9,  w9
         sub             w9,  w9,  #24               // -bitdepth_min_8
         neg             w9,  w9                     // bitdepth_min_8
 .endif
         movrel          x8,  pri_taps
 .if \bpc == 16
         lsr             w9,  w3,  w9                // pri_strength >> bitdepth_min_8
         and             w9,  w9,  #1                // (pri_strength >> bitdepth_min_8) & 1
 .else
         and             w9,  w3,  #1
 .endif
         // Select one of the two 2-byte tap pairs in pri_taps.
         add             x8,  x8,  w9, uxtw #1
 .endif
         movrel          x9,  directions\w
         // Each direction occupies 2 bytes in the table.
         add             x5,  x9,  w5, uxtw #1
         movi            v30.4h,   #15
         dup             v28.4h,   w6                // damping

 .if \pri
         dup             v25.8h, w3                  // threshold
 .endif
 .if \sec
         dup             v27.8h, w4                  // threshold
 .endif
         // Lane 0 = pri threshold, lane 1 = sec threshold, so both shifts are
         // derived in one pass. A lane whose source register was not set up
         // above is never read back below.
         trn1            v24.4h, v25.4h, v27.4h
         clz             v24.4h, v24.4h              // clz(threshold)
         sub             v24.4h, v30.4h, v24.4h      // ulog2(threshold)
         uqsub           v24.4h, v28.4h, v24.4h      // shift = imax(0, damping - ulog2(threshold))
         neg             v24.4h, v24.4h              // -shift
 .if \sec
         dup             v26.8h, v24.h[1]
 .endif
 .if \pri
         dup             v24.8h, v24.h[0]
 .endif

 // Outer loop: one row (w == 8) or two rows (w == 4) per iteration.
 1:
 .if \w == 8
         ld1             {v0.8h}, [x2]               // px
 .else
         add             x12, x2,  #2*8
         ld1             {v0.4h},   [x2]             // px
         ld1             {v0.d}[1], [x12]            // px
 .endif

         movi            v1.8h,  #0                  // sum
 .if \min
         mov             v2.16b, v0.16b              // min
         mov             v3.16b, v0.16b              // max
 .endif

         // Instead of loading sec_taps 2, 1 from memory, just set it
         // to 2 initially and decrease for the second round.
         // This is also used as loop counter.
         mov             w11, #2                     // sec_taps[0]

 // Inner loop: two rounds, one per tap distance.
 2:
 .if \pri
         ldrb            w9,  [x5]                   // off1

         load_px         v4,  v5, \w
 .endif

 .if \sec
         add             x5,  x5,  #4                // +2*2
         ldrb            w9,  [x5]                   // off2
         load_px         v6,  v7,  \w
 .endif

 .if \pri
         ldrb            w10, [x8]                   // *pri_taps

         handle_pixel    v4,  v5,  v25.8h, v24.8h, w10, \min
 .endif

 .if \sec
         add             x5,  x5,  #8                // +2*4
         ldrb            w9,  [x5]                   // off3
         load_px         v4,  v5,  \w

         handle_pixel    v6,  v7,  v27.8h, v26.8h, w11, \min

         handle_pixel    v4,  v5,  v27.8h, v26.8h, w11, \min

         sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
 .else
         add             x5,  x5,  #1                // x5 += 1
 .endif
         subs            w11, w11, #1                // sec_tap-- (value)
 .if \pri
         // add does not touch the flags set by subs above.
         add             x8,  x8,  #1                // pri_taps++ (pointer)
 .endif
         b.ne            2b

         cmlt            v4.8h,  v1.8h,  #0          // -(sum < 0)
         add             v1.8h,  v1.8h,  v4.8h       // sum - (sum < 0)
         srshr           v1.8h,  v1.8h,  #4          // (8 + sum - (sum < 0)) >> 4
         add             v0.8h,  v0.8h,  v1.8h       // px + (8 + sum ...) >> 4
 .if \min
         smin            v0.8h,  v0.8h,  v3.8h
         smax            v0.8h,  v0.8h,  v2.8h       // iclip(px + .., min, max)
 .endif
 .if \bpc == 8
         xtn             v0.8b,  v0.8h
 .endif
         // The subs on w7 below sets the flags tested by the final b.gt;
         // the stores and the sub instructions in between leave them alone.
 .if \w == 8
         add             x2,  x2,  #2*16             // tmp += tmp_stride
         subs            w7,  w7,  #1                // h--
 .if \bpc == 8
         st1             {v0.8b}, [x0], x1
 .else
         st1             {v0.8h}, [x0], x1
 .endif
 .else
 .if \bpc == 8
         st1             {v0.s}[0], [x0], x1
 .else
         st1             {v0.d}[0], [x0], x1
 .endif
         add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
         subs            w7,  w7,  #2                // h -= 2
 .if \bpc == 8
         st1             {v0.s}[1], [x0], x1
 .else
         st1             {v0.d}[1], [x0], x1
 .endif
 .endif

         // Reset pri_taps and directions back to the original point
         sub             x5,  x5,  #2
 .if \pri
         sub             x8,  x8,  #2
 .endif

         b.gt            1b
         ret
 endfunc
 .endm

 // Emits the three specialised filters for width \w and bit depth \bpc,
 // followed by the exported entry point, which tail-branches to one of
 // them depending on which of the two strengths are non-zero.
 .macro filter w, bpc
 filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
 filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
 filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec

 function cdef_filter\w\()_\bpc\()bpc_neon, export=1
         cbnz            w3,  1f                                  // pri_strength != 0 ?
         b               cdef_filter\w\()_sec_\bpc\()bpc_neon     // no: secondary only
 1:
         cbnz            w4,  2f                                  // sec_strength != 0 ?
         b               cdef_filter\w\()_pri_\bpc\()bpc_neon     // no: primary only
 2:
         b               cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // primary and secondary
 endfunc
 .endm

 // 840 / n for n = 1..8. find_dir loads all eight entries with one ld1
 // and widens them to 32 bit as weights for the squared diagonal sums.
 const div_table
         .short         840, 420, 280, 210              // 840/1 .. 840/4
         .short         168, 140, 120, 105              // 840/5 .. 840/8
 endconst

 // Weights for the squared sum_alt entries. find_dir loads them as three
 // .4h vectors into v29, v30 and v31; the final entry is 0, so the twelfth
 // lane contributes nothing.
 const alt_fact
         .short         420, 210, 140, 105              // -> v29
         .short         105, 105, 105, 105              // -> v30
         .short         140, 210, 420, 0                // -> v31
 endconst

 // Computes two weighted sums of squares:
 //   \d1 = sum over lanes of \s1[0-3]^2*v29 + \s1[4-7]^2*v30 + \s2[0-3]^2*v31
 //   \d2 = the same for \s3 / \s4
 // v29-v31 must already hold the 32-bit weights. Clobbers v22-v27.
 .macro cost_alt d1, d2, s1, s2, s3, s4
         // First cost, accumulated in v22.
         smull           v22.4s,  \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n], lanes 0-3
         smull2          v23.4s,  \s1\().8h, \s1\().8h // lanes 4-7
         smull           v24.4s,  \s2\().4h, \s2\().4h // lanes 8-11
         mul             v22.4s,  v22.4s,  v29.4s      // sum_alt[n]^2*fact
         mla             v22.4s,  v23.4s,  v30.4s
         mla             v22.4s,  v24.4s,  v31.4s
         // Second cost, accumulated in v25.
         smull           v25.4s,  \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n], lanes 0-3
         smull2          v26.4s,  \s3\().8h, \s3\().8h // lanes 4-7
         smull           v27.4s,  \s4\().4h, \s4\().4h // lanes 8-11
         mul             v25.4s,  v25.4s,  v29.4s      // sum_alt[n]^2*fact
         mla             v25.4s,  v26.4s,  v30.4s
         mla             v25.4s,  v27.4s,  v31.4s
         // Horizontal adds last, so that the destinations may reuse source
         // registers.
         addv            \d1, v22.4s                   // *cost_ptr
         addv            \d2, v25.4s                   // *cost_ptr
 .endm

 // One step of the search for the highest cost.
 // On entry: w0 = best_dir, w1 = best_cost, w3 = n, w4 = cost[n].
 // \s1 is not referenced; the caller has already moved its lane 0 to w4.
 // When \s2 and \s3 are given, cost[n+1] is taken from lane 0 of \s2,
 // w4 is reloaded from lane 0 of \s3 for the next step and n advances by 2.
 // Clobbers w5 and the flags.
 .macro find_best s1, s2, s3
 .ifnb \s2
         mov             w5,  \s2\().s[0]              // w5 = cost[n+1]
 .endif
         cmp             w4,  w1                       // cost[n] > best_cost ?
         csel            w1,  w4,  w1,  gt             // best_cost = cost[n]
         csel            w0,  w3,  w0,  gt             // best_dir = n
 .ifnb \s2
         add             w3,  w3,  #1                  // n++
         cmp             w5,  w1                       // cost[n] > best_cost ?
         csel            w1,  w5,  w1,  gt             // best_cost = cost[n]
         csel            w0,  w3,  w0,  gt             // best_dir = n
         mov             w4,  \s3\().s[0]              // preload the next cost
         add             w3,  w3,  #1                  // n++
 .endif
 .endm

 // Steps for loading and preparing each row
 // Step 1: fetch one row of 8 pixels from [x0] into \s1, then x0 += x1.
 .macro dir_load_step1 s1, bpc
 .if \bpc != 8
         ld1             {\s1\().8h}, [x0], x1         // eight 16-bit pixels
 .else
         ld1             {\s1\().8b}, [x0], x1         // eight 8-bit pixels
 .endif
 .endm

 // Step 2: for 8 bpc, widen \s1 to 16 bit while subtracting v31 (128 in
 // every byte lane). Otherwise ushl by v8, which find_dir fills with
 // -bitdepth_min_8, so the pixels are shifted right.
 .macro dir_load_step2 s1, bpc
 .if \bpc != 8
         ushl            \s1\().8h,  \s1\().8h, v8.8h
 .else
         usubl           \s1\().8h,  \s1\().8b, v31.8b
 .endif
 .endm

 // Step 3: subtract v31 (128 in every halfword lane) for the non-8-bpc
 // path; the 8 bpc path already did so during the widening in step 2.
 .macro dir_load_step3 s1, bpc
 .if \bpc == 8
 // Nothing for \bpc == 8
 .else
         sub             \s1\().8h,  \s1\().8h, v31.8h
 .endif
 .endm

 // int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
 //                                   unsigned *const var)
 //
 // In:    x0 = img, x1 = stride, x2 = var. The 16 bpc variant additionally
 //        reads bitdepth_max from w3.
 // Out:   w0 = best direction; *var = (best_cost - cost[best_dir ^ 4]) >> 10.
 // Stack: 32 bytes hold cost[0..7] as 32-bit words. The 16 bpc variant also
 //        saves and restores d8, because v8 carries the per-lane shift.
 // Eight rows of eight pixels are consumed by the unrolled .irpc loop, which
 // builds the partial sums named in the comments (sum_diag, sum_hv, sum_alt);
 // the load of the following row is interleaved with the arithmetic.
 .macro find_dir bpc
 function cdef_find_dir_\bpc\()bpc_neon, export=1
 .if \bpc == 16
         str             d8,  [sp, #-0x10]!
         clz             w3,  w3                       // clz(bitdepth_max)
         sub             w3,  w3,  #24                 // -bitdepth_min_8
         dup             v8.8h,   w3
 .endif
         sub             sp,  sp,  #32 // cost
         // NOTE(review): w3 is not read again before it is reloaded with 1
         // ahead of the find_best sequence.
         mov             w3,  #8
 .if \bpc == 8
         movi            v31.16b, #128
 .else
         movi            v31.8h,  #128
 .endif
         // v30 stays all-zero during the row loop; it is the filler operand
         // for the ext and addp instructions.
         movi            v30.16b, #0
         movi            v1.8h,   #0 // v0-v1 sum_diag[0]
         movi            v3.8h,   #0 // v2-v3 sum_diag[1]
         movi            v5.8h,   #0 // v4-v5 sum_hv[0-1]
         movi            v7.8h,   #0 // v6-v7 sum_alt[0]
         dir_load_step1  v26, \bpc       // Setup first row early
         movi            v17.8h,  #0 // v16-v17 sum_alt[1]
         movi            v18.8h,  #0 // v18-v19 sum_alt[2]
         dir_load_step2  v26, \bpc
         movi            v19.8h,  #0
         dir_load_step3  v26, \bpc
         movi            v21.8h,  #0 // v20-v21 sum_alt[3]

 // \i is the row index; v26 holds the current row on entry to each round.
 .irpc i, 01234567
         addv            h25,     v26.8h               // [y]
         rev64           v27.8h,  v26.8h
         addp            v28.8h,  v26.8h,  v30.8h      // [(x >> 1)]
         add             v5.8h,   v5.8h,   v26.8h      // sum_hv[1]
         ext             v27.16b, v27.16b, v27.16b, #8 // [-x]
         rev64           v29.4h,  v28.4h               // [-(x >> 1)]
         ins             v4.h[\i], v25.h[0]            // sum_hv[0]
 .if \i < 6
         ext             v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
         ext             v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
         add             v18.8h,  v18.8h,  v22.8h      // sum_alt[2]
         add             v19.4h,  v19.4h,  v23.4h      // sum_alt[2]
 .else
         add             v18.8h,  v18.8h,  v26.8h      // sum_alt[2]
 .endif
 .if \i == 0
         mov             v20.16b, v26.16b              // sum_alt[3]
 .elseif \i == 1
         add             v20.8h,  v20.8h,  v26.8h      // sum_alt[3]
 .else
         ext             v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
         ext             v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
         add             v20.8h,  v20.8h,  v24.8h      // sum_alt[3]
         add             v21.4h,  v21.4h,  v25.4h      // sum_alt[3]
 .endif
 .if \i == 0
         mov             v0.16b,  v26.16b              // sum_diag[0]
         dir_load_step1  v26, \bpc
         mov             v2.16b,  v27.16b              // sum_diag[1]
         dir_load_step2  v26, \bpc
         mov             v6.16b,  v28.16b              // sum_alt[0]
         dir_load_step3  v26, \bpc
         mov             v16.16b, v29.16b              // sum_alt[1]
 .else
         ext             v22.16b, v30.16b, v26.16b, #(16-2*\i)
         ext             v23.16b, v26.16b, v30.16b, #(16-2*\i)
         ext             v24.16b, v30.16b, v27.16b, #(16-2*\i)
         ext             v25.16b, v27.16b, v30.16b, #(16-2*\i)
 .if \i != 7 // Nothing to load for the final row
         dir_load_step1  v26, \bpc // Start setting up the next row early.
 .endif
         add             v0.8h,   v0.8h,   v22.8h      // sum_diag[0]
         add             v1.8h,   v1.8h,   v23.8h      // sum_diag[0]
         add             v2.8h,   v2.8h,   v24.8h      // sum_diag[1]
         add             v3.8h,   v3.8h,   v25.8h      // sum_diag[1]
 .if \i != 7
         dir_load_step2  v26, \bpc
 .endif
         ext             v22.16b, v30.16b, v28.16b, #(16-2*\i)
         ext             v23.16b, v28.16b, v30.16b, #(16-2*\i)
         ext             v24.16b, v30.16b, v29.16b, #(16-2*\i)
         ext             v25.16b, v29.16b, v30.16b, #(16-2*\i)
 .if \i != 7
         dir_load_step3  v26, \bpc
 .endif
         add             v6.8h,   v6.8h,   v22.8h      // sum_alt[0]
         add             v7.4h,   v7.4h,   v23.4h      // sum_alt[0]
         add             v16.8h,  v16.8h,  v24.8h      // sum_alt[1]
         add             v17.4h,  v17.4h,  v25.4h      // sum_alt[1]
 .endif
 .endr

         // From here on v30/v31 are reused for weights.
         movi            v31.4s,  #105

         smull           v26.4s,  v4.4h,   v4.4h       // sum_hv[0]*sum_hv[0]
         smlal2          v26.4s,  v4.8h,   v4.8h
         smull           v27.4s,  v5.4h,   v5.4h       // sum_hv[1]*sum_hv[1]
         smlal2          v27.4s,  v5.8h,   v5.8h
         mul             v26.4s,  v26.4s,  v31.4s      // cost[2] *= 105
         mul             v27.4s,  v27.4s,  v31.4s      // cost[6] *= 105
         addv            s4,  v26.4s                   // cost[2]
         addv            s5,  v27.4s                   // cost[6]

         rev64           v1.8h,   v1.8h
         rev64           v3.8h,   v3.8h
         ext             v1.16b,  v1.16b,  v1.16b, #10 // sum_diag[0][14-n]
         ext             v3.16b,  v3.16b,  v3.16b, #10 // sum_diag[1][14-n]

         str             s4,  [sp, #2*4]               // cost[2]
         str             s5,  [sp, #6*4]               // cost[6]

         movrel          x4,  div_table
         ld1             {v31.8h}, [x4]

         smull           v22.4s,  v0.4h,   v0.4h       // sum_diag[0]*sum_diag[0]
         smull2          v23.4s,  v0.8h,   v0.8h
         smlal           v22.4s,  v1.4h,   v1.4h
         smlal2          v23.4s,  v1.8h,   v1.8h
         smull           v24.4s,  v2.4h,   v2.4h       // sum_diag[1]*sum_diag[1]
         smull2          v25.4s,  v2.8h,   v2.8h
         smlal           v24.4s,  v3.4h,   v3.4h
         smlal2          v25.4s,  v3.8h,   v3.8h
         uxtl            v30.4s,  v31.4h               // div_table
         uxtl2           v31.4s,  v31.8h
         mul             v22.4s,  v22.4s,  v30.4s      // cost[0]
         mla             v22.4s,  v23.4s,  v31.4s      // cost[0]
         mul             v24.4s,  v24.4s,  v30.4s      // cost[4]
         mla             v24.4s,  v25.4s,  v31.4s      // cost[4]
         addv            s0,  v22.4s                   // cost[0]
         addv            s2,  v24.4s                   // cost[4]

         movrel          x5,  alt_fact
         ld1             {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105

         str             s0,  [sp, #0*4]               // cost[0]
         str             s2,  [sp, #4*4]               // cost[4]

         uxtl            v29.4s,  v29.4h               // div_table[2*m+1] + 105
         uxtl            v30.4s,  v30.4h
         uxtl            v31.4s,  v31.4h

         cost_alt        s6,  s16, v6,  v7,  v16, v17  // cost[1], cost[3]
         cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
         str             s6,  [sp, #1*4]               // cost[1]
         str             s16, [sp, #3*4]               // cost[3]

         mov             w0,  #0                       // best_dir
         mov             w1,  v0.s[0]                  // best_cost
         mov             w3,  #1                       // n

         str             s18, [sp, #5*4]               // cost[5]
         str             s20, [sp, #7*4]               // cost[7]

         // Lane 0 of v6, v4, v16, v2, v18, v5 and v20 now holds cost[1] to
         // cost[7] in that order; w4 is primed with cost[1] and each
         // find_best step compares two costs and preloads the next one.
         mov             w4,  v6.s[0]

         find_best       v6,  v4, v16
         find_best       v16, v2, v18
         find_best       v18, v5, v20
         find_best       v20

         eor             w3,  w0,  #4                  // best_dir ^4
         ldr             w4,  [sp, w3, uxtw #2]
         sub             w1,  w1,  w4                  // best_cost - cost[best_dir ^ 4]
         lsr             w1,  w1,  #10
         str             w1,  [x2]                     // *var

         add             sp,  sp,  #32
 .if \bpc == 16
         ldr             d8,  [sp], 0x10
 .endif
         ret
 endfunc
 .endm
	/*
	* Copyright © 2018, VideoLAN and dav1d authors
	* Copyright © 2020, Martin Storsjo
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	*
	* 1. Redistributions of source code must retain the above copyright notice, this
	* list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
	* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "src/arm/asm.S"
	#include "util.S"

	// Emits the table directions\w. Each row belongs to one direction and
	// holds the signed element offsets of the taps at distance 1 and 2,
	// i.e. dy * \stride + dx, where \stride is the row pitch of the tmp
	// buffer in elements. The (dy,dx) pairs are listed next to every row.
	.macro dir_table w, stride
	const directions\w
	.byte 1 - \stride, 2 - 2 * \stride // dir 0: (-1,+1) (-2,+2)
	.byte 1, 2 - \stride // dir 1: ( 0,+1) (-1,+2)
	.byte 1, 2 // dir 2: ( 0,+1) ( 0,+2)
	.byte 1, 2 + \stride // dir 3: ( 0,+1) (+1,+2)
	.byte 1 + \stride, 2 + 2 * \stride // dir 4: (+1,+1) (+2,+2)
	.byte \stride, 1 + 2 * \stride // dir 5: (+1, 0) (+2,+1)
	.byte \stride, 2 * \stride // dir 6: (+1, 0) (+2, 0)
	.byte \stride, 2 * \stride - 1 // dir 7: (+1, 0) (+2,-1)
	// Rows 0-5 once more: the filter reads rows dir + 2 and dir + 6, and
	// the repetition lets it do so without wrapping the index with & 7.
	.byte 1 - \stride, 2 - 2 * \stride // dir 0
	.byte 1, 2 - \stride // dir 1
	.byte 1, 2 // dir 2
	.byte 1, 2 + \stride // dir 3
	.byte 1 + \stride, 2 + 2 * \stride // dir 4
	.byte \stride, 1 + 2 * \stride // dir 5
	endconst
	.endm

	// Instantiates the constant tables referenced by the filter functions:
	// pri_taps, directions8 (row pitch 16) and directions4 (row pitch 8).
	.macro tables
	const pri_taps
	.byte 4, 2 // taps used when the low bit of the (scaled) pri_strength is 0
	.byte 3, 3 // taps used when that bit is 1
	endconst

	dir_table 8, 16
	dir_table 4, 8
	.endm

	// Loads the pair of taps located at tmp +/- off into \d1 (plus side)
	// and \d2 (minus side). x2 points at the current pixel in tmp and w9
	// carries the signed byte offset in elements. Clobbers x6 and x9.
	.macro load_px d1, d2, w
	add x6, x2, w9, sxtb #1 // x6 = &tmp[+off]
	sub x9, x2, w9, sxtb #1 // x9 = &tmp[-off]
	.if \w == 8
	// One row of eight 16-bit elements per register.
	ld1 {\d1\().8h}, [x6] // p0
	ld1 {\d2\().8h}, [x9] // p1
	.else
	// Two rows of four elements per register; the second row lies
	// 8 elements (16 bytes) further on.
	ld1 {\d1\().4h}, [x6] // p0, first row
	add x6, x6, #2*8
	ld1 {\d2\().4h}, [x9] // p1, first row
	add x9, x9, #2*8
	ld1 {\d1\().d}[1], [x6] // p0, second row
	ld1 {\d2\().d}[1], [x9] // p1, second row
	.endif
	.endm
	// Accumulates one tap pair into the running sum:
	//   v1 += \tap * (constrain(\s1 - px) + constrain(\s2 - px))
	// with px in v0, the threshold in \thresh_vec and the negated shift
	// amount in \shift (ushl by a negative count shifts right). When \min
	// is set, v2/v3 are also updated with the running minimum/maximum of
	// the taps. Clobbers v16-v22.
	.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
	dup v19.8h, \tap // taps[k] in every lane
	.if \min
	// NOTE(review): the minimum uses an unsigned compare and the maximum
	// a signed one - presumably so that a fill value for unavailable
	// pixels is ignored by both; confirm against the padding code.
	umin v2.8h, v2.8h, \s1\().8h
	umin v2.8h, v2.8h, \s2\().8h
	smax v3.8h, v3.8h, \s1\().8h
	smax v3.8h, v3.8h, \s2\().8h
	.endif
	sub v18.8h, \s1\().8h, v0.8h // d0 = p0 - px
	sub v22.8h, \s2\().8h, v0.8h // d1 = p1 - px
	uabd v16.8h, v0.8h, \s1\().8h // |d0|
	uabd v20.8h, v0.8h, \s2\().8h // |d1|
	ushl v17.8h, v16.8h, \shift // |d0| >> shift
	ushl v21.8h, v20.8h, \shift // |d1| >> shift
	uqsub v17.8h, \thresh_vec, v17.8h // c0 = imax(0, threshold - (|d0| >> shift))
	uqsub v21.8h, \thresh_vec, v21.8h // c1 = imax(0, threshold - (|d1| >> shift))
	neg v16.8h, v17.8h // -c0
	neg v20.8h, v21.8h // -c1
	smin v18.8h, v18.8h, v17.8h // imin(d0, c0)
	smin v22.8h, v22.8h, v21.8h // imin(d1, c1)
	smax v18.8h, v18.8h, v16.8h // constrain(d0) = imax(imin(d0, c0), -c0)
	smax v22.8h, v22.8h, v20.8h // constrain(d1) = imax(imin(d1, c1), -c1)
	mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain(d0)
	mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain(d1)
	.endm

	// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
	// const uint16_t *tmp, int pri_strength,
	// int sec_strength, int dir, int damping,
	// int h, size_t edges);
	//
	// filter_func emits one specialisation named
	// cdef_filter\w\suffix\()_\bpc\()bpc_neon.
	//   \w     block width, 8 or 4 (4-wide blocks are handled two rows at a time)
	//   \pri   include the primary tap pair
	//   \sec   include the two secondary tap pairs
	//   \min   track min/max of px and all taps and clamp the result to them
	// Registers as used by the code below:
	//   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
	//   w4 = sec_strength, w5 = dir, w6 = damping, w7 = h
	//   [sp] = edges (read for 8 bpc only), [sp, #8] = bitdepth_max (16 bpc only)
	//   x5 = cursor into directions\w, x8 = cursor into pri_taps
	//   w11 = secondary tap weight, doubling as the inner loop counter
	//   v25/v27 = pri/sec threshold, v24/v26 = negated pri/sec shift
	.macro filter_func w, bpc, pri, sec, min, suffix
	function cdef_filter\w\suffix\()_\bpc\()bpc_neon
	.if \bpc == 8
	// edges == 0xf is handed over to the separate _edged_8bpc function.
	ldr w8, [sp] // edges
	cmp w8, #0xf
	b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
	.endif
	.if \pri
	.if \bpc == 16
	ldr w9, [sp, #8] // bitdepth_max
	clz w9, w9
	sub w9, w9, #24 // -bitdepth_min_8
	neg w9, w9 // bitdepth_min_8
	.endif
	movrel x8, pri_taps
	.if \bpc == 16
	lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
	and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
	.else
	and w9, w3, #1
	.endif
	// Select one of the two 2-byte tap pairs in pri_taps.
	add x8, x8, w9, uxtw #1
	.endif
	movrel x9, directions\w
	// Each direction occupies 2 bytes in the table.
	add x5, x9, w5, uxtw #1
	movi v30.4h, #15
	dup v28.4h, w6 // damping

	.if \pri
	dup v25.8h, w3 // threshold
	.endif
	.if \sec
	dup v27.8h, w4 // threshold
	.endif
	// Lane 0 = pri threshold, lane 1 = sec threshold, so both shifts are
	// derived in one pass. A lane whose source register was not set up
	// above is never read back below.
	trn1 v24.4h, v25.4h, v27.4h
	clz v24.4h, v24.4h // clz(threshold)
	sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
	uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
	neg v24.4h, v24.4h // -shift
	.if \sec
	dup v26.8h, v24.h[1]
	.endif
	.if \pri
	dup v24.8h, v24.h[0]
	.endif

	// Outer loop: one row (w == 8) or two rows (w == 4) per iteration.
	1:
	.if \w == 8
	ld1 {v0.8h}, [x2] // px
	.else
	add x12, x2, #2*8
	ld1 {v0.4h}, [x2] // px
	ld1 {v0.d}[1], [x12] // px
	.endif

	movi v1.8h, #0 // sum
	.if \min
	mov v2.16b, v0.16b // min
	mov v3.16b, v0.16b // max
	.endif

	// Instead of loading sec_taps 2, 1 from memory, just set it
	// to 2 initially and decrease for the second round.
	// This is also used as loop counter.
	mov w11, #2 // sec_taps[0]

	// Inner loop: two rounds, one per tap distance.
	2:
	.if \pri
	ldrb w9, [x5] // off1

	load_px v4, v5, \w
	.endif

	.if \sec
	add x5, x5, #4 // +2*2
	ldrb w9, [x5] // off2
	load_px v6, v7, \w
	.endif

	.if \pri
	ldrb w10, [x8] // *pri_taps

	handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
	.endif

	.if \sec
	add x5, x5, #8 // +2*4
	ldrb w9, [x5] // off3
	load_px v4, v5, \w

	handle_pixel v6, v7, v27.8h, v26.8h, w11, \min

	handle_pixel v4, v5, v27.8h, v26.8h, w11, \min

	sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
	.else
	add x5, x5, #1 // x5 += 1
	.endif
	subs w11, w11, #1 // sec_tap-- (value)
	.if \pri
	// add does not touch the flags set by subs above.
	add x8, x8, #1 // pri_taps++ (pointer)
	.endif
	b.ne 2b

	cmlt v4.8h, v1.8h, #0 // -(sum < 0)
	add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
	srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
	add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
	.if \min
	smin v0.8h, v0.8h, v3.8h
	smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
	.endif
	.if \bpc == 8
	xtn v0.8b, v0.8h
	.endif
	// The subs on w7 below sets the flags tested by the final b.gt;
	// the stores and the sub instructions in between leave them alone.
	.if \w == 8
	add x2, x2, #2*16 // tmp += tmp_stride
	subs w7, w7, #1 // h--
	.if \bpc == 8
	st1 {v0.8b}, [x0], x1
	.else
	st1 {v0.8h}, [x0], x1
	.endif
	.else
	.if \bpc == 8
	st1 {v0.s}[0], [x0], x1
	.else
	st1 {v0.d}[0], [x0], x1
	.endif
	// Two rows were consumed and a 4-wide tmp row is 8 uint16_t (16 bytes)
	// long, so the advance is 2*16 = 32 bytes.
	add x2, x2, #2*16 // tmp += 2*tmp_stride
	subs w7, w7, #2 // h -= 2
	.if \bpc == 8
	st1 {v0.s}[1], [x0], x1
	.else
	st1 {v0.d}[1], [x0], x1
	.endif
	.endif

	// Reset pri_taps and directions back to the original point
	sub x5, x5, #2
	.if \pri
	sub x8, x8, #2
	.endif

	b.gt 1b
	ret
	endfunc
	.endm

	// Emits the three specialised filters for width \w and bit depth \bpc,
	// followed by the exported entry point, which tail-branches to one of
	// them depending on which of the two strengths are non-zero.
	.macro filter w, bpc
	filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
	filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
	filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec

	function cdef_filter\w\()_\bpc\()bpc_neon, export=1
	cbnz w3, 1f // pri_strength != 0 ?
	b cdef_filter\w\()_sec_\bpc\()bpc_neon // no: secondary only
	1:
	cbnz w4, 2f // sec_strength != 0 ?
	b cdef_filter\w\()_pri_\bpc\()bpc_neon // no: primary only
	2:
	b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // primary and secondary
	endfunc
	.endm

	// 840 / n for n = 1..8. find_dir loads all eight entries with one ld1
	// and widens them to 32 bit as weights for the squared diagonal sums.
	const div_table
	.short 840, 420, 280, 210 // 840/1 .. 840/4
	.short 168, 140, 120, 105 // 840/5 .. 840/8
	endconst

	// Weights for the squared sum_alt entries. find_dir loads them as three
	// .4h vectors into v29, v30 and v31; the final entry is 0, so the
	// twelfth lane contributes nothing.
	const alt_fact
	.short 420, 210, 140, 105 // -> v29
	.short 105, 105, 105, 105 // -> v30
	.short 140, 210, 420, 0 // -> v31
	endconst

	// Computes two weighted sums of squares:
	//   \d1 = sum over lanes of \s1[0-3]^2*v29 + \s1[4-7]^2*v30 + \s2[0-3]^2*v31
	//   \d2 = the same for \s3 / \s4
	// v29-v31 must already hold the 32-bit weights. Clobbers v22-v27.
	.macro cost_alt d1, d2, s1, s2, s3, s4
	// First cost, accumulated in v22.
	smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n], lanes 0-3
	smull2 v23.4s, \s1\().8h, \s1\().8h // lanes 4-7
	smull v24.4s, \s2\().4h, \s2\().4h // lanes 8-11
	mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
	mla v22.4s, v23.4s, v30.4s
	mla v22.4s, v24.4s, v31.4s
	// Second cost, accumulated in v25.
	smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n], lanes 0-3
	smull2 v26.4s, \s3\().8h, \s3\().8h // lanes 4-7
	smull v27.4s, \s4\().4h, \s4\().4h // lanes 8-11
	mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
	mla v25.4s, v26.4s, v30.4s
	mla v25.4s, v27.4s, v31.4s
	// Horizontal adds last, so that the destinations may reuse source
	// registers.
	addv \d1, v22.4s // *cost_ptr
	addv \d2, v25.4s // *cost_ptr
	.endm

	// One step of the search for the highest cost.
	// On entry: w0 = best_dir, w1 = best_cost, w3 = n, w4 = cost[n].
	// \s1 is not referenced; the caller has already moved its lane 0 to w4.
	// When \s2 and \s3 are given, cost[n+1] is taken from lane 0 of \s2,
	// w4 is reloaded from lane 0 of \s3 for the next step and n advances
	// by 2. Clobbers w5 and the flags.
	.macro find_best s1, s2, s3
	.ifnb \s2
	mov w5, \s2\().s[0] // w5 = cost[n+1]
	.endif
	cmp w4, w1 // cost[n] > best_cost ?
	csel w1, w4, w1, gt // best_cost = cost[n]
	csel w0, w3, w0, gt // best_dir = n
	.ifnb \s2
	add w3, w3, #1 // n++
	cmp w5, w1 // cost[n] > best_cost ?
	csel w1, w5, w1, gt // best_cost = cost[n]
	csel w0, w3, w0, gt // best_dir = n
	mov w4, \s3\().s[0] // preload the next cost
	add w3, w3, #1 // n++
	.endif
	.endm

	// Steps for loading and preparing each row
	// Step 1: fetch one row of 8 pixels from [x0] into \s1, then x0 += x1.
	.macro dir_load_step1 s1, bpc
	.if \bpc != 8
	ld1 {\s1\().8h}, [x0], x1 // eight 16-bit pixels
	.else
	ld1 {\s1\().8b}, [x0], x1 // eight 8-bit pixels
	.endif
	.endm

	// Step 2: for 8 bpc, widen \s1 to 16 bit while subtracting v31 (128 in
	// every byte lane). Otherwise ushl by v8, which find_dir fills with
	// -bitdepth_min_8, so the pixels are shifted right.
	.macro dir_load_step2 s1, bpc
	.if \bpc != 8
	ushl \s1\().8h, \s1\().8h, v8.8h
	.else
	usubl \s1\().8h, \s1\().8b, v31.8b
	.endif
	.endm

	// Step 3: subtract v31 (128 in every halfword lane) for the non-8-bpc
	// path; the 8 bpc path already did so during the widening in step 2.
	.macro dir_load_step3 s1, bpc
	.if \bpc == 8
	// Nothing for \bpc == 8
	.else
	sub \s1\().8h, \s1\().8h, v31.8h
	.endif
	.endm

	// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
	// unsigned *const var)
	//
	// In:    x0 = img, x1 = stride, x2 = var. The 16 bpc variant additionally
	//        reads bitdepth_max from w3.
	// Out:   w0 = best direction; *var = (best_cost - cost[best_dir ^ 4]) >> 10.
	// Stack: 32 bytes hold cost[0..7] as 32-bit words. The 16 bpc variant also
	//        saves and restores d8, because v8 carries the per-lane shift.
	// Eight rows of eight pixels are consumed by the unrolled .irpc loop, which
	// builds the partial sums named in the comments (sum_diag, sum_hv, sum_alt);
	// the load of the following row is interleaved with the arithmetic.
	.macro find_dir bpc
	function cdef_find_dir_\bpc\()bpc_neon, export=1
	.if \bpc == 16
	str d8, [sp, #-0x10]!
	clz w3, w3 // clz(bitdepth_max)
	sub w3, w3, #24 // -bitdepth_min_8
	dup v8.8h, w3
	.endif
	sub sp, sp, #32 // cost
	// NOTE(review): w3 is not read again before it is reloaded with 1
	// ahead of the find_best sequence.
	mov w3, #8
	.if \bpc == 8
	movi v31.16b, #128
	.else
	movi v31.8h, #128
	.endif
	// v30 stays all-zero during the row loop; it is the filler operand
	// for the ext and addp instructions.
	movi v30.16b, #0
	movi v1.8h, #0 // v0-v1 sum_diag[0]
	movi v3.8h, #0 // v2-v3 sum_diag[1]
	movi v5.8h, #0 // v4-v5 sum_hv[0-1]
	movi v7.8h, #0 // v6-v7 sum_alt[0]
	dir_load_step1 v26, \bpc // Setup first row early
	movi v17.8h, #0 // v16-v17 sum_alt[1]
	movi v18.8h, #0 // v18-v19 sum_alt[2]
	dir_load_step2 v26, \bpc
	movi v19.8h, #0
	dir_load_step3 v26, \bpc
	movi v21.8h, #0 // v20-v21 sum_alt[3]

	// \i is the row index; v26 holds the current row on entry to each round.
	.irpc i, 01234567
	addv h25, v26.8h // [y]
	rev64 v27.8h, v26.8h
	addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
	add v5.8h, v5.8h, v26.8h // sum_hv[1]
	ext v27.16b, v27.16b, v27.16b, #8 // [-x]
	rev64 v29.4h, v28.4h // [-(x >> 1)]
	ins v4.h[\i], v25.h[0] // sum_hv[0]
	.if \i < 6
	ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
	ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
	add v18.8h, v18.8h, v22.8h // sum_alt[2]
	add v19.4h, v19.4h, v23.4h // sum_alt[2]
	.else
	add v18.8h, v18.8h, v26.8h // sum_alt[2]
	.endif
	.if \i == 0
	mov v20.16b, v26.16b // sum_alt[3]
	.elseif \i == 1
	add v20.8h, v20.8h, v26.8h // sum_alt[3]
	.else
	ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
	ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
	add v20.8h, v20.8h, v24.8h // sum_alt[3]
	add v21.4h, v21.4h, v25.4h // sum_alt[3]
	.endif
	.if \i == 0
	mov v0.16b, v26.16b // sum_diag[0]
	dir_load_step1 v26, \bpc
	mov v2.16b, v27.16b // sum_diag[1]
	dir_load_step2 v26, \bpc
	mov v6.16b, v28.16b // sum_alt[0]
	dir_load_step3 v26, \bpc
	mov v16.16b, v29.16b // sum_alt[1]
	.else
	ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
	ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
	ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
	ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
	.if \i != 7 // Nothing to load for the final row
	dir_load_step1 v26, \bpc // Start setting up the next row early.
	.endif
	add v0.8h, v0.8h, v22.8h // sum_diag[0]
	add v1.8h, v1.8h, v23.8h // sum_diag[0]
	add v2.8h, v2.8h, v24.8h // sum_diag[1]
	add v3.8h, v3.8h, v25.8h // sum_diag[1]
	.if \i != 7
	dir_load_step2 v26, \bpc
	.endif
	ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
	ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
	ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
	ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
	.if \i != 7
	dir_load_step3 v26, \bpc
	.endif
	add v6.8h, v6.8h, v22.8h // sum_alt[0]
	add v7.4h, v7.4h, v23.4h // sum_alt[0]
	add v16.8h, v16.8h, v24.8h // sum_alt[1]
	add v17.4h, v17.4h, v25.4h // sum_alt[1]
	.endif
	.endr

	// From here on v30/v31 are reused for weights.
	movi v31.4s, #105

	smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
	smlal2 v26.4s, v4.8h, v4.8h
	smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
	smlal2 v27.4s, v5.8h, v5.8h
	mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
	mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
	addv s4, v26.4s // cost[2]
	addv s5, v27.4s // cost[6]

	rev64 v1.8h, v1.8h
	rev64 v3.8h, v3.8h
	ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
	ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]

	str s4, [sp, #2*4] // cost[2]
	str s5, [sp, #6*4] // cost[6]

	movrel x4, div_table
	ld1 {v31.8h}, [x4]

	smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
	smull2 v23.4s, v0.8h, v0.8h
	smlal v22.4s, v1.4h, v1.4h
	smlal2 v23.4s, v1.8h, v1.8h
	smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
	smull2 v25.4s, v2.8h, v2.8h
	smlal v24.4s, v3.4h, v3.4h
	smlal2 v25.4s, v3.8h, v3.8h
	uxtl v30.4s, v31.4h // div_table
	uxtl2 v31.4s, v31.8h
	mul v22.4s, v22.4s, v30.4s // cost[0]
	mla v22.4s, v23.4s, v31.4s // cost[0]
	mul v24.4s, v24.4s, v30.4s // cost[4]
	mla v24.4s, v25.4s, v31.4s // cost[4]
	addv s0, v22.4s // cost[0]
	addv s2, v24.4s // cost[4]

	movrel x5, alt_fact
	ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105

	str s0, [sp, #0*4] // cost[0]
	str s2, [sp, #4*4] // cost[4]

	uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
	uxtl v30.4s, v30.4h
	uxtl v31.4s, v31.4h

	cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
	cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
	str s6, [sp, #1*4] // cost[1]
	str s16, [sp, #3*4] // cost[3]

	mov w0, #0 // best_dir
	mov w1, v0.s[0] // best_cost
	mov w3, #1 // n

	str s18, [sp, #5*4] // cost[5]
	str s20, [sp, #7*4] // cost[7]

	// Lane 0 of v6, v4, v16, v2, v18, v5 and v20 now holds cost[1] to
	// cost[7] in that order; w4 is primed with cost[1] and each
	// find_best step compares two costs and preloads the next one.
	mov w4, v6.s[0]

	find_best v6, v4, v16
	find_best v16, v2, v18
	find_best v18, v5, v20
	find_best v20

	eor w3, w0, #4 // best_dir ^4
	ldr w4, [sp, w3, uxtw #2]
	sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
	lsr w1, w1, #10
	str w1, [x2] // *var

	add sp, sp, #32
	.if \bpc == 16
	ldr d8, [sp], 0x10
	.endif
	ret
	endfunc
	.endm