/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
tst w6, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
sub \s1, \s1, #2
sub \s2, \s2, #2
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
ldr \rn\()0, [\s1]
ldr s1, [\s1, #\w]
ldr \rn\()2, [\s2]
ldr s3, [\s2, #\w]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
str \rw\()0, [x0]
str d1, [x0, #2*\w]
add x0, x0, #2*\stride
str \rw\()2, [x0]
str d3, [x0, #2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ldr \rn\()0, [\s1]
ldr h1, [\s1, #\w]
ldr \rn\()2, [\s2]
ldr h3, [\s2, #\w]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
str \rw\()0, [x0]
str s1, [x0, #2*\w]
str s31, [x0, #2*\w+4]
add x0, x0, #2*\stride
str \rw\()2, [x0]
str s3, [x0, #2*\w]
str s31, [x0, #2*\w+4]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
2:
// !CDEF_HAVE_LEFT
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
ldr \rn\()0, [\s1]
ldr h1, [\s1, #\w]
ldr \rn\()2, [\s2]
ldr h3, [\s2, #\w]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \rw\()2, [x0, #4]
str s3, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ldr \rn\()0, [\s1]
ldr \rn\()1, [\s2]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \rw\()1, [x0, #4]
str s31, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
.endif
3:
.endm
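// Load one row of \w pixels from \src (a single 32-bit lane for w == 4,
// a full 8-byte vector for w == 8) and advance \src by \incr.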
.macro load_n_incr dst, src, incr, w
.if \w == 4
ld1 {\dst\().s}[0], [\src], \incr
.else
ld1 {\dst\().8b}, [\src], \incr
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
// enum CdefEdgeFlags edges);
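// \rn/\rw are the register suffixes for the narrow 8-bit loads and wide
// 16-bit stores of one \w-pixel row: d/q for w == 8, s/d for w == 4.
// A rough C model of the buffer this fills in (hedged: available() and
// source_pixel() are illustrative placeholders, not the C reference):
//
//   // tmp points at the first real pixel of a (w + 4) x (h + 4)
//   // uint16_t buffer with row stride tmp_stride (16 for w == 8,
//   // 8 for w == 4).
//   uint16_t *base = tmp - 2 * tmp_stride - 2;
//   for (int y = 0; y < h + 4; y++)
//       for (int x = 0; x < w + 4; x++)
//           base[y * tmp_stride + x] =
//               available(y, x, edges) ? source_pixel(y, x) // widened u8
//                                      : 0x8000;            // flag value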
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_neon, export=1
movi v30.8h, #0x80, lsl #8 // 0x8000 in each lane: the "invalid pixel" flag
mov v31.16b, v30.16b
sub x0, x0, #2*(2*\stride+2)
tst w6, #4 // CDEF_HAVE_TOP
b.ne 1f
// !CDEF_HAVE_TOP
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
b 3f
1:
// CDEF_HAVE_TOP
ldr x8, [x4] // top[0]
ldr x9, [x4, #8] // top[1]
pad_top_bottom x8, x9, \w, \stride, \rn, \rw, 0
// Middle section
3:
tst w6, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ld1 {v0.h}[0], [x3], #2
ldr h2, [x1, #\w]
load_n_incr v1, x1, x2, \w
subs w5, w5, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
str s0, [x0]
stur \rw\()1, [x0, #4]
str s2, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ld1 {v0.h}[0], [x3], #2
load_n_incr v1, x1, x2, \w
subs w5, w5, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
str s0, [x0]
stur \rw\()1, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
b 3f
2:
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ldr h1, [x1, #\w]
load_n_incr v0, x1, x2, \w
subs w5, w5, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
load_n_incr v0, x1, x2, \w
subs w5, w5, #1
uxtl v0.8h, v0.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
3:
tst w6, #8 // CDEF_HAVE_BOTTOM
b.ne 1f
// !CDEF_HAVE_BOTTOM
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
ret
1:
// CDEF_HAVE_BOTTOM
add x9, x1, x2 // src + src_stride; the two rows below the block
pad_top_bottom x1, x9, \w, \stride, \rn, \rw, 1
endfunc
.endm
padding_func 8, 16, d, q
padding_func 4, 8, s, d
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
.byte 1 * \stride + 0, 2 * \stride + 0
.byte 1 * \stride + 0, 2 * \stride - 1
// First 6 entries repeated, to avoid & 7 when indexing with dir + 2 and dir + 6
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
endconst
.endm
dir_table 8, 16
dir_table 4, 8
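// Each entry in the directions tables above is a pair of element offsets
// into the padded tmp buffer (row stride \stride): the taps at distance 1
// and 2 along one of the 8 CDEF directions. The filter doubles them when
// forming byte addresses (see the "sxtb #1" in load_px below).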
const pri_taps
.byte 4, 2, 3, 3
endconst
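// pri_taps is indexed as pri_taps[2 * (pri_strength & 1)]: even primary
// strengths use taps {4, 2}, odd ones {3, 3} (see "and w9, w3, #1" in
// the filter function below).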
.macro load_px d1, d2, w
.if \w == 8
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().8h}, [x6] // p0
ld1 {\d2\().8h}, [x9] // p1
.else
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().4h}, [x6] // p0
add x6, x6, #2*8 // += stride
ld1 {\d2\().4h}, [x9] // p1
add x9, x9, #2*8 // += stride
ld1 {\d1\().d}[1], [x6] // p0
ld1 {\d2\().d}[1], [x9] // p1
.endif
.endm
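// Accumulate one pair of taps into the running sum (v1), tracking the
// per-lane min (v2) and max (v3) along the way. Note the asymmetry:
// umin for the min but smax for the max. The 0x8000 padding flag is a
// huge unsigned value but the most negative signed one, so both ignore
// it without any extra masking.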
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
umin v2.8h, v2.8h, \s1\().8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
smax v3.8h, v3.8h, \s2\().8h
cbz \threshold, 3f
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
neg v16.8h, v17.8h // -clip
neg v20.8h, v21.8h // -clip
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
dup v19.8h, \tap // taps[k]
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
3:
.endm
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
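// A rough C model of the per-pixel update performed below (hedged:
// imax/imin/iclip as in typical dav1d C code, the rest illustrative):
//
//   int constrain(int diff, int threshold, int shift) {
//       if (!threshold) return 0;
//       const int clip = imax(0, threshold - (abs(diff) >> shift));
//       return iclip(diff, -clip, clip);
//   }
//   // shift = imax(0, damping - ulog2(strength)), per strength;
//   // for each of the 2 primary and 4 secondary tap positions p:
//   //     sum += taps[k] * constrain(p - px, strength, shift);
//   // out = px + ((8 + sum - (sum < 0)) >> 4), clamped to the min/max
//   // of all pixels visited (padding excluded).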
.macro filter w
function cdef_filter\w\()_neon, export=1
movrel x8, pri_taps
and w9, w3, #1
add x8, x8, w9, uxtw #1
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.4h, #15
dup v28.4h, w6 // damping
dup v25.8h, w3 // pri threshold
dup v27.8h, w4 // sec threshold
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
dup v26.8h, v24.h[1]
dup v24.8h, v24.h[0]
1:
.if \w == 8
ld1 {v0.8h}, [x2] // px
.else
add x12, x2, #2*8 // tmp + tmp_stride
ld1 {v0.4h}, [x2] // px
ld1 {v0.d}[1], [x12] // px
.endif
movi v1.8h, #0 // sum
mov v2.16b, v0.16b // min
mov v3.16b, v0.16b // max
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
mov w11, #2 // sec_taps[0]
2:
ldrb w9, [x5] // off1
load_px v4, v5, \w
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px v6, v7, \w
ldrb w10, [x8] // *pri_taps
handle_pixel v4, v5, w3, v25.8h, v24.8h, w10
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px v4, v5, \w
handle_pixel v6, v7, w4, v27.8h, v26.8h, w11
handle_pixel v4, v5, w4, v27.8h, v26.8h, w11
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
subs w11, w11, #1 // sec_tap-- (value)
add x8, x8, #1 // pri_taps++ (pointer)
b.ne 2b
sshr v4.8h, v1.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
smin v0.8h, v0.8h, v3.8h
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
xtn v0.8b, v0.8h
.if \w == 8
add x2, x2, #2*16 // tmp += tmp_stride
subs w7, w7, #1 // h--
st1 {v0.8b}, [x0], x1
.else
st1 {v0.s}[0], [x0], x1
add x2, x2, #2*16 // tmp += 2*tmp_stride
subs w7, w7, #2 // h -= 2
st1 {v0.s}[1], [x0], x1
.endif
// Reset pri_taps (x8) and the direction offsets (x5) back to the original point
sub x5, x5, #2
sub x8, x8, #2
b.gt 1b
ret
endfunc
.endm
filter 8
filter 4
const div_table
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
const alt_fact
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
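// div_table[i] = 840 / (i + 1). alt_fact folds the weights for the "alt"
// direction costs into one symmetric table: div_table[2*m + 1] for the
// three outermost diagonals on each side (420, 210, 140) and a flat 105
// for the middle five, plus a trailing 0 to pad the 3-register ld1 below.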
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
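// A rough C model of the search (hedged: the directional partial sums
// and their weighting are condensed into weighted_ssq() here):
//
//   unsigned cost[8];
//   for (int n = 0; n < 8; n++)
//       cost[n] = weighted_ssq(partial_sums[n]); // see div_table/alt_fact
//   int best_dir = 0;
//   unsigned best_cost = cost[0];
//   for (int n = 1; n < 8; n++)
//       if (cost[n] > best_cost) { best_cost = cost[n]; best_dir = n; }
//   *var = (best_cost - cost[best_dir ^ 4]) >> 10;
//   return best_dir;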
function cdef_find_dir_neon, export=1
sub sp, sp, #32 // cost
mov w3, #8
movi v31.16b, #128
movi v30.16b, #0
movi v1.8h, #0 // v0-v1 sum_diag[0]
movi v3.8h, #0 // v2-v3 sum_diag[1]
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
movi v7.8h, #0 // v6-v7 sum_alt[0]
movi v17.8h, #0 // v16-v17 sum_alt[1]
movi v18.8h, #0 // v18-v19 sum_alt[2]
movi v19.8h, #0
movi v21.8h, #0 // v20-v21 sum_alt[3]
.irpc i, 01234567
ld1 {v26.8b}, [x0], x1
usubl v26.8h, v26.8b, v31.8b
addv h25, v26.8h // [y]
rev64 v27.8h, v26.8h
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
add v5.8h, v5.8h, v26.8h // sum_hv[1]
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
rev64 v29.4h, v28.4h // [-(x >> 1)]
ins v4.h[\i], v25.h[0] // sum_hv[0]
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
mov v2.16b, v27.16b // sum_diag[1]
mov v6.16b, v28.16b // sum_alt[0]
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.if \i < 6
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
add v18.8h, v18.8h, v22.8h // sum_alt[2]
add v19.4h, v19.4h, v23.4h // sum_alt[2]
.else
add v18.8h, v18.8h, v26.8h // sum_alt[2]
.endif
.if \i == 0
mov v20.16b, v26.16b // sum_alt[3]
.elseif \i == 1
add v20.8h, v20.8h, v26.8h // sum_alt[3]
.else
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
add v20.8h, v20.8h, v24.8h // sum_alt[3]
add v21.4h, v21.4h, v25.4h // sum_alt[3]
.endif
.endr
movi v31.4s, #105
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
smlal2 v26.4s, v4.8h, v4.8h
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
smlal2 v27.4s, v5.8h, v5.8h
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
addv s4, v26.4s // cost[2]
addv s5, v27.4s // cost[6]
rev64 v1.8h, v1.8h
rev64 v3.8h, v3.8h
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
str s4, [sp, #2*4] // cost[2]
str s5, [sp, #6*4] // cost[6]
movrel x4, div_table
ld1 {v31.8h}, [x4]
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
smull2 v23.4s, v0.8h, v0.8h
smlal v22.4s, v1.4h, v1.4h
smlal2 v23.4s, v1.8h, v1.8h
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
smull2 v25.4s, v2.8h, v2.8h
smlal v24.4s, v3.4h, v3.4h
smlal2 v25.4s, v3.8h, v3.8h
uxtl v30.4s, v31.4h // div_table
uxtl2 v31.4s, v31.8h
mul v22.4s, v22.4s, v30.4s // cost[0]
mla v22.4s, v23.4s, v31.4s // cost[0]
mul v24.4s, v24.4s, v30.4s // cost[4]
mla v24.4s, v25.4s, v31.4s // cost[4]
addv s0, v22.4s // cost[0]
addv s2, v24.4s // cost[4]
movrel x5, alt_fact
ld1 {v29.4h, v30.4h, v31.4h}, [x5] // div_table[2*m+1] + 105
str s0, [sp, #0*4] // cost[0]
str s2, [sp, #4*4] // cost[4]
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
uxtl v30.4s, v30.4h
uxtl v31.4s, v31.4h
.macro cost_alt d1, d2, s1, s2, s3, s4
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
smull2 v23.4s, \s1\().8h, \s1\().8h
smull v24.4s, \s2\().4h, \s2\().4h
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
smull2 v26.4s, \s3\().8h, \s3\().8h
smull v27.4s, \s4\().4h, \s4\().4h
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
mla v22.4s, v23.4s, v30.4s
mla v22.4s, v24.4s, v31.4s
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
mla v25.4s, v26.4s, v30.4s
mla v25.4s, v27.4s, v31.4s
addv \d1, v22.4s // *cost_ptr
addv \d2, v25.4s // *cost_ptr
.endm
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
str s6, [sp, #1*4] // cost[1]
str s16, [sp, #3*4] // cost[3]
mov w0, #0 // best_dir
mov w1, v0.s[0] // best_cost
mov w3, #1 // n
str s18, [sp, #5*4] // cost[5]
str s20, [sp, #7*4] // cost[7]
mov w4, v6.s[0]
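// Scan the costs for the maximum. Each invocation handles two candidates
// and preloads the cost for the next one (\s3) to hide the mov latency;
// \s1 only documents which cost is already sitting in w4.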
.macro find_best s1, s2, s3
.ifnb \s2
mov w5, \s2\().s[0]
.endif
cmp w4, w1 // cost[n] > best_cost
csel w0, w3, w0, gt // best_dir = n
csel w1, w4, w1, gt // best_cost = cost[n]
.ifnb \s2
add w3, w3, #1 // n++
cmp w5, w1 // cost[n] > best_cost
mov w4, \s3\().s[0]
csel w0, w3, w0, gt // best_dir = n
csel w1, w5, w1, gt // best_cost = cost[n]
add w3, w3, #1 // n++
.endif
.endm
find_best v6, v4, v16
find_best v16, v2, v18
find_best v18, v5, v20
find_best v20
eor w3, w0, #4 // best_dir ^ 4
ldr w4, [sp, w3, uxtw #2]
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
lsr w1, w1, #10
str w1, [x2] // *var
add sp, sp, #32
ret
endfunc