| /* | 
 |  * Copyright © 2018, VideoLAN and dav1d authors | 
 |  * Copyright © 2019, Martin Storsjo | 
 |  * All rights reserved. | 
 |  * | 
 |  * Redistribution and use in source and binary forms, with or without | 
 |  * modification, are permitted provided that the following conditions are met: | 
 |  * | 
 |  * 1. Redistributions of source code must retain the above copyright notice, this | 
 |  *    list of conditions and the following disclaimer. | 
 |  * | 
 |  * 2. Redistributions in binary form must reproduce the above copyright notice, | 
 |  *    this list of conditions and the following disclaimer in the documentation | 
 |  *    and/or other materials provided with the distribution. | 
 |  * | 
 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | 
 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | 
 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | 
 |  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR | 
 |  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | 
 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | 
 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | 
 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 
 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
 |  */ | 
 |  | 
 | #include "src/arm/asm.S" | 
 | #include "util.S" | 
 |  | 
 | // void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                             const pixel *const topleft, | 
 | //                             const int width, const int height, const int a, | 
 | //                             const int max_width, const int max_height); | 
 | function ipred_dc_128_8bpc_neon, export=1 | 
 |         clz             w3,  w3 | 
 |         adr             x5,  L(ipred_dc_128_tbl) | 
 |         sub             w3,  w3,  #25 | 
 |         ldrh            w3,  [x5, w3, uxtw #1] | 
 |         movi            v0.16b,  #128 | 
 |         sub             x5,  x5,  w3, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 | 4: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         b.gt            4b | 
 |         ret | 
 | 8: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         b.gt            8b | 
 |         ret | 
 | 16: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         b.gt            16b | 
 |         ret | 
 | 320: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         movi            v1.16b,  #128 | 
 | 32: | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         b.gt            32b | 
 |         ret | 
 | 640: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         movi            v1.16b,  #128 | 
 |         movi            v2.16b,  #128 | 
 |         movi            v3.16b,  #128 | 
 | 64: | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         b.gt            64b | 
 |         ret | 
 |  | 
 | L(ipred_dc_128_tbl): | 
 |         .hword L(ipred_dc_128_tbl) - 640b | 
 |         .hword L(ipred_dc_128_tbl) - 320b | 
 |         .hword L(ipred_dc_128_tbl) -  16b | 
 |         .hword L(ipred_dc_128_tbl) -   8b | 
 |         .hword L(ipred_dc_128_tbl) -   4b | 
 | endfunc | 
 |  | 
 | // void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                        const pixel *const topleft, | 
 | //                        const int width, const int height, const int a, | 
 | //                        const int max_width, const int max_height); | 
 | function ipred_v_8bpc_neon, export=1 | 
 |         clz             w3,  w3 | 
 |         adr             x5,  L(ipred_v_tbl) | 
 |         sub             w3,  w3,  #25 | 
 |         ldrh            w3,  [x5, w3, uxtw #1] | 
 |         add             x2,  x2,  #1 | 
 |         sub             x5,  x5,  w3, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 | 40: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.s}[0],  [x2] | 
 | 4: | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         b.gt            4b | 
 |         ret | 
 | 80: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.8b},  [x2] | 
 | 8: | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         b.gt            8b | 
 |         ret | 
 | 160: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b}, [x2] | 
 | 16: | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         b.gt            16b | 
 |         ret | 
 | 320: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b, v1.16b}, [x2] | 
 | 32: | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         b.gt            32b | 
 |         ret | 
 | 640: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] | 
 | 64: | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         b.gt            64b | 
 |         ret | 
 |  | 
 | L(ipred_v_tbl): | 
 |         .hword L(ipred_v_tbl) - 640b | 
 |         .hword L(ipred_v_tbl) - 320b | 
 |         .hword L(ipred_v_tbl) - 160b | 
 |         .hword L(ipred_v_tbl) -  80b | 
 |         .hword L(ipred_v_tbl) -  40b | 
 | endfunc | 
 |  | 
 | // void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                        const pixel *const topleft, | 
 | //                        const int width, const int height, const int a, | 
 | //                        const int max_width, const int max_height); | 
 | function ipred_h_8bpc_neon, export=1 | 
 |         clz             w3,  w3 | 
 |         adr             x5,  L(ipred_h_tbl) | 
 |         sub             w3,  w3,  #25 | 
 |         ldrh            w3,  [x5, w3, uxtw #1] | 
 |         sub             x2,  x2,  #4 | 
 |         sub             x5,  x5,  w3, uxtw | 
 |         mov             x7,  #-4 | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 | 4: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 | 
 |         st1             {v3.s}[0],  [x0], x1 | 
 |         st1             {v2.s}[0],  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v1.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         b.gt            4b | 
 |         ret | 
 | 8: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 | 
 |         st1             {v3.8b},  [x0], x1 | 
 |         st1             {v2.8b},  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v1.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         b.gt            8b | 
 |         ret | 
 | 16: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7 | 
 |         st1             {v3.16b}, [x0], x1 | 
 |         st1             {v2.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v1.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         b.gt            16b | 
 |         ret | 
 | 32: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7 | 
 |         str             q3,  [x0, #16] | 
 |         str             q2,  [x6, #16] | 
 |         st1             {v3.16b}, [x0], x1 | 
 |         st1             {v2.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         str             q1,  [x0, #16] | 
 |         str             q0,  [x6, #16] | 
 |         st1             {v1.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         b.gt            32b | 
 |         ret | 
 | 64: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7 | 
 |         str             q3,  [x0, #16] | 
 |         str             q2,  [x6, #16] | 
 |         stp             q3,  q3,  [x0, #32] | 
 |         stp             q2,  q2,  [x6, #32] | 
 |         st1             {v3.16b}, [x0], x1 | 
 |         st1             {v2.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         str             q1,  [x0, #16] | 
 |         str             q0,  [x6, #16] | 
 |         stp             q1,  q1,  [x0, #32] | 
 |         stp             q0,  q0,  [x6, #32] | 
 |         st1             {v1.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         b.gt            64b | 
 |         ret | 
 |  | 
 | L(ipred_h_tbl): | 
 |         .hword L(ipred_h_tbl) - 64b | 
 |         .hword L(ipred_h_tbl) - 32b | 
 |         .hword L(ipred_h_tbl) - 16b | 
 |         .hword L(ipred_h_tbl) -  8b | 
 |         .hword L(ipred_h_tbl) -  4b | 
 | endfunc | 
 |  | 
 | // void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                             const pixel *const topleft, | 
 | //                             const int width, const int height, const int a, | 
 | //                             const int max_width, const int max_height); | 
 | function ipred_dc_top_8bpc_neon, export=1 | 
 |         clz             w3,  w3 | 
 |         adr             x5,  L(ipred_dc_top_tbl) | 
 |         sub             w3,  w3,  #25 | 
 |         ldrh            w3,  [x5, w3, uxtw #1] | 
 |         add             x2,  x2,  #1 | 
 |         sub             x5,  x5,  w3, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 | 40: | 
 |         AARCH64_VALID_JUMP_TARGET | 
        ld1r            {v0.2s},  [x2]            // load the 4 top pixels twice
        uaddlv          h0,      v0.8b            // 2*sum
        rshrn           v0.8b,   v0.8h,   #3      // (2*sum + 4) >> 3 == (sum + 2) >> 2
        dup             v0.8b,   v0.b[0]
 | 4: | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         b.gt            4b | 
 |         ret | 
 | 80: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.8b},  [x2] | 
 |         uaddlv          h0,      v0.8b | 
 |         rshrn           v0.8b,   v0.8h,   #3 | 
 |         dup             v0.8b,   v0.b[0] | 
 | 8: | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         b.gt            8b | 
 |         ret | 
 | 160: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b}, [x2] | 
 |         uaddlv          h0,      v0.16b | 
 |         rshrn           v0.8b,   v0.8h,   #4 | 
 |         dup             v0.16b,  v0.b[0] | 
 | 16: | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         b.gt            16b | 
 |         ret | 
 | 320: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b, v1.16b}, [x2] | 
 |         uaddlv          h0,      v0.16b | 
 |         uaddlv          h1,      v1.16b | 
 |         add             v2.4h,   v0.4h,   v1.4h | 
 |         rshrn           v2.8b,   v2.8h,   #5 | 
 |         dup             v0.16b,  v2.b[0] | 
 |         dup             v1.16b,  v2.b[0] | 
 | 32: | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         b.gt            32b | 
 |         ret | 
 | 640: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] | 
 |         uaddlv          h0,      v0.16b | 
 |         uaddlv          h1,      v1.16b | 
 |         uaddlv          h2,      v2.16b | 
 |         uaddlv          h3,      v3.16b | 
 |         add             v4.4h,   v0.4h,   v1.4h | 
 |         add             v5.4h,   v2.4h,   v3.4h | 
 |         add             v4.4h,   v4.4h,   v5.4h | 
 |         rshrn           v4.8b,   v4.8h,   #6 | 
 |         dup             v0.16b,  v4.b[0] | 
 |         dup             v1.16b,  v4.b[0] | 
 |         dup             v2.16b,  v4.b[0] | 
 |         dup             v3.16b,  v4.b[0] | 
 | 64: | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         b.gt            64b | 
 |         ret | 
 |  | 
 | L(ipred_dc_top_tbl): | 
 |         .hword L(ipred_dc_top_tbl) - 640b | 
 |         .hword L(ipred_dc_top_tbl) - 320b | 
 |         .hword L(ipred_dc_top_tbl) - 160b | 
 |         .hword L(ipred_dc_top_tbl) -  80b | 
 |         .hword L(ipred_dc_top_tbl) -  40b | 
 | endfunc | 
 |  | 
 | // void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                              const pixel *const topleft, | 
 | //                              const int width, const int height, const int a, | 
 | //                              const int max_width, const int max_height); | 
 | function ipred_dc_left_8bpc_neon, export=1 | 
 |         sub             x2,  x2,  w4, uxtw | 
 |         clz             w3,  w3 | 
 |         clz             w7,  w4 | 
 |         adr             x5,  L(ipred_dc_left_tbl) | 
        sub             w3,  w3,  #20 // 25 leading bits, minus 5 to skip the height entries
        sub             w7,  w7,  #25 // the height entries come first in the table
 |         ldrh            w3,  [x5, w3, uxtw #1] | 
 |         ldrh            w7,  [x5, w7, uxtw #1] | 
 |         sub             x3,  x5,  w3, uxtw | 
 |         sub             x5,  x5,  w7, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 |  | 
 | L(ipred_dc_left_h4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
        ld1r            {v0.2s},  [x2]            // load the 4 left pixels twice
        uaddlv          h0,      v0.8b            // 2*sum
        rshrn           v0.8b,   v0.8h,   #3      // (2*sum + 4) >> 3 == (sum + 2) >> 2
 |         dup             v0.16b,  v0.b[0] | 
 |         br              x3 | 
 | L(ipred_dc_left_w4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         b.gt            L(ipred_dc_left_w4) | 
 |         ret | 
 |  | 
 | L(ipred_dc_left_h8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.8b},  [x2] | 
 |         uaddlv          h0,      v0.8b | 
 |         rshrn           v0.8b,   v0.8h,   #3 | 
 |         dup             v0.16b,  v0.b[0] | 
 |         br              x3 | 
 | L(ipred_dc_left_w8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         b.gt            L(ipred_dc_left_w8) | 
 |         ret | 
 |  | 
 | L(ipred_dc_left_h16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b}, [x2] | 
 |         uaddlv          h0,      v0.16b | 
 |         rshrn           v0.8b,   v0.8h,   #4 | 
 |         dup             v0.16b,  v0.b[0] | 
 |         br              x3 | 
 | L(ipred_dc_left_w16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         b.gt            L(ipred_dc_left_w16) | 
 |         ret | 
 |  | 
 | L(ipred_dc_left_h32): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b, v1.16b}, [x2] | 
 |         uaddlv          h0,      v0.16b | 
 |         uaddlv          h1,      v1.16b | 
 |         add             v0.4h,   v0.4h,   v1.4h | 
 |         rshrn           v0.8b,   v0.8h,   #5 | 
 |         dup             v0.16b,  v0.b[0] | 
 |         br              x3 | 
 | L(ipred_dc_left_w32): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         mov             v1.16b,  v0.16b | 
 | 1: | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         b.gt            1b | 
 |         ret | 
 |  | 
 | L(ipred_dc_left_h64): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] | 
 |         uaddlv          h0,      v0.16b | 
 |         uaddlv          h1,      v1.16b | 
 |         uaddlv          h2,      v2.16b | 
 |         uaddlv          h3,      v3.16b | 
 |         add             v0.4h,   v0.4h,   v1.4h | 
 |         add             v2.4h,   v2.4h,   v3.4h | 
 |         add             v0.4h,   v0.4h,   v2.4h | 
 |         rshrn           v0.8b,   v0.8h,   #6 | 
 |         dup             v0.16b,  v0.b[0] | 
 |         br              x3 | 
 | L(ipred_dc_left_w64): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         mov             v1.16b,  v0.16b | 
 |         mov             v2.16b,  v0.16b | 
 |         mov             v3.16b,  v0.16b | 
 | 1: | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         b.gt            1b | 
 |         ret | 
 |  | 
 | L(ipred_dc_left_tbl): | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) | 
 |         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) | 
 | endfunc | 
 |  | 
 | // void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                         const pixel *const topleft, | 
 | //                         const int width, const int height, const int a, | 
 | //                         const int max_width, const int max_height); | 
 | function ipred_dc_8bpc_neon, export=1 | 
 |         sub             x2,  x2,  w4, uxtw | 
 |         add             w7,  w3,  w4             // width + height | 
 |         clz             w3,  w3 | 
 |         clz             w6,  w4 | 
 |         dup             v16.8h, w7               // width + height | 
 |         adr             x5,  L(ipred_dc_tbl) | 
 |         rbit            w7,  w7                  // rbit(width + height) | 
        sub             w3,  w3,  #20            // 25 leading bits, minus 5 to skip the height entries
        sub             w6,  w6,  #25
 |         clz             w7,  w7                  // ctz(width + height) | 
 |         ldrh            w3,  [x5, w3, uxtw #1] | 
 |         ldrh            w6,  [x5, w6, uxtw #1] | 
 |         neg             w7,  w7                  // -ctz(width + height) | 
 |         sub             x3,  x5,  w3, uxtw | 
 |         sub             x5,  x5,  w6, uxtw | 
 |         ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1 | 
 |         dup             v17.8h,  w7              // -ctz(width + height) | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 |  | 
 | L(ipred_dc_h4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
        ld1             {v0.s}[0],  [x2], #4
        ins             v0.s[1], wzr              // zero the upper half so uaddlv sums only 4 pixels
        uaddlv          h0,      v0.8b
        add             x2,  x2,  #1              // skip over topleft to the top row
 |         br              x3 | 
 | L(ipred_dc_w4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v1.s}[0],  [x2] | 
 |         ins             v1.s[1], wzr | 
 |         add             v0.4h,   v0.4h,   v16.4h | 
 |         uaddlv          h1,      v1.8b | 
 |         cmp             w4,  #4 | 
 |         add             v0.4h,   v0.4h,   v1.4h | 
 |         ushl            v0.4h,   v0.4h,   v17.4h | 
 |         b.eq            1f | 
 |         // h = 8/16 | 
        mov             w16, #(0x3334/2)          // ~1/5, for h=16 (w+h = 20)
        movk            w16, #(0x5556/2), lsl #16 // ~1/3, for h=8  (w+h = 12)
        add             w17, w4,  w4              // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17             // 32-bit shifts are mod 32; picks the matching half
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h  // (x * 2 * c) >> 16
 | 1: | 
 |         dup             v0.8b,   v0.b[0] | 
 | 2: | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.s}[0],  [x0], x1 | 
 |         st1             {v0.s}[0],  [x6], x1 | 
 |         b.gt            2b | 
 |         ret | 
 |  | 
 | L(ipred_dc_h8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.8b},  [x2], #8 | 
 |         uaddlv          h0,      v0.8b | 
 |         add             x2,  x2,  #1 | 
 |         br              x3 | 
 | L(ipred_dc_w8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v1.8b},  [x2] | 
 |         add             v0.4h,   v0.4h,   v16.4h | 
 |         uaddlv          h1,      v1.8b | 
 |         cmp             w4,  #8 | 
 |         add             v0.4h,   v0.4h,   v1.4h | 
 |         ushl            v0.4h,   v0.4h,   v17.4h | 
 |         b.eq            1f | 
 |         // h = 4/16/32 | 
 |         cmp             w4,  #32 | 
 |         mov             w16, #(0x3334/2) | 
 |         mov             w17, #(0x5556/2) | 
 |         csel            w16, w16, w17, eq | 
 |         dup             v16.4h,  w16 | 
 |         sqdmulh         v0.4h,   v0.4h,   v16.4h | 
 | 1: | 
 |         dup             v0.8b,   v0.b[0] | 
 | 2: | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.8b},  [x0], x1 | 
 |         st1             {v0.8b},  [x6], x1 | 
 |         b.gt            2b | 
 |         ret | 
 |  | 
 | L(ipred_dc_h16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b}, [x2], #16 | 
 |         uaddlv          h0,      v0.16b | 
 |         add             x2,  x2,  #1 | 
 |         br              x3 | 
 | L(ipred_dc_w16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v1.16b}, [x2] | 
 |         add             v0.4h,   v0.4h,   v16.4h | 
 |         uaddlv          h1,      v1.16b | 
 |         cmp             w4,  #16 | 
 |         add             v0.4h,   v0.4h,   v1.4h | 
 |         ushl            v0.4h,   v0.4h,   v17.4h | 
 |         b.eq            1f | 
 |         // h = 4/8/32/64 | 
        tst             w4,  #(32+16+8)          // h == 8 or 32 needs ~1/3; 16 only makes the mask a contiguous (encodable) run
 |         mov             w16, #(0x3334/2) | 
 |         mov             w17, #(0x5556/2) | 
 |         csel            w16, w16, w17, eq | 
 |         dup             v16.4h,  w16 | 
 |         sqdmulh         v0.4h,   v0.4h,   v16.4h | 
 | 1: | 
 |         dup             v0.16b,  v0.b[0] | 
 | 2: | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b}, [x0], x1 | 
 |         st1             {v0.16b}, [x6], x1 | 
 |         b.gt            2b | 
 |         ret | 
 |  | 
 | L(ipred_dc_h32): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b, v1.16b}, [x2], #32 | 
 |         uaddlv          h0,      v0.16b | 
 |         uaddlv          h1,      v1.16b | 
 |         add             x2,  x2,  #1 | 
 |         add             v0.4h,   v0.4h,   v1.4h | 
 |         br              x3 | 
 | L(ipred_dc_w32): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v1.16b, v2.16b}, [x2] | 
 |         add             v0.4h,   v0.4h,   v16.4h | 
 |         uaddlv          h1,      v1.16b | 
 |         uaddlv          h2,      v2.16b | 
 |         cmp             w4,  #32 | 
 |         add             v0.4h,   v0.4h,   v1.4h | 
 |         add             v0.4h,   v0.4h,   v2.4h | 
 |         ushl            v4.4h,   v0.4h,   v17.4h | 
 |         b.eq            1f | 
 |         // h = 8/16/64 | 
 |         cmp             w4,  #8 | 
 |         mov             w16, #(0x3334/2) | 
 |         mov             w17, #(0x5556/2) | 
 |         csel            w16, w16, w17, eq | 
 |         dup             v16.4h,  w16 | 
 |         sqdmulh         v4.4h,   v4.4h,   v16.4h | 
 | 1: | 
 |         dup             v0.16b,  v4.b[0] | 
 |         dup             v1.16b,  v4.b[0] | 
 | 2: | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b}, [x6], x1 | 
 |         b.gt            2b | 
 |         ret | 
 |  | 
 | L(ipred_dc_h64): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 | 
 |         uaddlv          h0,      v0.16b | 
 |         uaddlv          h1,      v1.16b | 
 |         uaddlv          h2,      v2.16b | 
 |         uaddlv          h3,      v3.16b | 
 |         add             v0.4h,   v0.4h,   v1.4h | 
 |         add             v2.4h,   v2.4h,   v3.4h | 
 |         add             x2,  x2,  #1 | 
 |         add             v0.4h,   v0.4h,   v2.4h | 
 |         br              x3 | 
 | L(ipred_dc_w64): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] | 
 |         add             v0.4h,   v0.4h,   v16.4h | 
 |         uaddlv          h1,      v1.16b | 
 |         uaddlv          h2,      v2.16b | 
 |         uaddlv          h3,      v3.16b | 
 |         uaddlv          h4,      v4.16b | 
 |         add             v1.4h,   v1.4h,   v2.4h | 
 |         add             v3.4h,   v3.4h,   v4.4h | 
 |         cmp             w4,  #64 | 
 |         add             v0.4h,   v0.4h,   v1.4h | 
 |         add             v0.4h,   v0.4h,   v3.4h | 
 |         ushl            v4.4h,   v0.4h,   v17.4h | 
 |         b.eq            1f | 
 |         // h = 16/32 | 
        mov             w16, #(0x5556/2)          // ~1/3, for h=32 (w+h = 96)
        movk            w16, #(0x3334/2), lsl #16 // ~1/5, for h=16 (w+h = 80)
        lsr             w16, w16, w4              // h=32 shifts by 0 (mod 32), h=16 takes the high half
 |         dup             v16.4h,  w16 | 
 |         sqdmulh         v4.4h,   v4.4h,   v16.4h | 
 | 1: | 
 |         dup             v0.16b,  v4.b[0] | 
 |         dup             v1.16b,  v4.b[0] | 
 |         dup             v2.16b,  v4.b[0] | 
 |         dup             v3.16b,  v4.b[0] | 
 | 2: | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 | 
 |         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 | 
 |         b.gt            2b | 
 |         ret | 
 |  | 
 | L(ipred_dc_tbl): | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_h64) | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_h32) | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_h16) | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_h8) | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_h4) | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_w64) | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_w32) | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_w16) | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_w8) | 
 |         .hword L(ipred_dc_tbl) - L(ipred_dc_w4) | 
 | endfunc | 
 |  | 
 | // void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                            const pixel *const topleft, | 
 | //                            const int width, const int height, const int a, | 
 | //                            const int max_width, const int max_height); | 
 | function ipred_paeth_8bpc_neon, export=1 | 
 |         clz             w9,  w3 | 
 |         adr             x5,  L(ipred_paeth_tbl) | 
 |         sub             w9,  w9,  #25 | 
 |         ldrh            w9,  [x5, w9, uxtw #1] | 
 |         ld1r            {v4.16b},  [x2] | 
 |         add             x8,  x2,  #1 | 
 |         sub             x2,  x2,  #4 | 
 |         sub             x5,  x5,  w9, uxtw | 
 |         mov             x7,  #-4 | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 | 40: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1r            {v5.4s},  [x8] | 
 |         usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft | 
 | 4: | 
 |         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 | 
 |         zip1            v0.2s,   v0.2s,   v1.2s | 
 |         zip1            v2.2s,   v2.2s,   v3.2s | 
 |         uaddw           v16.8h,  v6.8h,   v0.8b | 
 |         uaddw           v17.8h,  v6.8h,   v2.8b | 
 |         sqxtun          v16.8b,  v16.8h           // base | 
 |         sqxtun2         v16.16b, v17.8h | 
 |         zip1            v0.2d,   v0.2d,   v2.2d | 
 |         uabd            v20.16b, v5.16b,  v16.16b // tdiff | 
 |         uabd            v22.16b, v4.16b,  v16.16b // tldiff | 
 |         uabd            v16.16b, v0.16b,  v16.16b // ldiff | 
 |         umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff) | 
 |         cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff | 
 |         cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff | 
 |         bsl             v20.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft | 
 |         bit             v20.16b, v0.16b,  v16.16b // ldiff <= min ? left : ... | 
 |         st1             {v20.s}[3], [x0], x1 | 
 |         st1             {v20.s}[2], [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v20.s}[1], [x0], x1 | 
 |         st1             {v20.s}[0], [x6], x1 | 
 |         b.gt            4b | 
 |         ret | 
 | 80: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1r            {v5.2d},  [x8] | 
 |         usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft | 
 | 8: | 
 |         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 | 
 |         uaddw           v16.8h,  v6.8h,   v0.8b | 
 |         uaddw           v17.8h,  v6.8h,   v1.8b | 
 |         uaddw           v18.8h,  v6.8h,   v2.8b | 
 |         uaddw           v19.8h,  v6.8h,   v3.8b | 
 |         sqxtun          v16.8b,  v16.8h           // base | 
 |         sqxtun2         v16.16b, v17.8h | 
 |         sqxtun          v18.8b,  v18.8h | 
 |         sqxtun2         v18.16b, v19.8h | 
 |         zip1            v2.2d,   v2.2d,   v3.2d | 
 |         zip1            v0.2d,   v0.2d,   v1.2d | 
 |         uabd            v21.16b, v5.16b,  v18.16b // tdiff | 
 |         uabd            v20.16b, v5.16b,  v16.16b | 
 |         uabd            v23.16b, v4.16b,  v18.16b // tldiff | 
 |         uabd            v22.16b, v4.16b,  v16.16b | 
 |         uabd            v17.16b, v2.16b,  v18.16b // ldiff | 
 |         uabd            v16.16b, v0.16b,  v16.16b | 
 |         umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff) | 
 |         umin            v18.16b, v20.16b, v22.16b | 
 |         cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff | 
 |         cmhs            v20.16b, v22.16b, v20.16b | 
 |         cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff | 
 |         cmhs            v16.16b, v18.16b, v16.16b | 
 |         bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft | 
 |         bsl             v20.16b, v5.16b,  v4.16b | 
 |         bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ... | 
 |         bit             v20.16b, v0.16b,  v16.16b | 
 |         st1             {v21.d}[1], [x0], x1 | 
 |         st1             {v21.d}[0], [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v20.d}[1], [x0], x1 | 
 |         st1             {v20.d}[0], [x6], x1 | 
 |         b.gt            8b | 
 |         ret | 
 | 160: | 
 | 320: | 
 | 640: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v5.16b},  [x8], #16 | 
 |         mov             w9,  w3 | 
 |         // Set up pointers for four rows in parallel; x0, x6, x5, x10 | 
 |         add             x5,  x0,  x1 | 
 |         add             x10, x6,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         sub             x1,  x1,  w3, uxtw | 
 | 1: | 
 |         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7 | 
 | 2: | 
 |         usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft | 
 |         usubl2          v7.8h,   v5.16b,  v4.16b | 
 |         uaddw           v24.8h,  v6.8h,   v0.8b | 
 |         uaddw           v25.8h,  v7.8h,   v0.8b | 
 |         uaddw           v26.8h,  v6.8h,   v1.8b | 
 |         uaddw           v27.8h,  v7.8h,   v1.8b | 
 |         uaddw           v28.8h,  v6.8h,   v2.8b | 
 |         uaddw           v29.8h,  v7.8h,   v2.8b | 
 |         uaddw           v30.8h,  v6.8h,   v3.8b | 
 |         uaddw           v31.8h,  v7.8h,   v3.8b | 
 |         sqxtun          v17.8b,  v26.8h           // base | 
 |         sqxtun2         v17.16b, v27.8h | 
 |         sqxtun          v16.8b,  v24.8h | 
 |         sqxtun2         v16.16b, v25.8h | 
 |         sqxtun          v19.8b,  v30.8h | 
 |         sqxtun2         v19.16b, v31.8h | 
 |         sqxtun          v18.8b,  v28.8h | 
 |         sqxtun2         v18.16b, v29.8h | 
 |         uabd            v23.16b, v5.16b,  v19.16b // tdiff | 
 |         uabd            v22.16b, v5.16b,  v18.16b | 
 |         uabd            v21.16b, v5.16b,  v17.16b | 
 |         uabd            v20.16b, v5.16b,  v16.16b | 
 |         uabd            v27.16b, v4.16b,  v19.16b // tldiff | 
 |         uabd            v26.16b, v4.16b,  v18.16b | 
 |         uabd            v25.16b, v4.16b,  v17.16b | 
 |         uabd            v24.16b, v4.16b,  v16.16b | 
 |         uabd            v19.16b, v3.16b,  v19.16b // ldiff | 
 |         uabd            v18.16b, v2.16b,  v18.16b | 
 |         uabd            v17.16b, v1.16b,  v17.16b | 
 |         uabd            v16.16b, v0.16b,  v16.16b | 
 |         umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff) | 
 |         umin            v30.16b, v22.16b, v26.16b | 
 |         umin            v29.16b, v21.16b, v25.16b | 
 |         umin            v28.16b, v20.16b, v24.16b | 
 |         cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff | 
 |         cmhs            v22.16b, v26.16b, v22.16b | 
 |         cmhs            v21.16b, v25.16b, v21.16b | 
 |         cmhs            v20.16b, v24.16b, v20.16b | 
 |         cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff | 
 |         cmhs            v18.16b, v30.16b, v18.16b | 
 |         cmhs            v17.16b, v29.16b, v17.16b | 
 |         cmhs            v16.16b, v28.16b, v16.16b | 
 |         bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft | 
 |         bsl             v22.16b, v5.16b,  v4.16b | 
 |         bsl             v21.16b, v5.16b,  v4.16b | 
 |         bsl             v20.16b, v5.16b,  v4.16b | 
 |         bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ... | 
 |         bit             v22.16b, v2.16b,  v18.16b | 
 |         bit             v21.16b, v1.16b,  v17.16b | 
 |         bit             v20.16b, v0.16b,  v16.16b | 
 |         subs            w3,  w3,  #16 | 
 |         st1             {v23.16b}, [x0],  #16 | 
 |         st1             {v22.16b}, [x6],  #16 | 
 |         st1             {v21.16b}, [x5],  #16 | 
 |         st1             {v20.16b}, [x10], #16 | 
 |         b.le            8f | 
 |         ld1             {v5.16b},  [x8], #16 | 
 |         b               2b | 
 | 8: | 
 |         subs            w4,  w4,  #4 | 
 |         b.le            9f | 
 |         // End of horizontal loop, move pointers to next four rows | 
 |         sub             x8,  x8,  w9, uxtw | 
 |         add             x0,  x0,  x1 | 
 |         add             x6,  x6,  x1 | 
 |         // Load the top row as early as possible | 
 |         ld1             {v5.16b},  [x8], #16 | 
 |         add             x5,  x5,  x1 | 
 |         add             x10, x10, x1 | 
 |         mov             w3,  w9 | 
 |         b               1b | 
 | 9: | 
 |         ret | 
 |  | 
 | L(ipred_paeth_tbl): | 
 |         .hword L(ipred_paeth_tbl) - 640b | 
 |         .hword L(ipred_paeth_tbl) - 320b | 
 |         .hword L(ipred_paeth_tbl) - 160b | 
 |         .hword L(ipred_paeth_tbl) -  80b | 
 |         .hword L(ipred_paeth_tbl) -  40b | 
 | endfunc | 
 |  | 
 | // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                             const pixel *const topleft, | 
 | //                             const int width, const int height, const int a, | 
 | //                             const int max_width, const int max_height); | 
 | function ipred_smooth_8bpc_neon, export=1 | 
 |         movrel          x10, X(sm_weights) | 
 |         add             x11, x10, w4, uxtw | 
 |         add             x10, x10, w3, uxtw | 
 |         clz             w9,  w3 | 
 |         adr             x5,  L(ipred_smooth_tbl) | 
 |         sub             x12, x2,  w4, uxtw | 
 |         sub             w9,  w9,  #25 | 
 |         ldrh            w9,  [x5, w9, uxtw #1] | 
 |         ld1r            {v4.16b},  [x12] // bottom | 
 |         add             x8,  x2,  #1 | 
 |         sub             x5,  x5,  w9, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 | 40: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1r            {v6.2s}, [x8]             // top | 
 |         ld1r            {v7.2s}, [x10]            // weights_hor | 
 |         sub             x2,  x2,  #4 | 
 |         mov             x7,  #-4 | 
 |         dup             v5.16b,  v6.b[3]          // right | 
 |         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom | 
 |         uxtl            v7.8h,   v7.8b            // weights_hor | 
 | 4: | 
 |         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left | 
 |         ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver | 
 |         shll            v20.8h,  v5.8b,   #8      // right*256 | 
 |         shll            v21.8h,  v5.8b,   #8 | 
 |         zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped | 
 |         zip1            v0.2s,   v3.2s,   v2.2s | 
 |         zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver | 
 |         zip1            v18.2s,  v18.2s,  v19.2s | 
 |         shll            v22.8h,  v4.8b,   #8      // bottom*256 | 
 |         shll            v23.8h,  v4.8b,   #8 | 
 |         usubl           v0.8h,   v0.8b,   v5.8b   // left-right | 
 |         usubl           v1.8h,   v1.8b,   v5.8b | 
 |         uxtl            v16.8h,  v16.8b           // weights_ver | 
 |         uxtl            v18.8h,  v18.8b | 
 |         mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor | 
 |         mla             v21.8h,  v1.8h,   v7.8h | 
 |         mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver | 
 |         mla             v23.8h,  v6.8h,   v18.8h | 
 |         uhadd           v20.8h,  v20.8h,  v22.8h | 
 |         uhadd           v21.8h,  v21.8h,  v23.8h | 
 |         rshrn           v20.8b,  v20.8h,  #8 | 
 |         rshrn           v21.8b,  v21.8h,  #8 | 
 |         st1             {v20.s}[0], [x0], x1 | 
 |         st1             {v20.s}[1], [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v21.s}[0], [x0], x1 | 
 |         st1             {v21.s}[1], [x6], x1 | 
 |         b.gt            4b | 
 |         ret | 
 | 80: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v6.8b}, [x8]             // top | 
 |         ld1             {v7.8b}, [x10]            // weights_hor | 
 |         sub             x2,  x2,  #4 | 
 |         mov             x7,  #-4 | 
 |         dup             v5.16b,  v6.b[7]          // right | 
 |         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom | 
 |         uxtl            v7.8h,   v7.8b            // weights_hor | 
 | 8: | 
 |         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left | 
 |         ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver | 
 |         shll            v20.8h,  v5.8b,   #8      // right*256 | 
 |         shll            v21.8h,  v5.8b,   #8 | 
 |         shll            v22.8h,  v5.8b,   #8 | 
 |         shll            v23.8h,  v5.8b,   #8 | 
 |         usubl           v0.8h,   v0.8b,   v5.8b   // left-right | 
 |         usubl           v1.8h,   v1.8b,   v5.8b | 
 |         usubl           v2.8h,   v2.8b,   v5.8b | 
 |         usubl           v3.8h,   v3.8b,   v5.8b | 
 |         shll            v24.8h,  v4.8b,   #8      // bottom*256 | 
 |         shll            v25.8h,  v4.8b,   #8 | 
 |         shll            v26.8h,  v4.8b,   #8 | 
 |         shll            v27.8h,  v4.8b,   #8 | 
 |         uxtl            v16.8h,  v16.8b           // weights_ver | 
 |         uxtl            v17.8h,  v17.8b | 
 |         uxtl            v18.8h,  v18.8b | 
 |         uxtl            v19.8h,  v19.8b | 
 |         mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor | 
 |         mla             v21.8h,  v2.8h,   v7.8h   // (left flipped) | 
 |         mla             v22.8h,  v1.8h,   v7.8h | 
 |         mla             v23.8h,  v0.8h,   v7.8h | 
 |         mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver | 
 |         mla             v25.8h,  v6.8h,   v17.8h | 
 |         mla             v26.8h,  v6.8h,   v18.8h | 
 |         mla             v27.8h,  v6.8h,   v19.8h | 
 |         uhadd           v20.8h,  v20.8h,  v24.8h | 
 |         uhadd           v21.8h,  v21.8h,  v25.8h | 
 |         uhadd           v22.8h,  v22.8h,  v26.8h | 
 |         uhadd           v23.8h,  v23.8h,  v27.8h | 
 |         rshrn           v20.8b,  v20.8h,  #8 | 
 |         rshrn           v21.8b,  v21.8h,  #8 | 
 |         rshrn           v22.8b,  v22.8h,  #8 | 
 |         rshrn           v23.8b,  v23.8h,  #8 | 
 |         st1             {v20.8b}, [x0], x1 | 
 |         st1             {v21.8b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v22.8b}, [x0], x1 | 
 |         st1             {v23.8b}, [x6], x1 | 
 |         b.gt            8b | 
 |         ret | 
 | 160: | 
 | 320: | 
 | 640: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         add             x12, x2,  w3, uxtw | 
 |         sub             x2,  x2,  #2 | 
 |         mov             x7,  #-2 | 
 |         ld1r            {v5.16b}, [x12]           // right | 
 |         sub             x1,  x1,  w3, uxtw | 
 |         mov             w9,  w3 | 
 |  | 
 | 1: | 
 |         ld2r            {v0.8b, v1.8b},   [x2],  x7 // left | 
 |         ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver | 
 |         usubl           v0.8h,   v0.8b,   v5.8b   // left-right | 
 |         usubl           v1.8h,   v1.8b,   v5.8b | 
 |         uxtl            v16.8h,  v16.8b           // weights_ver | 
 |         uxtl            v17.8h,  v17.8b | 
 | 2: | 
 |         ld1             {v7.16b}, [x10],  #16     // weights_hor | 
 |         ld1             {v3.16b}, [x8],   #16     // top | 
 |         shll            v20.8h,  v5.8b,   #8      // right*256 | 
 |         shll            v21.8h,  v5.8b,   #8 | 
 |         shll            v22.8h,  v5.8b,   #8 | 
 |         shll            v23.8h,  v5.8b,   #8 | 
 |         uxtl            v6.8h,   v7.8b            // weights_hor | 
 |         uxtl2           v7.8h,   v7.16b | 
 |         usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom | 
 |         usubl2          v3.8h,   v3.16b,  v4.16b | 
 |         mla             v20.8h,  v1.8h,   v6.8h   // right*256  + (left-right)*weights_hor | 
 |         mla             v21.8h,  v1.8h,   v7.8h   // (left flipped) | 
 |         mla             v22.8h,  v0.8h,   v6.8h | 
 |         mla             v23.8h,  v0.8h,   v7.8h | 
 |         shll            v24.8h,  v4.8b,   #8      // bottom*256 | 
 |         shll            v25.8h,  v4.8b,   #8 | 
 |         shll            v26.8h,  v4.8b,   #8 | 
 |         shll            v27.8h,  v4.8b,   #8 | 
 |         mla             v24.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver | 
 |         mla             v25.8h,  v3.8h,   v16.8h | 
 |         mla             v26.8h,  v2.8h,   v17.8h | 
 |         mla             v27.8h,  v3.8h,   v17.8h | 
 |         uhadd           v20.8h,  v20.8h,  v24.8h | 
 |         uhadd           v21.8h,  v21.8h,  v25.8h | 
 |         uhadd           v22.8h,  v22.8h,  v26.8h | 
 |         uhadd           v23.8h,  v23.8h,  v27.8h | 
 |         rshrn           v20.8b,  v20.8h,  #8 | 
 |         rshrn2          v20.16b, v21.8h,  #8 | 
 |         rshrn           v22.8b,  v22.8h,  #8 | 
 |         rshrn2          v22.16b, v23.8h,  #8 | 
 |         subs            w3,  w3,  #16 | 
 |         st1             {v20.16b}, [x0],  #16 | 
 |         st1             {v22.16b}, [x6],  #16 | 
 |         b.gt            2b | 
 |         subs            w4,  w4,  #2 | 
 |         b.le            9f | 
 |         sub             x8,  x8,  w9, uxtw | 
 |         sub             x10, x10, w9, uxtw | 
 |         add             x0,  x0,  x1 | 
 |         add             x6,  x6,  x1 | 
 |         mov             w3,  w9 | 
 |         b               1b | 
 | 9: | 
 |         ret | 
 |  | 
 | L(ipred_smooth_tbl): | 
 |         .hword L(ipred_smooth_tbl) - 640b | 
 |         .hword L(ipred_smooth_tbl) - 320b | 
 |         .hword L(ipred_smooth_tbl) - 160b | 
 |         .hword L(ipred_smooth_tbl) -  80b | 
 |         .hword L(ipred_smooth_tbl) -  40b | 
 | endfunc | 
 |  | 
 | // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                               const pixel *const topleft, | 
 | //                               const int width, const int height, const int a, | 
 | //                               const int max_width, const int max_height); | 
 | function ipred_smooth_v_8bpc_neon, export=1 | 
 |         movrel          x7,  X(sm_weights) | 
 |         add             x7,  x7,  w4, uxtw | 
 |         clz             w9,  w3 | 
 |         adr             x5,  L(ipred_smooth_v_tbl) | 
 |         sub             x8,  x2,  w4, uxtw | 
 |         sub             w9,  w9,  #25 | 
 |         ldrh            w9,  [x5, w9, uxtw #1] | 
 |         ld1r            {v4.16b},  [x8] // bottom | 
 |         add             x2,  x2,  #1 | 
 |         sub             x5,  x5,  w9, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 | 40: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1r            {v6.2s}, [x2]             // top | 
 |         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom | 
 | 4: | 
 |         ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver | 
 |         shll            v22.8h,  v4.8b,   #8      // bottom*256 | 
 |         shll            v23.8h,  v4.8b,   #8 | 
 |         zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver | 
 |         zip1            v18.2s,  v18.2s,  v19.2s | 
 |         uxtl            v16.8h,  v16.8b           // weights_ver | 
 |         uxtl            v18.8h,  v18.8b | 
 |         mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver | 
 |         mla             v23.8h,  v6.8h,   v18.8h | 
 |         rshrn           v22.8b,  v22.8h,  #8 | 
 |         rshrn           v23.8b,  v23.8h,  #8 | 
 |         st1             {v22.s}[0], [x0], x1 | 
 |         st1             {v22.s}[1], [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v23.s}[0], [x0], x1 | 
 |         st1             {v23.s}[1], [x6], x1 | 
 |         b.gt            4b | 
 |         ret | 
 | 80: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v6.8b}, [x2]             // top | 
 |         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom | 
 | 8: | 
 |         ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver | 
 |         shll            v24.8h,  v4.8b,   #8      // bottom*256 | 
 |         shll            v25.8h,  v4.8b,   #8 | 
 |         shll            v26.8h,  v4.8b,   #8 | 
 |         shll            v27.8h,  v4.8b,   #8 | 
 |         uxtl            v16.8h,  v16.8b           // weights_ver | 
 |         uxtl            v17.8h,  v17.8b | 
 |         uxtl            v18.8h,  v18.8b | 
 |         uxtl            v19.8h,  v19.8b | 
 |         mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver | 
 |         mla             v25.8h,  v6.8h,   v17.8h | 
 |         mla             v26.8h,  v6.8h,   v18.8h | 
 |         mla             v27.8h,  v6.8h,   v19.8h | 
 |         rshrn           v24.8b,  v24.8h,  #8 | 
 |         rshrn           v25.8b,  v25.8h,  #8 | 
 |         rshrn           v26.8b,  v26.8h,  #8 | 
 |         rshrn           v27.8b,  v27.8h,  #8 | 
 |         st1             {v24.8b}, [x0], x1 | 
 |         st1             {v25.8b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v26.8b}, [x0], x1 | 
 |         st1             {v27.8b}, [x6], x1 | 
 |         b.gt            8b | 
 |         ret | 
 | 160: | 
 | 320: | 
 | 640: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         // Set up pointers for four rows in parallel; x0, x6, x5, x8 | 
 |         add             x5,  x0,  x1 | 
 |         add             x8,  x6,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         sub             x1,  x1,  w3, uxtw | 
 |         mov             w9,  w3 | 
 |  | 
 | 1: | 
 |         ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver | 
 |         uxtl            v16.8h,  v16.8b           // weights_ver | 
 |         uxtl            v17.8h,  v17.8b | 
 |         uxtl            v18.8h,  v18.8b | 
 |         uxtl            v19.8h,  v19.8b | 
 | 2: | 
 |         ld1             {v3.16b}, [x2],   #16     // top | 
 |         shll            v20.8h,  v4.8b,   #8      // bottom*256 | 
 |         shll            v21.8h,  v4.8b,   #8 | 
 |         shll            v22.8h,  v4.8b,   #8 | 
 |         shll            v23.8h,  v4.8b,   #8 | 
 |         shll            v24.8h,  v4.8b,   #8 | 
 |         shll            v25.8h,  v4.8b,   #8 | 
 |         shll            v26.8h,  v4.8b,   #8 | 
 |         shll            v27.8h,  v4.8b,   #8 | 
 |         usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom | 
 |         usubl2          v3.8h,   v3.16b,  v4.16b | 
 |         mla             v20.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver | 
 |         mla             v21.8h,  v3.8h,   v16.8h | 
 |         mla             v22.8h,  v2.8h,   v17.8h | 
 |         mla             v23.8h,  v3.8h,   v17.8h | 
 |         mla             v24.8h,  v2.8h,   v18.8h | 
 |         mla             v25.8h,  v3.8h,   v18.8h | 
 |         mla             v26.8h,  v2.8h,   v19.8h | 
 |         mla             v27.8h,  v3.8h,   v19.8h | 
 |         rshrn           v20.8b,  v20.8h,  #8 | 
 |         rshrn2          v20.16b, v21.8h,  #8 | 
 |         rshrn           v22.8b,  v22.8h,  #8 | 
 |         rshrn2          v22.16b, v23.8h,  #8 | 
 |         rshrn           v24.8b,  v24.8h,  #8 | 
 |         rshrn2          v24.16b, v25.8h,  #8 | 
 |         rshrn           v26.8b,  v26.8h,  #8 | 
 |         rshrn2          v26.16b, v27.8h,  #8 | 
 |         subs            w3,  w3,  #16 | 
 |         st1             {v20.16b}, [x0],  #16 | 
 |         st1             {v22.16b}, [x6],  #16 | 
 |         st1             {v24.16b}, [x5],  #16 | 
 |         st1             {v26.16b}, [x8],  #16 | 
 |         b.gt            2b | 
 |         subs            w4,  w4,  #4 | 
 |         b.le            9f | 
 |         sub             x2,  x2,  w9, uxtw | 
 |         add             x0,  x0,  x1 | 
 |         add             x6,  x6,  x1 | 
 |         add             x5,  x5,  x1 | 
 |         add             x8,  x8,  x1 | 
 |         mov             w3,  w9 | 
 |         b               1b | 
 | 9: | 
 |         ret | 
 |  | 
 | L(ipred_smooth_v_tbl): | 
 |         .hword L(ipred_smooth_v_tbl) - 640b | 
 |         .hword L(ipred_smooth_v_tbl) - 320b | 
 |         .hword L(ipred_smooth_v_tbl) - 160b | 
 |         .hword L(ipred_smooth_v_tbl) -  80b | 
 |         .hword L(ipred_smooth_v_tbl) -  40b | 
 | endfunc | 
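
// The smooth_v loops above interpolate each column between the top edge
// and the constant bottom-left pixel. Per output pixel this is roughly
// (a sketch, not the exact dav1d C reference):
//
//   dst[y*stride + x] = (bottom*256
//                        + (top[x] - bottom)*weights_ver[y] + 128) >> 8;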
 |  | 
 | // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                               const pixel *const topleft, | 
 | //                               const int width, const int height, const int a, | 
 | //                               const int max_width, const int max_height); | 
 | function ipred_smooth_h_8bpc_neon, export=1 | 
 |         movrel          x8,  X(sm_weights) | 
 |         add             x8,  x8,  w3, uxtw | 
 |         clz             w9,  w3 | 
 |         adr             x5,  L(ipred_smooth_h_tbl) | 
 |         add             x12, x2,  w3, uxtw | 
 |         sub             w9,  w9,  #25 | 
 |         ldrh            w9,  [x5, w9, uxtw #1] | 
 |         ld1r            {v5.16b},  [x12] // right | 
 |         sub             x5,  x5,  w9, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x5 | 
 | 40: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1r            {v7.2s}, [x8]             // weights_hor | 
 |         sub             x2,  x2,  #4 | 
 |         mov             x7,  #-4 | 
 |         uxtl            v7.8h,   v7.8b            // weights_hor | 
 | 4: | 
 |         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left | 
 |         shll            v20.8h,  v5.8b,   #8      // right*256 | 
 |         shll            v21.8h,  v5.8b,   #8 | 
 |         zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped | 
 |         zip1            v0.2s,   v3.2s,   v2.2s | 
 |         usubl           v0.8h,   v0.8b,   v5.8b   // left-right | 
 |         usubl           v1.8h,   v1.8b,   v5.8b | 
 |         mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor | 
 |         mla             v21.8h,  v1.8h,   v7.8h | 
 |         rshrn           v20.8b,  v20.8h,  #8 | 
 |         rshrn           v21.8b,  v21.8h,  #8 | 
 |         st1             {v20.s}[0], [x0], x1 | 
 |         st1             {v20.s}[1], [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v21.s}[0], [x0], x1 | 
 |         st1             {v21.s}[1], [x6], x1 | 
 |         b.gt            4b | 
 |         ret | 
 | 80: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v7.8b}, [x8]             // weights_hor | 
 |         sub             x2,  x2,  #4 | 
 |         mov             x7,  #-4 | 
 |         uxtl            v7.8h,   v7.8b            // weights_hor | 
 | 8: | 
 |         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left | 
 |         shll            v20.8h,  v5.8b,   #8      // right*256 | 
 |         shll            v21.8h,  v5.8b,   #8 | 
 |         shll            v22.8h,  v5.8b,   #8 | 
 |         shll            v23.8h,  v5.8b,   #8 | 
 |         usubl           v3.8h,   v3.8b,   v5.8b   // left-right | 
 |         usubl           v2.8h,   v2.8b,   v5.8b | 
 |         usubl           v1.8h,   v1.8b,   v5.8b | 
 |         usubl           v0.8h,   v0.8b,   v5.8b | 
 |         mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor | 
 |         mla             v21.8h,  v2.8h,   v7.8h   // (left flipped) | 
 |         mla             v22.8h,  v1.8h,   v7.8h | 
 |         mla             v23.8h,  v0.8h,   v7.8h | 
 |         rshrn           v20.8b,  v20.8h,  #8 | 
 |         rshrn           v21.8b,  v21.8h,  #8 | 
 |         rshrn           v22.8b,  v22.8h,  #8 | 
 |         rshrn           v23.8b,  v23.8h,  #8 | 
 |         st1             {v20.8b}, [x0], x1 | 
 |         st1             {v21.8b}, [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v22.8b}, [x0], x1 | 
 |         st1             {v23.8b}, [x6], x1 | 
 |         b.gt            8b | 
 |         ret | 
 | 160: | 
 | 320: | 
 | 640: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         sub             x2,  x2,  #4 | 
 |         mov             x7,  #-4 | 
 |         // Set up pointers for four rows in parallel; x0, x6, x5, x10 | 
 |         add             x5,  x0,  x1 | 
 |         add             x10, x6,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         sub             x1,  x1,  w3, uxtw | 
 |         mov             w9,  w3 | 
 |  | 
 | 1: | 
 |         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},   [x2],  x7 // left | 
 |         usubl           v0.8h,   v0.8b,   v5.8b   // left-right | 
 |         usubl           v1.8h,   v1.8b,   v5.8b | 
 |         usubl           v2.8h,   v2.8b,   v5.8b | 
 |         usubl           v3.8h,   v3.8b,   v5.8b | 
 | 2: | 
 |         ld1             {v7.16b}, [x8],   #16     // weights_hor | 
 |         shll            v20.8h,  v5.8b,   #8      // right*256 | 
 |         shll            v21.8h,  v5.8b,   #8 | 
 |         shll            v22.8h,  v5.8b,   #8 | 
 |         shll            v23.8h,  v5.8b,   #8 | 
 |         shll            v24.8h,  v5.8b,   #8 | 
 |         shll            v25.8h,  v5.8b,   #8 | 
 |         shll            v26.8h,  v5.8b,   #8 | 
 |         shll            v27.8h,  v5.8b,   #8 | 
 |         uxtl            v6.8h,   v7.8b            // weights_hor | 
 |         uxtl2           v7.8h,   v7.16b | 
 |         mla             v20.8h,  v3.8h,   v6.8h   // right*256  + (left-right)*weights_hor | 
 |         mla             v21.8h,  v3.8h,   v7.8h   // (left flipped) | 
 |         mla             v22.8h,  v2.8h,   v6.8h | 
 |         mla             v23.8h,  v2.8h,   v7.8h | 
 |         mla             v24.8h,  v1.8h,   v6.8h | 
 |         mla             v25.8h,  v1.8h,   v7.8h | 
 |         mla             v26.8h,  v0.8h,   v6.8h | 
 |         mla             v27.8h,  v0.8h,   v7.8h | 
 |         rshrn           v20.8b,  v20.8h,  #8 | 
 |         rshrn2          v20.16b, v21.8h,  #8 | 
 |         rshrn           v22.8b,  v22.8h,  #8 | 
 |         rshrn2          v22.16b, v23.8h,  #8 | 
 |         rshrn           v24.8b,  v24.8h,  #8 | 
 |         rshrn2          v24.16b, v25.8h,  #8 | 
 |         rshrn           v26.8b,  v26.8h,  #8 | 
 |         rshrn2          v26.16b, v27.8h,  #8 | 
 |         subs            w3,  w3,  #16 | 
 |         st1             {v20.16b}, [x0],  #16 | 
 |         st1             {v22.16b}, [x6],  #16 | 
 |         st1             {v24.16b}, [x5],  #16 | 
 |         st1             {v26.16b}, [x10], #16 | 
 |         b.gt            2b | 
 |         subs            w4,  w4,  #4 | 
 |         b.le            9f | 
 |         sub             x8,  x8,  w9, uxtw | 
 |         add             x0,  x0,  x1 | 
 |         add             x6,  x6,  x1 | 
 |         add             x5,  x5,  x1 | 
 |         add             x10, x10, x1 | 
 |         mov             w3,  w9 | 
 |         b               1b | 
 | 9: | 
 |         ret | 
 |  | 
 | L(ipred_smooth_h_tbl): | 
 |         .hword L(ipred_smooth_h_tbl) - 640b | 
 |         .hword L(ipred_smooth_h_tbl) - 320b | 
 |         .hword L(ipred_smooth_h_tbl) - 160b | 
 |         .hword L(ipred_smooth_h_tbl) -  80b | 
 |         .hword L(ipred_smooth_h_tbl) -  40b | 
 | endfunc | 
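
// ipred_smooth_h above is the horizontal counterpart, interpolating each
// row between left[y] and the constant right pixel. Roughly (a sketch,
// not the exact dav1d C reference):
//
//   dst[y*stride + x] = (right*256
//                        + (left[y] - right)*weights_hor[x] + 128) >> 8;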
 |  | 
 | const padding_mask_buf | 
 |         .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 | 
 |         .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 | 
 |         .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 | 
 |         .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 | 
 | padding_mask: | 
 |         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | 
 |         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | 
 |         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | 
 |         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | 
 | endconst | 
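
// Loading 16 (or 32) bytes at padding_mask - n (for 0 <= n <= 32) gives
// a mask whose first n bytes are 0x00 and the rest 0xff; used with BIT,
// this splats the padding pixel into all lanes at index >= n.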
 |  | 
 | // void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz, | 
 | //                                       const pixel *const in, const int end); | 
 | function ipred_z1_upsample_edge_8bpc_neon, export=1 | 
 |         movrel          x4,  padding_mask | 
 |         ld1             {v0.16b},  [x2]           // in[] | 
 |         add             x5,  x2,  w3,  uxtw       // in[end] | 
 |         sub             x4,  x4,  w3,  uxtw | 
 |  | 
 |         ld1r            {v1.16b},  [x5]           // padding | 
 |         ld1             {v3.16b},  [x4]           // padding_mask | 
 |  | 
 |         movi            v31.8h,  #9 | 
 |  | 
 |         bit             v0.16b,  v1.16b,  v3.16b  // padded in[] | 
 |  | 
 |         ext             v4.16b,  v0.16b,  v1.16b,  #1 | 
 |         ext             v5.16b,  v0.16b,  v1.16b,  #2 | 
 |         ext             v6.16b,  v0.16b,  v1.16b,  #3 | 
 |  | 
 |         uaddl           v16.8h,  v4.8b,   v5.8b   // in[i+1] + in[i+2] | 
 |         uaddl2          v17.8h,  v4.16b,  v5.16b | 
 |         uaddl           v18.8h,  v0.8b,   v6.8b   // in[i+0] + in[i+3] | 
 |         uaddl2          v19.8h,  v0.16b,  v6.16b | 
 |         mul             v16.8h,  v16.8h,  v31.8h  // 9*(in[i+1] + in[i+2]) | 
 |         mul             v17.8h,  v17.8h,  v31.8h | 
 |         sub             v16.8h,  v16.8h,  v18.8h | 
 |         sub             v17.8h,  v17.8h,  v19.8h | 
 |  | 
 |         sqrshrun        v16.8b,  v16.8h,  #4 | 
 |         sqrshrun2       v16.16b, v17.8h,  #4 | 
 |  | 
 |         zip1            v0.16b,  v4.16b,  v16.16b | 
 |         zip2            v1.16b,  v4.16b,  v16.16b | 
 |  | 
 |         st1             {v0.16b, v1.16b}, [x0] | 
 |  | 
 |         ret | 
 | endfunc | 
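
// The function above implements the AV1 edge upsampling filter; per
// output pair it is roughly (a sketch; the exact indexing and clamping
// at the array ends differ):
//
//   out[2*i]     = in[i];
//   out[2*i + 1] = clip_pixel((9*(in[i] + in[i+1])
//                              - (in[i-1] + in[i+2]) + 8) >> 4);
//
// where clip_pixel() clamps to [0, 255].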
 |  | 
 | const edge_filter | 
 |         .byte 0, 4, 8, 0 | 
 |         .byte 0, 5, 6, 0 | 
 | // Leaving out the coeffs for strength=3 | 
 | //      .byte 2, 4, 4, 0 | 
 | endconst | 
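
// Each row holds the distinct taps of a symmetric smoothing kernel; the
// effective kernels, normalized by 1/16, are:
//   strength=1: [ 4, 8, 4 ]
//   strength=2: [ 5, 6, 5 ]
//   strength=3: [ 2, 4, 4, 4, 2 ]  (hardcoded via movi in L(fivetap))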
 |  | 
 | // void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz, | 
 | //                                     const pixel *const in, const int end, | 
 | //                                     const int strength); | 
 | function ipred_z1_filter_edge_8bpc_neon, export=1 | 
 |         cmp             w4, #3 | 
 |         b.eq            L(fivetap)                // if (strength == 3) goto fivetap | 
 |  | 
 |         movrel          x5,  edge_filter, -3 | 
 |         add             x5,  x5,  w4,  uxtw #2    // edge_filter + (strength - 1)*4 + 1 | 
 |  | 
 |         ld1             {v31.h}[0], [x5]          // kernel[1-2] | 
 |  | 
 |         ld1             {v0.16b}, [x2], #16 | 
 |  | 
 |         dup             v30.16b, v31.b[0] | 
 |         dup             v31.16b, v31.b[1] | 
 | 1: | 
        // in[end] is the last valid pixel. We produce 16 pixels out by
 |         // using 18 pixels in - the last pixel used is [17] of the ones | 
 |         // read/buffered. | 
 |         cmp             w3,  #17 | 
 |         ld1             {v1.16b}, [x2], #16 | 
 |         b.lt            2f | 
 |         ext             v2.16b,  v0.16b,  v1.16b,  #1 | 
 |         ext             v3.16b,  v0.16b,  v1.16b,  #2 | 
 |         umull           v4.8h,   v0.8b,   v30.8b | 
 |         umlal           v4.8h,   v2.8b,   v31.8b | 
 |         umlal           v4.8h,   v3.8b,   v30.8b | 
 |         umull2          v5.8h,   v0.16b,  v30.16b | 
 |         umlal2          v5.8h,   v2.16b,  v31.16b | 
 |         umlal2          v5.8h,   v3.16b,  v30.16b | 
 |         subs            w1,  w1,  #16 | 
 |         mov             v0.16b,  v1.16b | 
 |         rshrn           v4.8b,   v4.8h,   #4 | 
 |         rshrn2          v4.16b,  v5.8h,   #4 | 
 |         sub             w3,  w3,  #16 | 
 |         st1             {v4.16b}, [x0], #16 | 
 |         b.gt            1b | 
 |         ret | 
 | 2: | 
 |         // Right padding | 
 |  | 
 |         // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead) | 
 |         movrel          x5,  padding_mask | 
 |         sub             w6,  w3,  #32 | 
 |         sub             x5,  x5,  w3,  uxtw | 
 |         add             x6,  x2,  w6,  sxtw | 
 |  | 
 |         ld1             {v2.16b, v3.16b}, [x5]    // padding_mask | 
 |  | 
 |         ld1r            {v4.16b}, [x6] | 
 |         bit             v0.16b,  v4.16b,  v2.16b  // Pad v0-v1 | 
 |         bit             v1.16b,  v4.16b,  v3.16b | 
 |  | 
 |         // Filter one block | 
 |         ext             v2.16b,  v0.16b,  v1.16b,  #1 | 
 |         ext             v3.16b,  v0.16b,  v1.16b,  #2 | 
 |         umull           v4.8h,   v0.8b,   v30.8b | 
 |         umlal           v4.8h,   v2.8b,   v31.8b | 
 |         umlal           v4.8h,   v3.8b,   v30.8b | 
 |         umull2          v5.8h,   v0.16b,  v30.16b | 
 |         umlal2          v5.8h,   v2.16b,  v31.16b | 
 |         umlal2          v5.8h,   v3.16b,  v30.16b | 
 |         subs            w1,  w1,  #16 | 
 |         rshrn           v4.8b,   v4.8h,   #4 | 
 |         rshrn2          v4.16b,  v5.8h,   #4 | 
 |         st1             {v4.16b}, [x0], #16 | 
 |         b.le            9f | 
 | 5: | 
 |         // After one block, any remaining output would only be filtering | 
 |         // padding - thus just store the padding. | 
 |         subs            w1,  w1,  #16 | 
 |         st1             {v1.16b}, [x0], #16 | 
 |         b.gt            5b | 
 | 9: | 
 |         ret | 
 |  | 
 | L(fivetap): | 
 |         sub             x2,  x2,  #1              // topleft -= 1 | 
 |         movi            v29.16b, #2 | 
 |         ld1             {v0.16b}, [x2], #16 | 
 |         movi            v30.16b, #4 | 
 |         movi            v31.16b, #4 | 
 |         ins             v0.b[0], v0.b[1] | 
 | 1: | 
        // in[end+1] is the last valid pixel. We produce 16 pixels out by
 |         // using 20 pixels in - the last pixel used is [19] of the ones | 
 |         // read/buffered. | 
 |         cmp             w3,  #18 | 
 |         ld1             {v1.16b}, [x2], #16 | 
 |         b.lt            2f                        // if (end + 1 < 19) | 
 |         ext             v2.16b,  v0.16b,  v1.16b,  #1 | 
 |         ext             v3.16b,  v0.16b,  v1.16b,  #2 | 
 |         ext             v4.16b,  v0.16b,  v1.16b,  #3 | 
 |         ext             v5.16b,  v0.16b,  v1.16b,  #4 | 
 |         umull           v6.8h,   v0.8b,   v29.8b | 
 |         umlal           v6.8h,   v2.8b,   v30.8b | 
 |         umlal           v6.8h,   v3.8b,   v31.8b | 
 |         umlal           v6.8h,   v4.8b,   v30.8b | 
 |         umlal           v6.8h,   v5.8b,   v29.8b | 
 |         umull2          v7.8h,   v0.16b,  v29.16b | 
 |         umlal2          v7.8h,   v2.16b,  v30.16b | 
 |         umlal2          v7.8h,   v3.16b,  v31.16b | 
 |         umlal2          v7.8h,   v4.16b,  v30.16b | 
 |         umlal2          v7.8h,   v5.16b,  v29.16b | 
 |         subs            w1,  w1,  #16 | 
 |         mov             v0.16b,  v1.16b | 
 |         rshrn           v6.8b,   v6.8h,   #4 | 
 |         rshrn2          v6.16b,  v7.8h,   #4 | 
 |         sub             w3,  w3,  #16 | 
 |         st1             {v6.16b}, [x0], #16 | 
 |         b.gt            1b | 
 |         ret | 
 | 2: | 
 |         // Right padding | 
 |  | 
 |         // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead) | 
 |         movrel          x5,  padding_mask, -1 | 
 |         sub             w6,  w3,  #31 | 
 |         sub             x5,  x5,  w3,  uxtw | 
 |         add             x6,  x2,  w6,  sxtw | 
 |  | 
 |         ld1             {v2.16b, v3.16b}, [x5]    // padding_mask | 
 |  | 
 |         ld1r            {v28.16b}, [x6] | 
 |         bit             v0.16b,  v28.16b, v2.16b  // Pad v0-v1 | 
 |         bit             v1.16b,  v28.16b, v3.16b | 
 | 4: | 
 |         // Filter one block | 
 |         ext             v2.16b,  v0.16b,  v1.16b,  #1 | 
 |         ext             v3.16b,  v0.16b,  v1.16b,  #2 | 
 |         ext             v4.16b,  v0.16b,  v1.16b,  #3 | 
 |         ext             v5.16b,  v0.16b,  v1.16b,  #4 | 
 |         umull           v6.8h,   v0.8b,   v29.8b | 
 |         umlal           v6.8h,   v2.8b,   v30.8b | 
 |         umlal           v6.8h,   v3.8b,   v31.8b | 
 |         umlal           v6.8h,   v4.8b,   v30.8b | 
 |         umlal           v6.8h,   v5.8b,   v29.8b | 
 |         umull2          v7.8h,   v0.16b,  v29.16b | 
 |         umlal2          v7.8h,   v2.16b,  v30.16b | 
 |         umlal2          v7.8h,   v3.16b,  v31.16b | 
 |         umlal2          v7.8h,   v4.16b,  v30.16b | 
 |         umlal2          v7.8h,   v5.16b,  v29.16b | 
 |         subs            w1,  w1,  #16 | 
 |         mov             v0.16b,  v1.16b | 
 |         mov             v1.16b,  v28.16b | 
 |         rshrn           v6.8b,   v6.8h,   #4 | 
 |         rshrn2          v6.16b,  v7.8h,   #4 | 
 |         sub             w3,  w3,  #16 | 
 |         st1             {v6.16b}, [x0], #16 | 
 |         b.le            9f | 
 |         // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to | 
 |         // filter properly once more - aka (w3 >= 0). | 
 |         cmp             w3,  #0 | 
 |         b.ge            4b | 
 | 5: | 
 |         // When w3 <= 0, all remaining pixels in v0-v1 are equal to the | 
 |         // last valid pixel - thus just output that without filtering. | 
 |         subs            w1,  w1,  #16 | 
 |         st1             {v1.16b}, [x0], #16 | 
 |         b.gt            5b | 
 | 9: | 
 |         ret | 
 | endfunc | 
 |  | 
 | // void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                               const pixel *const top, | 
 | //                               const int width, const int height, | 
 | //                               const int dx, const int max_base_x); | 
 | function ipred_z1_fill1_8bpc_neon, export=1 | 
 |         clz             w9,  w3 | 
 |         adr             x8,  L(ipred_z1_fill1_tbl) | 
 |         sub             w9,  w9,  #25 | 
 |         ldrh            w9,  [x8, w9, uxtw #1] | 
 |         add             x10, x2,  w6,  uxtw       // top[max_base_x] | 
 |         sub             x8,  x8,  w9,  uxtw | 
 |         ld1r            {v31.16b}, [x10]          // padding | 
 |         mov             w7,  w5 | 
 |         mov             w15, #64 | 
 |         br              x8 | 
 | 40: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 4: | 
 |         lsr             w8,  w7,  #6              // base | 
 |         and             w9,  w7,  #0x3e           // frac | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 |         cmp             w8,  w6                   // base >= max_base_x | 
 |         lsr             w10, w7,  #6              // base | 
 |         and             w11, w7,  #0x3e           // frac | 
 |         b.ge            49f | 
 |         ldr             d0,  [x2, w8, uxtw]       // top[base] | 
 |         ldr             d2,  [x2, w10, uxtw] | 
 |         dup             v4.4h,   w9               // frac | 
 |         dup             v5.4h,   w11 | 
 |         ext             v1.8b,   v0.8b,   v0.8b,   #1 // top[base+1] | 
 |         ext             v3.8b,   v2.8b,   v2.8b,   #1 | 
 |         usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base] | 
 |         usubl           v7.8h,   v3.8b,   v2.8b | 
 |         ushll           v16.8h,  v0.8b,   #6      // top[base]*64 | 
 |         ushll           v17.8h,  v2.8b,   #6 | 
        mla             v16.4h,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
 |         mla             v17.4h,  v7.4h,   v5.4h | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         rshrn           v17.8b,  v17.8h,  #6 | 
 |         st1             {v16.s}[0], [x0], x1 | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v17.s}[0], [x0], x1 | 
 |         b.gt            4b | 
 |         ret | 
 |  | 
 | 49: | 
 |         st1             {v31.s}[0], [x0], x1 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v31.s}[0], [x0], x1 | 
 |         b.gt            49b | 
 |         ret | 
 |  | 
 | 80: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 8: | 
 |         lsr             w8,  w7,  #6              // base | 
 |         and             w9,  w7,  #0x3e           // frac | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 |         cmp             w8,  w6                   // base >= max_base_x | 
 |         lsr             w10, w7,  #6              // base | 
 |         and             w11, w7,  #0x3e           // frac | 
 |         b.ge            89f | 
 |         ldr             q0,  [x2, w8, uxtw]       // top[base] | 
 |         ldr             q2,  [x2, w10, uxtw] | 
 |         dup             v4.8b,   w9               // frac | 
 |         dup             v5.8b,   w11 | 
 |         sub             w9,  w15, w9              // 64 - frac | 
 |         sub             w11, w15, w11 | 
 |         dup             v6.8b,   w9               // 64 - frac | 
 |         dup             v7.8b,   w11 | 
 |         ext             v1.16b,  v0.16b,  v0.16b,  #1 // top[base+1] | 
 |         ext             v3.16b,  v2.16b,  v2.16b,  #1 | 
 |         umull           v16.8h,  v1.8b,   v4.8b   // top[base+1]*frac | 
 |         umlal           v16.8h,  v0.8b,   v6.8b   // + top[base]*(64-frac) | 
 |         umull           v17.8h,  v3.8b,   v5.8b | 
 |         umlal           v17.8h,  v2.8b,   v7.8b | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         rshrn           v17.8b,  v17.8h,  #6 | 
 |         st1             {v16.8b}, [x0], x1 | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v17.8b}, [x0], x1 | 
 |         b.gt            8b | 
 |         ret | 
 |  | 
 | 89: | 
 |         st1             {v31.8b}, [x0], x1 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v31.8b}, [x0], x1 | 
 |         b.gt            89b | 
 |         ret | 
 |  | 
 | 160: | 
 | 320: | 
 | 640: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |  | 
 |         mov             w12, w3 | 
 |  | 
 |         add             x13, x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         sub             x1,  x1,  w3,  uxtw | 
 | 1: | 
 |         lsr             w8,  w7,  #6              // base | 
 |         and             w9,  w7,  #0x3e           // frac | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 |         cmp             w8,  w6                   // base >= max_base_x | 
 |         lsr             w10, w7,  #6              // base | 
 |         and             w11, w7,  #0x3e           // frac | 
 |         b.ge            169f | 
 |         add             x8,  x2,  w8,  uxtw | 
 |         add             x10, x2,  w10, uxtw | 
 |         dup             v4.16b,  w9               // frac | 
 |         dup             v5.16b,  w11 | 
 |         ld1             {v0.16b, v1.16b}, [x8],  #32 // top[base] | 
 |         ld1             {v2.16b, v3.16b}, [x10], #32 | 
 |         sub             w9,  w15, w9              // 64 - frac | 
 |         sub             w11, w15, w11 | 
 |         dup             v6.16b,  w9               // 64 - frac | 
 |         dup             v7.16b,  w11 | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 | 2: | 
 |         ext             v16.16b, v0.16b,  v1.16b,  #1 // top[base+1] | 
 |         ext             v17.16b, v2.16b,  v3.16b,  #1 | 
 |         subs            w3,  w3,  #16 | 
 |         umull           v18.8h,  v16.8b,  v4.8b   // top[base+1]*frac | 
 |         umlal           v18.8h,  v0.8b,   v6.8b   // + top[base]*(64-frac) | 
 |         umull2          v19.8h,  v16.16b, v4.16b | 
 |         umlal2          v19.8h,  v0.16b,  v6.16b | 
 |         umull           v20.8h,  v17.8b,  v5.8b | 
 |         umlal           v20.8h,  v2.8b,   v7.8b | 
 |         umull2          v21.8h,  v17.16b, v5.16b | 
 |         umlal2          v21.8h,  v2.16b,  v7.16b | 
 |         rshrn           v16.8b,  v18.8h,  #6 | 
 |         rshrn2          v16.16b, v19.8h,  #6 | 
 |         rshrn           v17.8b,  v20.8h,  #6 | 
 |         rshrn2          v17.16b, v21.8h,  #6 | 
 |         st1             {v16.16b}, [x0],  #16 | 
 |         st1             {v17.16b}, [x13], #16 | 
 |         b.le            3f | 
 |         mov             v0.16b,  v1.16b | 
 |         ld1             {v1.16b}, [x8],  #16 // top[base] | 
 |         mov             v2.16b,  v3.16b | 
 |         ld1             {v3.16b}, [x10], #16 | 
 |         b               2b | 
 |  | 
 | 3: | 
 |         subs            w4,  w4,  #2 | 
 |         b.le            9f | 
 |         add             x0,  x0,  x1 | 
 |         add             x13, x13, x1 | 
 |         mov             w3,  w12 | 
 |         b               1b | 
 | 9: | 
 |         ret | 
 |  | 
 | 169: | 
 |         st1             {v31.16b}, [x0],  #16 | 
 |         subs            w3,  w3,  #16 | 
 |         st1             {v31.16b}, [x13], #16 | 
 |         b.gt            169b | 
 |         subs            w4,  w4,  #2 | 
 |         b.le            9b | 
 |         add             x0,  x0,  x1 | 
 |         add             x13, x13, x1 | 
 |         mov             w3,  w12 | 
 |         b               169b | 
 |  | 
 | L(ipred_z1_fill1_tbl): | 
 |         .hword L(ipred_z1_fill1_tbl) - 640b | 
 |         .hword L(ipred_z1_fill1_tbl) - 320b | 
 |         .hword L(ipred_z1_fill1_tbl) - 160b | 
 |         .hword L(ipred_z1_fill1_tbl) -  80b | 
 |         .hword L(ipred_z1_fill1_tbl) -  40b | 
 | endfunc | 
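
// Per row, ipred_z1_fill1 above computes roughly (a sketch, not the
// exact dav1d C reference; xpos advances by dx per row, and rows whose
// base reaches max_base_x are filled with the padding pixel):
//
//   int base = xpos >> 6, frac = xpos & 0x3e;
//   dst[x] = (top[base + x]*(64 - frac)
//             + top[base + x + 1]*frac + 32) >> 6;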
 |  | 
 | function ipred_z1_fill2_8bpc_neon, export=1 | 
 |         cmp             w3,  #8 | 
 |         add             x10, x2,  w6,  uxtw       // top[max_base_x] | 
 |         ld1r            {v31.16b}, [x10]          // padding | 
 |         mov             w7,  w5 | 
 |         mov             w15, #64 | 
 |         b.eq            8f | 
 |  | 
 | 4:      // w == 4 | 
 |         lsr             w8,  w7,  #6              // base | 
 |         and             w9,  w7,  #0x3e           // frac | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 |         cmp             w8,  w6                   // base >= max_base_x | 
 |         lsr             w10, w7,  #6              // base | 
 |         and             w11, w7,  #0x3e           // frac | 
 |         b.ge            49f | 
 |         ldr             d0,  [x2, w8, uxtw]       // top[base] | 
 |         ldr             d2,  [x2, w10, uxtw] | 
 |         dup             v4.4h,   w9               // frac | 
 |         dup             v5.4h,   w11 | 
 |         uzp2            v1.8b,   v0.8b,   v0.8b   // top[base+1] | 
 |         uzp1            v0.8b,   v0.8b,   v0.8b   // top[base] | 
 |         uzp2            v3.8b,   v2.8b,   v2.8b | 
 |         uzp1            v2.8b,   v2.8b,   v2.8b | 
 |         usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base] | 
 |         usubl           v7.8h,   v3.8b,   v2.8b | 
 |         ushll           v16.8h,  v0.8b,   #6      // top[base]*64 | 
 |         ushll           v17.8h,  v2.8b,   #6 | 
        mla             v16.4h,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
 |         mla             v17.4h,  v7.4h,   v5.4h | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         rshrn           v17.8b,  v17.8h,  #6 | 
 |         st1             {v16.s}[0], [x0], x1 | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v17.s}[0], [x0], x1 | 
 |         b.gt            4b | 
 |         ret | 
 |  | 
 | 49: | 
 |         st1             {v31.s}[0], [x0], x1 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v31.s}[0], [x0], x1 | 
 |         b.gt            49b | 
 |         ret | 
 |  | 
 | 8:      // w == 8 | 
 |         lsr             w8,  w7,  #6              // base | 
 |         and             w9,  w7,  #0x3e           // frac | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 |         cmp             w8,  w6                   // base >= max_base_x | 
 |         lsr             w10, w7,  #6              // base | 
 |         and             w11, w7,  #0x3e           // frac | 
 |         b.ge            89f | 
 |         ldr             q0,  [x2, w8, uxtw]       // top[base] | 
 |         ldr             q2,  [x2, w10, uxtw] | 
 |         dup             v4.8b,   w9               // frac | 
 |         dup             v5.8b,   w11 | 
 |         sub             w9,  w15, w9              // 64 - frac | 
 |         sub             w11, w15, w11 | 
 |         dup             v6.8b,   w9               // 64 - frac | 
 |         dup             v7.8b,   w11 | 
 |         uzp2            v1.16b,  v0.16b,  v0.16b  // top[base+1] | 
 |         uzp1            v0.16b,  v0.16b,  v0.16b  // top[base] | 
 |         uzp2            v3.16b,  v2.16b,  v2.16b | 
 |         uzp1            v2.16b,  v2.16b,  v2.16b | 
 |         umull           v16.8h,  v1.8b,   v4.8b   // top[base+1]*frac | 
 |         umlal           v16.8h,  v0.8b,   v6.8b   // + top[base]*(64-frac) | 
 |         umull           v17.8h,  v3.8b,   v5.8b | 
 |         umlal           v17.8h,  v2.8b,   v7.8b | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         rshrn           v17.8b,  v17.8h,  #6 | 
 |         st1             {v16.8b}, [x0], x1 | 
 |         add             w7,  w7,  w5              // xpos += dx | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v17.8b}, [x0], x1 | 
 |         b.gt            8b | 
 |         ret | 
 |  | 
 | 89: | 
 |         st1             {v31.8b}, [x0], x1 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v31.8b}, [x0], x1 | 
 |         b.gt            89b | 
 |         ret | 
 | endfunc | 
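
// ipred_z1_fill2 is the variant for upsampled edges: top[] holds full-pel
// and half-pel samples interleaved, so each output pixel steps two input
// elements (hence the uzp1/uzp2 deinterleaving). Roughly:
//
//   dst[x] = (top[base + 2*x]*(64 - frac)
//             + top[base + 2*x + 1]*frac + 32) >> 6;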
 |  | 
 | // void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src, | 
 | //                              const int n); | 
 | function ipred_reverse_8bpc_neon, export=1 | 
 |         sub             x1,  x1,  #16 | 
 |         add             x3,  x0,  #8 | 
 |         mov             x4,  #16 | 
 | 1: | 
 |         ld1             {v0.16b}, [x1] | 
 |         subs            w2,  w2,  #16 | 
 |         rev64           v0.16b,  v0.16b | 
 |         sub             x1,  x1,  #16 | 
 |         st1             {v0.d}[1], [x0], x4 | 
 |         st1             {v0.d}[0], [x3], x4 | 
 |         b.gt            1b | 
 |         ret | 
 | endfunc | 
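
// Reverses n pixels ending just below src, 16 at a time; equivalent to:
//
//   for (int i = 0; i < n; i++)
//       dst[i] = src[-1 - i];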
 |  | 
 | const increments | 
 |         .short          0,  1,  2,  3,  4,  5,  6,  7 | 
 | endconst | 
 |  | 
 | // void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                               const pixel *const left, | 
 | //                               const int width, const int height, | 
 | //                               const int dy, const int max_base_y); | 
 | function ipred_z3_fill1_8bpc_neon, export=1 | 
 |         cmp             w6,  #64 | 
 |         clz             w9,  w3 | 
 |         adr             x8,  L(ipred_z3_fill1_tbl) | 
 |         sub             w9,  w9,  #25 | 
 |         ldrh            w9,  [x8, w9, uxtw #1] | 
 |         add             x10, x2,  w6,  uxtw       // left[max_base_y] | 
 |         sub             x8,  x8,  w9,  uxtw | 
 |         movrel          x11, increments | 
 |         ld1r            {v31.16b}, [x10]          // padding | 
 |         ld1             {v30.8h},  [x11]          // increments | 
 |         mov             w7,  w5 | 
 |         b.gt            L(ipred_z3_fill1_large_w16) | 
 |         br              x8 | 
 |  | 
 | 40: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         dup             v29.4h,  w5               // dy | 
 |  | 
 |         mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3,4,5,6,7}*dy | 
 |         movi            v23.16b, #0x3e | 
 |  | 
 |         ld1             {v0.16b, v1.16b}, [x2] // left[] | 
 |         add             v30.4h,  v29.4h,  v30.4h  // ypos | 
 |  | 
 |         movi            v22.16b, #64 | 
 |         movi            v20.16b, #1 | 
 |         movi            v21.16b, #2 | 
 |  | 
 |         xtn             v24.8b,  v30.8h           // (uint8_t)ypos | 
 |         uqshrn          v26.8b,  v30.8h,  #6      // base | 
 |         and             v24.8b,  v24.8b,  v23.8b  // frac | 
 |  | 
 |         mov             v4.8b,   v31.8b | 
 |         uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1 | 
 |         uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2 | 
 |         sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac | 
 |  | 
 |         tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base] | 
 |  | 
 |         trn1            v27.2s,  v27.2s,  v28.2s  // base + 1, base + 2 | 
 |         trn1            v24.2s,  v24.2s,  v24.2s  // frac | 
 |         trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac | 
 | 1: | 
 |         mov             v5.8b,   v31.8b | 
 |         tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2] | 
 |  | 
 |         trn1            v4.2s,   v4.2s,   v5.2s   // left[base], left[base+1] | 
 |  | 
 |         umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac) | 
 |         umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         st1             {v16.s}[0], [x0], x1 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v16.s}[1], [x0], x1 | 
 |         b.le            9f | 
 |  | 
 |         ext             v4.8b,   v5.8b,   v5.8b,  #4 | 
 |         uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2 | 
 |         b               1b | 
 |  | 
 | 9: | 
 |         ret | 
 |  | 
 | 80: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         dup             v29.8h,  w5               // dy | 
 |  | 
 |         mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy | 
 |         movi            v23.16b, #0x3e | 
 |  | 
 |         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] | 
 |         add             v30.8h,  v29.8h,  v30.8h  // ypos | 
 |  | 
 |         movi            v22.16b, #64 | 
 |         movi            v20.16b, #1 | 
 |         movi            v21.16b, #2 | 
 |  | 
 |         xtn             v24.8b,  v30.8h           // (uint8_t)ypos | 
 |         uqshrn          v26.8b,  v30.8h,  #6      // base | 
 |         and             v24.8b,  v24.8b,  v23.8b  // frac | 
 |  | 
 |         mov             v4.8b,   v31.8b | 
 |         uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1 | 
 |         uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2 | 
 |         sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac | 
 |  | 
 |         tbx             v4.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.8b // left[base] | 
 | 1: | 
 |         mov             v5.8b,   v31.8b | 
 |         mov             v6.8b,   v31.8b | 
 |         tbx             v5.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.8b // left[base+1] | 
 |         tbx             v6.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.8b // left[base+2] | 
 |  | 
 |         umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac) | 
 |         umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac | 
 |         umull           v17.8h,  v5.8b,   v25.8b | 
 |         umlal           v17.8h,  v6.8b,   v24.8b | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         rshrn           v17.8b,  v17.8h,  #6 | 
 |         st1             {v16.8b}, [x0], x1 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v17.8b}, [x0], x1 | 
 |         b.le            9f | 
 |  | 
 |         mov             v4.8b,   v6.8b | 
 |         uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2 | 
 |         uqadd           v28.8b,  v28.8b,  v21.8b  // base += 2 | 
 |         b               1b | 
 |  | 
 | 9: | 
 |         ret | 
 |  | 
 | 160: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         dup             v28.8h,  w5               // dy | 
 |  | 
 |         shl             v29.8h,  v28.8h,  #3      // 8*dy | 
 |         mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy | 
 |         movi            v23.16b, #0x3e | 
 |  | 
 |         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] | 
 |         add             v28.8h,  v28.8h,  v30.8h  // ypos | 
 |  | 
 |         movi            v22.16b, #64 | 
 |         movi            v20.16b, #1 | 
 |         movi            v21.16b, #2 | 
 |  | 
 |         add             v29.8h,  v28.8h,  v29.8h  // ypos + 8*dy | 
 |  | 
 |         xtn             v24.8b,  v28.8h           // (uint8_t)ypos | 
 |         xtn2            v24.16b, v29.8h | 
 |         uqshrn          v26.8b,  v28.8h,  #6      // base | 
 |         uqshrn2         v26.16b, v29.8h,  #6 | 
 |         and             v24.16b, v24.16b, v23.16b // frac | 
 |  | 
 |         mov             v4.16b,  v31.16b | 
 |         uqadd           v27.16b, v26.16b, v20.16b // base + 1 | 
 |         uqadd           v28.16b, v26.16b, v21.16b // base + 2 | 
 |         sub             v25.16b, v22.16b, v24.16b // 64 - frac | 
 |  | 
 |         tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base] | 
 | 1: | 
 |         mov             v5.16b,  v31.16b | 
 |         mov             v6.16b,  v31.16b | 
 |         tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1] | 
 |         tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2] | 
 |  | 
 |         umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac) | 
 |         umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac | 
 |         umull2          v17.8h,  v4.16b,  v25.16b | 
 |         umlal2          v17.8h,  v5.16b,  v24.16b | 
 |         umull           v18.8h,  v5.8b,   v25.8b | 
 |         umlal           v18.8h,  v6.8b,   v24.8b | 
 |         umull2          v19.8h,  v5.16b,  v25.16b | 
 |         umlal2          v19.8h,  v6.16b,  v24.16b | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         rshrn2          v16.16b, v17.8h,  #6 | 
 |         rshrn           v17.8b,  v18.8h,  #6 | 
 |         rshrn2          v17.16b, v19.8h,  #6 | 
 |         st1             {v16.16b}, [x0], x1 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v17.16b}, [x0], x1 | 
 |         b.le            9f | 
 |  | 
 |         mov             v4.16b,  v6.16b | 
 |         uqadd           v27.16b, v27.16b, v21.16b // base += 2 | 
 |         uqadd           v28.16b, v28.16b, v21.16b // base += 2 | 
 |         b               1b | 
 |  | 
 | 9: | 
 |         ret | 
 | 320: | 
 | 640: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         dup             v28.8h,  w5               // dy | 
 |         mov             w12, w3 | 
 |  | 
 |         add             x13, x0,  x1 | 
 |  | 
 |         shl             v29.8h,  v28.8h,  #3      // 8*dy | 
 |         mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy | 
 |         movi            v23.16b, #0x3e | 
 |  | 
 |         lsl             x1,  x1,  #1 | 
 |         sub             x1,  x1,  w3,  uxtw | 
 |         add             v30.8h,  v28.8h,  v30.8h  // ypos | 
 |  | 
 |         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] | 
 |  | 
 |         movi            v22.16b, #64 | 
 |         movi            v20.16b, #1 | 
 |         movi            v21.16b, #2 | 
 |  | 
 | 1: | 
 |         mov             v26.16b,  v30.16b         // reset ypos | 
 |  | 
 | 2: | 
 |         add             v27.8h,  v26.8h,  v29.8h  // ypos + 8*dy | 
 |         uqshrn          v16.8b,  v26.8h,  #6      // base | 
 |         uqshrn2         v16.16b, v27.8h,  #6 | 
 |         xtn             v24.8b,  v26.8h           // (uint8_t)ypos | 
 |         xtn2            v24.16b, v27.8h | 
 |         umov            w14,     v16.b[0] | 
 |         and             v24.16b, v24.16b, v23.16b // frac | 
 |  | 
 |         uqadd           v17.16b, v16.16b, v20.16b // base + 1 | 
 |         cmp             w14, w6                   // base >= max_base_y | 
 |         uqadd           v18.16b, v16.16b, v21.16b // base + 2 | 
 |         sub             v25.16b, v22.16b, v24.16b // 64 - frac | 
 |  | 
 |         b.ge            4f | 
 |  | 
 |         mov             v4.16b,  v31.16b | 
 |         mov             v5.16b,  v31.16b | 
 |         mov             v6.16b,  v31.16b | 
 |         tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base] | 
 |         tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1] | 
 |         tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2] | 
 |  | 
 |         subs            w3,  w3,  #16 | 
 |         umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac) | 
 |         umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac | 
 |         umull2          v17.8h,  v4.16b,  v25.16b | 
 |         umlal2          v17.8h,  v5.16b,  v24.16b | 
 |         umull           v18.8h,  v5.8b,   v25.8b | 
 |         umlal           v18.8h,  v6.8b,   v24.8b | 
 |         umull2          v19.8h,  v5.16b,  v25.16b | 
 |         umlal2          v19.8h,  v6.16b,  v24.16b | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         rshrn2          v16.16b, v17.8h,  #6 | 
 |         rshrn           v17.8b,  v18.8h,  #6 | 
 |         rshrn2          v17.16b, v19.8h,  #6 | 
 |         st1             {v16.16b}, [x0],  #16 | 
 |         st1             {v17.16b}, [x13], #16 | 
 |         b.le            3f | 
 |         add             v26.8h,  v27.8h,  v29.8h  // ypos += 16*dy | 
 |         b               2b | 
 |  | 
 | 3: | 
 |         subs            w4,  w4,  #2 | 
 |         b.le            9f | 
 |         movi            v16.8h,  #128 | 
 |         add             x0,  x0,  x1 | 
 |         add             x13, x13, x1 | 
 |         add             v30.8h,  v30.8h,  v16.8h  // ypos = dy + y*(1<<6)*2 | 
 |         mov             w3,  w12 | 
 |         b               1b | 
 |  | 
 | 4: | 
 |         subs            w3,  w3,  #16 | 
 |         st1             {v31.16b}, [x0],  #16 | 
 |         st1             {v31.16b}, [x13], #16 | 
 |         b.gt            4b | 
 |         b               3b | 
 |  | 
 | 9: | 
 |         ret | 
 |  | 
 | L(ipred_z3_fill1_large_w16): | 
        // Fallback case for max_base_y > 64; similar to the z1
        // implementation. This does the filtering vertically, filling out
        // a 2 pixel wide column at a time.
 |         mov             w15, #64 | 
 |         add             x13, x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |  | 
 |         mov             w12, w4 | 
 | 1: | 
 |         lsr             w8,  w7,  #6              // base | 
 |         and             w9,  w7,  #0x3e           // frac | 
 |         add             w7,  w7,  w5              // ypos += dy | 
 |         cmp             w8,  w6                   // base >= max_base_y | 
 |         lsr             w10, w7,  #6              // base | 
 |         and             w11, w7,  #0x3e           // frac | 
 |         b.ge            ipred_z3_fill_padding_neon | 
 |         add             x8,  x2,  w8,  uxtw | 
 |         add             x10, x2,  w10, uxtw | 
 |         dup             v4.16b,  w9               // frac | 
 |         dup             v5.16b,  w11 | 
 |         ld1             {v0.16b, v1.16b}, [x8],  #32 // left[base] | 
 |         ld1             {v2.16b, v3.16b}, [x10], #32 | 
 |         sub             w9,  w15, w9              // 64 - frac | 
 |         sub             w11, w15, w11 | 
 |         dup             v6.16b,  w9               // 64 - frac | 
 |         dup             v7.16b,  w11 | 
 |         add             w7,  w7,  w5              // ypos += dy | 
 | 2: | 
 |         ext             v16.16b, v0.16b,  v1.16b,  #1 // left[base+1] | 
 |         ext             v17.16b, v2.16b,  v3.16b,  #1 | 
 |         subs            w4,  w4,  #16 | 
 |         umull           v18.8h,  v16.8b,  v4.8b   // left[base+1]*frac | 
 |         umlal           v18.8h,  v0.8b,   v6.8b   // + left[base]*(64-frac) | 
 |         umull2          v19.8h,  v16.16b, v4.16b | 
 |         umlal2          v19.8h,  v0.16b,  v6.16b | 
 |         umull           v20.8h,  v17.8b,  v5.8b | 
 |         umlal           v20.8h,  v2.8b,   v7.8b | 
 |         umull2          v21.8h,  v17.16b, v5.16b | 
 |         umlal2          v21.8h,  v2.16b,  v7.16b | 
 |         rshrn           v16.8b,  v18.8h,  #6 | 
 |         rshrn2          v16.16b, v19.8h,  #6 | 
 |         rshrn           v17.8b,  v20.8h,  #6 | 
 |         rshrn2          v17.16b, v21.8h,  #6 | 
 |         zip1            v18.16b, v16.16b, v17.16b | 
 |         zip2            v19.16b, v16.16b, v17.16b | 
 |         st1             {v18.h}[0], [x0],  x1 | 
 |         st1             {v18.h}[1], [x13], x1 | 
 |         st1             {v18.h}[2], [x0],  x1 | 
 |         st1             {v18.h}[3], [x13], x1 | 
 |         st1             {v18.h}[4], [x0],  x1 | 
 |         st1             {v18.h}[5], [x13], x1 | 
 |         st1             {v18.h}[6], [x0],  x1 | 
 |         st1             {v18.h}[7], [x13], x1 | 
 |         st1             {v19.h}[0], [x0],  x1 | 
 |         st1             {v19.h}[1], [x13], x1 | 
 |         st1             {v19.h}[2], [x0],  x1 | 
 |         st1             {v19.h}[3], [x13], x1 | 
 |         st1             {v19.h}[4], [x0],  x1 | 
 |         st1             {v19.h}[5], [x13], x1 | 
 |         st1             {v19.h}[6], [x0],  x1 | 
 |         st1             {v19.h}[7], [x13], x1 | 
 |         b.le            3f | 
 |         mov             v0.16b,  v1.16b | 
 |         ld1             {v1.16b}, [x8],  #16      // left[base] | 
 |         mov             v2.16b,  v3.16b | 
 |         ld1             {v3.16b}, [x10], #16 | 
 |         b               2b | 
 |  | 
 | 3: | 
 |         subs            w3,  w3,  #2 | 
 |         b.le            9f | 
 |         lsr             x1,  x1,  #1 | 
 |         msub            x0,  x1,  x12, x0         // ptr -= h * stride | 
 |         msub            x13, x1,  x12, x13 | 
 |         lsl             x1,  x1,  #1 | 
 |         add             x0,  x0,  #2 | 
 |         add             x13, x13, #2 | 
 |         mov             w4,  w12 | 
 |         b               1b | 
 | 9: | 
 |         ret | 
 |  | 
 | L(ipred_z3_fill1_tbl): | 
 |         .hword L(ipred_z3_fill1_tbl) - 640b | 
 |         .hword L(ipred_z3_fill1_tbl) - 320b | 
 |         .hword L(ipred_z3_fill1_tbl) - 160b | 
 |         .hword L(ipred_z3_fill1_tbl) -  80b | 
 |         .hword L(ipred_z3_fill1_tbl) -  40b | 
 | endfunc | 
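
// ipred_z3_fill1 is the left-edge (transposed) counterpart of
// ipred_z1_fill1: the fractional phase depends on the column, and the
// base index steps by one per row. Roughly (a sketch; out-of-range bases
// yield the padding pixel, via TBX out-of-range lanes or explicit
// checks):
//
//   int ypos = (x + 1)*dy;
//   int base = ypos >> 6, frac = ypos & 0x3e;
//   dst[y*stride + x] = (left[base + y]*(64 - frac)
//                        + left[base + y + 1]*frac + 32) >> 6;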
 |  | 
 | function ipred_z3_fill_padding_neon, export=0 | 
 |         cmp             w3,  #16 | 
 |         adr             x8,  L(ipred_z3_fill_padding_tbl) | 
 |         b.gt            L(ipred_z3_fill_padding_wide) | 
 |         // w3 = remaining width, w4 = constant height | 
 |         mov             w12, w4 | 
 |  | 
 | 1: | 
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating - e.g.
        // W == 12 is filled as one 8 wide column plus one 4 wide one.
 |         clz             w9,  w3 | 
 |         sub             w9,  w9,  #25 | 
 |         ldrh            w9,  [x8, w9, uxtw #1] | 
 |         sub             x9,  x8,  w9,  uxtw | 
 |         br              x9 | 
 |  | 
 | 2: | 
 |         st1             {v31.h}[0], [x0],  x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v31.h}[0], [x13], x1 | 
 |         st1             {v31.h}[0], [x0],  x1 | 
 |         st1             {v31.h}[0], [x13], x1 | 
 |         b.gt            2b | 
 |         subs            w3,  w3,  #2 | 
 |         lsr             x1,  x1,  #1 | 
 |         msub            x0,  x1,  x12, x0         // ptr -= h * stride | 
 |         msub            x13, x1,  x12, x13 | 
 |         b.le            9f | 
 |         lsl             x1,  x1,  #1 | 
 |         add             x0,  x0,  #2 | 
 |         add             x13, x13, #2 | 
 |         mov             w4,  w12 | 
 |         b               1b | 
 |  | 
 | 4: | 
 |         st1             {v31.s}[0], [x0],  x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v31.s}[0], [x13], x1 | 
 |         st1             {v31.s}[0], [x0],  x1 | 
 |         st1             {v31.s}[0], [x13], x1 | 
 |         b.gt            4b | 
 |         subs            w3,  w3,  #4 | 
 |         lsr             x1,  x1,  #1 | 
 |         msub            x0,  x1,  x12, x0         // ptr -= h * stride | 
 |         msub            x13, x1,  x12, x13 | 
 |         b.le            9f | 
 |         lsl             x1,  x1,  #1 | 
 |         add             x0,  x0,  #4 | 
 |         add             x13, x13, #4 | 
 |         mov             w4,  w12 | 
 |         b               1b | 
 |  | 
 | 8: | 
 |         st1             {v31.8b}, [x0],  x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v31.8b}, [x13], x1 | 
 |         st1             {v31.8b}, [x0],  x1 | 
 |         st1             {v31.8b}, [x13], x1 | 
        b.gt            8b
 |         subs            w3,  w3,  #8 | 
 |         lsr             x1,  x1,  #1 | 
 |         msub            x0,  x1,  x12, x0         // ptr -= h * stride | 
 |         msub            x13, x1,  x12, x13 | 
 |         b.le            9f | 
 |         lsl             x1,  x1,  #1 | 
 |         add             x0,  x0,  #8 | 
 |         add             x13, x13, #8 | 
 |         mov             w4,  w12 | 
 |         b               1b | 
 |  | 
 | 16: | 
 | 32: | 
 | 64: | 
 |         st1             {v31.16b}, [x0],  x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v31.16b}, [x13], x1 | 
 |         st1             {v31.16b}, [x0],  x1 | 
 |         st1             {v31.16b}, [x13], x1 | 
        b.gt            16b
 |         subs            w3,  w3,  #16 | 
 |         lsr             x1,  x1,  #1 | 
 |         msub            x0,  x1,  x12, x0         // ptr -= h * stride | 
 |         msub            x13, x1,  x12, x13 | 
 |         b.le            9f | 
 |         lsl             x1,  x1,  #1 | 
 |         add             x0,  x0,  #16 | 
 |         add             x13, x13, #16 | 
 |         mov             w4,  w12 | 
 |         b               1b | 
 |  | 
 | 9: | 
 |         ret | 
 |  | 
 | L(ipred_z3_fill_padding_tbl): | 
 |         .hword L(ipred_z3_fill_padding_tbl) - 64b | 
 |         .hword L(ipred_z3_fill_padding_tbl) - 32b | 
 |         .hword L(ipred_z3_fill_padding_tbl) - 16b | 
 |         .hword L(ipred_z3_fill_padding_tbl) -  8b | 
 |         .hword L(ipred_z3_fill_padding_tbl) -  4b | 
 |         .hword L(ipred_z3_fill_padding_tbl) -  2b | 
 |  | 
 | L(ipred_z3_fill_padding_wide): | 
 |         // Fill a WxH rectangle with padding, with W > 16. | 
 |         lsr             x1,  x1,  #1 | 
 |         mov             w12, w3 | 
 |         sub             x1,  x1,  w3,  uxtw | 
 | 1: | 
 |         ands            w5,  w3,  #15 | 
 |         b.eq            2f | 
 |         // If the width isn't aligned to 16, first do one 16 byte write | 
 |         // and align the start pointer. | 
 |         sub             w3,  w3,  w5 | 
 |         st1             {v31.16b}, [x0] | 
 |         add             x0,  x0,  w5,  uxtw | 
 | 2: | 
 |         // Fill the rest of the line with aligned 16 byte writes. | 
 |         subs            w3,  w3,  #16 | 
 |         st1             {v31.16b}, [x0], #16 | 
 |         b.gt            2b | 
 |         subs            w4,  w4,  #1 | 
 |         add             x0,  x0, x1 | 
 |         b.le            9f | 
 |         mov             w3,  w12 | 
 |         b               1b | 
 | 9: | 
 |         ret | 
 | endfunc | 
 |  | 
function ipred_z3_fill2_8bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw       // left[max_base_y]
        movrel          x11, increments
        ld1r            {v31.16b}, [x10]          // padding
        ld1             {v30.8h},  [x11]          // increments
        b.eq            80f
 |  | 
 | 40:     // w == 4 | 
 |         dup             v29.4h,  w5               // dy | 
 |  | 
 |         mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3,4,5,6,7}*dy | 
 |         movi            v23.16b, #0x3e | 
 |  | 
 |         ld1             {v0.16b, v1.16b}, [x2] // left[] | 
 |         add             v30.4h,  v29.4h,  v30.4h  // ypos | 
 |  | 
 |         movi            v22.16b, #64 | 
 |         movi            v20.16b, #1 | 
 |         movi            v21.16b, #2 | 
 |  | 
 |         xtn             v24.8b,  v30.8h           // (uint8_t)ypos | 
 |         uqshrn          v26.8b,  v30.8h,  #6      // base | 
 |         and             v24.8b,  v24.8b,  v23.8b  // frac | 
 |  | 
 |         uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1 | 
 |         uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2 | 
 |         sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac | 
 |         uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3 | 
 |  | 
 |         trn1            v24.2s,  v24.2s,  v24.2s  // frac | 
 |         trn1            v26.2s,  v26.2s,  v28.2s  // base + 0, base + 2 | 
 |         trn1            v27.2s,  v27.2s,  v29.2s  // base + 1, base + 3 | 
 |         trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac | 
 |  | 
 |         movi            v21.16b, #4 | 
 | 1: | 
 |         mov             v4.8b,   v31.8b | 
 |         mov             v5.8b,   v31.8b | 
 |         tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2] | 
 |         tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3] | 
 |  | 
 |         umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac) | 
 |         umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         st1             {v16.s}[0], [x0], x1 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v16.s}[1], [x0], x1 | 
 |         b.le            9f | 
 |  | 
 |         uqadd           v26.8b,  v26.8b,  v21.8b  // base += 4 | 
 |         uqadd           v27.8b,  v27.8b,  v21.8b  // base += 4 | 
 |         b               1b | 
 |  | 
 | 9: | 
 |         ret | 
 |  | 
 | 80:     // w == 8 | 
 |         dup             v29.8h,  w5               // dy | 
 |  | 
 |         mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy | 
 |         movi            v23.16b, #0x3e | 
 |  | 
 |         ld1             {v0.16b, v1.16b}, [x2] // left[] | 
 |         add             v30.8h,  v29.8h,  v30.8h  // ypos | 
 |  | 
 |         movi            v22.16b, #64 | 
 |         movi            v20.16b, #1 | 
 |         movi            v21.16b, #2 | 
 |  | 
 |         xtn             v24.8b,  v30.8h           // (uint8_t)ypos | 
 |         uqshrn          v26.8b,  v30.8h,  #6      // base | 
 |         and             v24.8b,  v24.8b,  v23.8b  // frac | 
 |  | 
 |         uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1 | 
 |         uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2 | 
 |         sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac | 
 |         uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3 | 
 |  | 
 |         trn1            v24.2d,  v24.2d,  v24.2d  // frac | 
 |         trn1            v26.2d,  v26.2d,  v28.2d  // base + 0, base + 2 | 
 |         trn1            v27.2d,  v27.2d,  v29.2d  // base + 1, base + 3 | 
 |         trn1            v25.2d,  v25.2d,  v25.2d  // 64 - frac | 
 |  | 
 |         movi            v21.16b, #4 | 
 | 1: | 
 |         mov             v4.16b,  v31.16b | 
 |         mov             v5.16b,  v31.16b | 
        tbx             v4.16b, {v0.16b, v1.16b}, v26.16b // left[base], left[base+2]
        tbx             v5.16b, {v0.16b, v1.16b}, v27.16b // left[base+1], left[base+3]
 |  | 
 |         umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac) | 
 |         umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac | 
 |         umull2          v17.8h,  v4.16b,  v25.16b | 
 |         umlal2          v17.8h,  v5.16b,  v24.16b | 
 |         rshrn           v16.8b,  v16.8h,  #6 | 
 |         rshrn           v17.8b,  v17.8h,  #6 | 
 |         st1             {v16.8b}, [x0], x1 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v17.8b}, [x0], x1 | 
 |         b.le            9f | 
 |  | 
 |         uqadd           v26.16b, v26.16b, v21.16b // base += 4 | 
 |         uqadd           v27.16b, v27.16b, v21.16b // base += 4 | 
 |         b               1b | 
 |  | 
 | 9: | 
 |         ret | 
 | endfunc | 
 |  | 
 |  | 
 | // void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                             const pixel *const topleft, | 
 | //                             const int width, const int height, const int filt_idx, | 
 | //                             const int max_width, const int max_height); | 
 | function ipred_filter_8bpc_neon, export=1 | 
 |         and             w5,  w5,  #511 | 
 |         movrel          x6,  X(filter_intra_taps) | 
 |         lsl             w5,  w5,  #6 | 
 |         add             x6,  x6,  w5, uxtw | 
 |         ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 | 
 |         clz             w9,  w3 | 
 |         adr             x5,  L(ipred_filter_tbl) | 
 |         ld1             {v20.8b, v21.8b, v22.8b}, [x6] | 
 |         sub             w9,  w9,  #26 | 
 |         ldrh            w9,  [x5, w9, uxtw #1] | 
 |         sxtl            v16.8h,  v16.8b | 
 |         sxtl            v17.8h,  v17.8b | 
 |         sub             x5,  x5,  w9, uxtw | 
 |         sxtl            v18.8h,  v18.8b | 
 |         sxtl            v19.8h,  v19.8b | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         sxtl            v20.8h,  v20.8b | 
 |         sxtl            v21.8h,  v21.8b | 
 |         sxtl            v22.8h,  v22.8b | 
 |         br              x5 | 
 | 40: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ldur            s0,  [x2, #1]             // top (0-3) | 
 |         sub             x2,  x2,  #2 | 
 |         mov             x7,  #-2 | 
 |         uxtl            v0.8h,   v0.8b            // top (0-3) | 
 | 4: | 
 |         ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2) | 
 |         mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1) | 
 |         mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2) | 
 |         mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3) | 
 |         uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2) | 
 |         mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4) | 
 |         mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0) | 
 |         mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5) | 
 |         mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6) | 
 |         sqrshrun        v2.8b,   v2.8h,   #4 | 
 |         subs            w4,  w4,  #2 | 
 |         st1             {v2.s}[0], [x0], x1 | 
 |         uxtl            v0.8h,   v2.8b | 
 |         st1             {v2.s}[1], [x6], x1 | 
 |         ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3] | 
 |         b.gt            4b | 
 |         ret | 
 | 80: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ldur            d0,  [x2, #1]             // top (0-7) | 
 |         sub             x2,  x2,  #2 | 
 |         mov             x7,  #-2 | 
 |         uxtl            v0.8h,   v0.8b            // top (0-7) | 
 | 8: | 
 |         ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2) | 
 |         mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1) | 
 |         mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2) | 
 |         mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3) | 
 |         uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2) | 
 |         mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4) | 
 |         mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0) | 
 |         mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5) | 
 |         mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6) | 
 |         mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1) | 
 |         mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2) | 
 |         mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3) | 
 |         sqrshrun        v2.8b,   v2.8h,   #4 | 
 |         uxtl            v1.8h,   v2.8b            // first block, in 16 bit | 
 |         mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4) | 
 |         mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0) | 
 |         mla             v3.8h,   v21.8h,  v1.h[3] // p5(left[0]) * filter(5) | 
 |         mla             v3.8h,   v22.8h,  v1.h[7] // p6(left[1]) * filter(6) | 
 |         sqrshrun        v3.8b,   v3.8h,   #4 | 
 |         subs            w4,  w4,  #2 | 
 |         st2             {v2.s, v3.s}[0], [x0], x1 | 
 |         zip2            v0.2s,   v2.2s,   v3.2s | 
 |         st2             {v2.s, v3.s}[1], [x6], x1 | 
 |         uxtl            v0.8h,   v0.8b | 
 |         b.gt            8b | 
 |         ret | 
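        // For w == 16 and w == 32 each row pair is processed 16 columns
        // at a time; each 4x2 block reuses the previous block's output
        // column as its left neighbours and the previous top sample as
        // its topleft (see the ins instructions before the branch back
        // to 2b below).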
 | 160: | 
 | 320: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         add             x8,  x2,  #1 | 
 |         sub             x2,  x2,  #2 | 
 |         mov             x7,  #-2 | 
 |         sub             x1,  x1,  w3, uxtw | 
 |         mov             w9,  w3 | 
 |  | 
 | 1: | 
 |         ld1             {v0.s}[0], [x2], x7       // left (0-1) + topleft (2) | 
 |         uxtl            v0.8h,   v0.8b            // left (0-1) + topleft (2) | 
 | 2: | 
 |         ld1             {v2.16b}, [x8],   #16     // top(0-15) | 
 |         mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0) | 
 |         mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5) | 
 |         uxtl            v1.8h,   v2.8b            // top(0-7) | 
 |         uxtl2           v2.8h,   v2.16b           // top(8-15) | 
 |         mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6) | 
 |         mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1) | 
 |         mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2) | 
 |         mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3) | 
 |         mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4) | 
 |  | 
 |         mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1) | 
 |         mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2) | 
 |         mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3) | 
 |         sqrshrun        v3.8b,   v3.8h,   #4 | 
 |         uxtl            v0.8h,   v3.8b            // first block, in 16 bit | 
 |         mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4) | 
 |         mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0) | 
 |         mla             v4.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5) | 
 |         mla             v4.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6) | 
 |  | 
 |         mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1) | 
 |         mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2) | 
 |         mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3) | 
 |         sqrshrun        v4.8b,   v4.8h,   #4 | 
 |         uxtl            v0.8h,   v4.8b            // second block, in 16 bit | 
 |         mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4) | 
 |         mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0) | 
 |         mla             v5.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5) | 
 |         mla             v5.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6) | 
 |  | 
 |         mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1) | 
 |         mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2) | 
 |         mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3) | 
 |         sqrshrun        v5.8b,   v5.8h,   #4 | 
 |         uxtl            v0.8h,   v5.8b            // third block, in 16 bit | 
 |         mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4) | 
 |         mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0) | 
 |         mla             v6.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5) | 
 |         mla             v6.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6) | 
 |  | 
 |         subs            w3,  w3,  #16 | 
 |         sqrshrun        v6.8b,   v6.8h,   #4 | 
 |  | 
 |         st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 | 
 |         st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 | 
 |         b.le            8f | 
 |         ins             v0.h[2], v2.h[7] | 
 |         ins             v0.b[0], v6.b[7] | 
 |         ins             v0.b[2], v6.b[3] | 
 |         b               2b | 
 | 8: | 
 |         subs            w4,  w4,  #2 | 
 |         b.le            9f | 
 |         sub             x8,  x6,  w9, uxtw | 
 |         add             x0,  x0,  x1 | 
 |         add             x6,  x6,  x1 | 
 |         mov             w3,  w9 | 
 |         b               1b | 
 | 9: | 
 |         ret | 
 |  | 
 | L(ipred_filter_tbl): | 
 |         .hword L(ipred_filter_tbl) - 320b | 
 |         .hword L(ipred_filter_tbl) - 160b | 
 |         .hword L(ipred_filter_tbl) -  80b | 
 |         .hword L(ipred_filter_tbl) -  40b | 
 | endfunc | 
 |  | 
 | // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                         const uint16_t *const pal, const uint8_t *idx, | 
 | //                         const int w, const int h); | 
 | function pal_pred_8bpc_neon, export=1 | 
 |         ld1             {v0.8h}, [x2] | 
 |         clz             w9,  w4 | 
 |         adr             x6,  L(pal_pred_tbl) | 
 |         sub             w9,  w9,  #25 | 
 |         ldrh            w9,  [x6, w9, uxtw #1] | 
 |         xtn             v0.8b,  v0.8h | 
 |         sub             x6,  x6,  w9, uxtw | 
 |         add             x2,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x6 | 
 | 4: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v1.16b}, [x3], #16 | 
 |         subs            w5,  w5,  #4 | 
 |         tbl             v1.16b, {v0.16b}, v1.16b | 
 |         st1             {v1.s}[0], [x0], x1 | 
 |         st1             {v1.s}[1], [x2], x1 | 
 |         st1             {v1.s}[2], [x0], x1 | 
 |         st1             {v1.s}[3], [x2], x1 | 
 |         b.gt            4b | 
 |         ret | 
 | 8: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v1.16b, v2.16b}, [x3], #32 | 
 |         subs            w5,  w5,  #4 | 
 |         tbl             v1.16b, {v0.16b}, v1.16b | 
 |         st1             {v1.d}[0], [x0], x1 | 
 |         tbl             v2.16b, {v0.16b}, v2.16b | 
 |         st1             {v1.d}[1], [x2], x1 | 
 |         st1             {v2.d}[0], [x0], x1 | 
 |         st1             {v2.d}[1], [x2], x1 | 
 |         b.gt            8b | 
 |         ret | 
 | 16: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64 | 
 |         subs            w5,  w5,  #4 | 
 |         tbl             v1.16b, {v0.16b}, v1.16b | 
 |         tbl             v2.16b, {v0.16b}, v2.16b | 
 |         st1             {v1.16b}, [x0], x1 | 
 |         tbl             v3.16b, {v0.16b}, v3.16b | 
 |         st1             {v2.16b}, [x2], x1 | 
 |         tbl             v4.16b, {v0.16b}, v4.16b | 
 |         st1             {v3.16b}, [x0], x1 | 
 |         st1             {v4.16b}, [x2], x1 | 
 |         b.gt            16b | 
 |         ret | 
 | 32: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 | 
 |         ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 | 
 |         subs            w5,  w5,  #4 | 
 |         tbl             v16.16b, {v0.16b}, v16.16b | 
 |         tbl             v17.16b, {v0.16b}, v17.16b | 
 |         tbl             v18.16b, {v0.16b}, v18.16b | 
 |         tbl             v19.16b, {v0.16b}, v19.16b | 
 |         tbl             v20.16b, {v0.16b}, v20.16b | 
 |         st1             {v16.16b, v17.16b}, [x0], x1 | 
 |         tbl             v21.16b, {v0.16b}, v21.16b | 
 |         st1             {v18.16b, v19.16b}, [x2], x1 | 
 |         tbl             v22.16b, {v0.16b}, v22.16b | 
 |         st1             {v20.16b, v21.16b}, [x0], x1 | 
 |         tbl             v23.16b, {v0.16b}, v23.16b | 
 |         st1             {v22.16b, v23.16b}, [x2], x1 | 
 |         b.gt            32b | 
 |         ret | 
 | 64: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 | 
 |         ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 | 
 |         subs            w5,  w5,  #2 | 
 |         tbl             v16.16b, {v0.16b}, v16.16b | 
 |         tbl             v17.16b, {v0.16b}, v17.16b | 
 |         tbl             v18.16b, {v0.16b}, v18.16b | 
 |         tbl             v19.16b, {v0.16b}, v19.16b | 
 |         st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 | 
 |         tbl             v20.16b, {v0.16b}, v20.16b | 
 |         tbl             v21.16b, {v0.16b}, v21.16b | 
 |         tbl             v22.16b, {v0.16b}, v22.16b | 
 |         tbl             v23.16b, {v0.16b}, v23.16b | 
 |         st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 | 
 |         b.gt            64b | 
 |         ret | 
 |  | 
 | L(pal_pred_tbl): | 
 |         .hword L(pal_pred_tbl) - 64b | 
 |         .hword L(pal_pred_tbl) - 32b | 
 |         .hword L(pal_pred_tbl) - 16b | 
 |         .hword L(pal_pred_tbl) -  8b | 
 |         .hword L(pal_pred_tbl) -  4b | 
 | endfunc | 
 |  | 
 | // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                              const pixel *const topleft, | 
 | //                              const int width, const int height, | 
 | //                              const int16_t *ac, const int alpha); | 
 | function ipred_cfl_128_8bpc_neon, export=1 | 
 |         clz             w9,  w3 | 
 |         adr             x7,  L(ipred_cfl_128_tbl) | 
 |         sub             w9,  w9,  #26 | 
 |         ldrh            w9,  [x7, w9, uxtw #1] | 
 |         movi            v0.8h,   #128 // dc | 
 |         dup             v1.8h,   w6   // alpha | 
 |         sub             x7,  x7,  w9, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x7 | 
 | L(ipred_cfl_splat_w4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v2.8h, v3.8h}, [x5], #32 | 
 |         mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha | 
 |         mul             v3.8h,   v3.8h,   v1.8h | 
 |         cmlt            v4.8h,   v2.8h,   #0     // sign | 
 |         cmlt            v5.8h,   v3.8h,   #0 | 
 |         add             v2.8h,   v2.8h,   v4.8h  // diff + sign | 
 |         add             v3.8h,   v3.8h,   v5.8h | 
 |         srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign() | 
 |         srshr           v3.8h,   v3.8h,   #6 | 
 |         add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign() | 
 |         add             v3.8h,   v3.8h,   v0.8h | 
 |         sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign()) | 
 |         sqxtun          v3.8b,   v3.8h | 
 |         st1             {v2.s}[0],  [x0], x1 | 
 |         st1             {v2.s}[1],  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v3.s}[0],  [x0], x1 | 
 |         st1             {v3.s}[1],  [x6], x1 | 
 |         b.gt            L(ipred_cfl_splat_w4) | 
 |         ret | 
 | L(ipred_cfl_splat_w8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 | 
 |         mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha | 
 |         mul             v3.8h,   v3.8h,   v1.8h | 
 |         mul             v4.8h,   v4.8h,   v1.8h | 
 |         mul             v5.8h,   v5.8h,   v1.8h | 
 |         cmlt            v16.8h,  v2.8h,   #0     // sign | 
 |         cmlt            v17.8h,  v3.8h,   #0 | 
 |         cmlt            v18.8h,  v4.8h,   #0 | 
 |         cmlt            v19.8h,  v5.8h,   #0 | 
 |         add             v2.8h,   v2.8h,   v16.8h // diff + sign | 
 |         add             v3.8h,   v3.8h,   v17.8h | 
 |         add             v4.8h,   v4.8h,   v18.8h | 
 |         add             v5.8h,   v5.8h,   v19.8h | 
 |         srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign() | 
 |         srshr           v3.8h,   v3.8h,   #6 | 
 |         srshr           v4.8h,   v4.8h,   #6 | 
 |         srshr           v5.8h,   v5.8h,   #6 | 
 |         add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign() | 
 |         add             v3.8h,   v3.8h,   v0.8h | 
 |         add             v4.8h,   v4.8h,   v0.8h | 
 |         add             v5.8h,   v5.8h,   v0.8h | 
 |         sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign()) | 
 |         sqxtun          v3.8b,   v3.8h | 
 |         sqxtun          v4.8b,   v4.8h | 
 |         sqxtun          v5.8b,   v5.8h | 
 |         st1             {v2.8b},  [x0], x1 | 
 |         st1             {v3.8b},  [x6], x1 | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v4.8b},  [x0], x1 | 
 |         st1             {v5.8b},  [x6], x1 | 
 |         b.gt            L(ipred_cfl_splat_w8) | 
 |         ret | 
 | L(ipred_cfl_splat_w16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         add             x7,  x5,  w3, uxtw #1 | 
 |         sub             x1,  x1,  w3, uxtw | 
 |         mov             w9,  w3 | 
 | 1: | 
 |         ld1             {v2.8h, v3.8h}, [x5], #32 | 
 |         ld1             {v4.8h, v5.8h}, [x7], #32 | 
 |         mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha | 
 |         mul             v3.8h,   v3.8h,   v1.8h | 
 |         mul             v4.8h,   v4.8h,   v1.8h | 
 |         mul             v5.8h,   v5.8h,   v1.8h | 
 |         cmlt            v16.8h,  v2.8h,   #0     // sign | 
 |         cmlt            v17.8h,  v3.8h,   #0 | 
 |         cmlt            v18.8h,  v4.8h,   #0 | 
 |         cmlt            v19.8h,  v5.8h,   #0 | 
 |         add             v2.8h,   v2.8h,   v16.8h // diff + sign | 
 |         add             v3.8h,   v3.8h,   v17.8h | 
 |         add             v4.8h,   v4.8h,   v18.8h | 
 |         add             v5.8h,   v5.8h,   v19.8h | 
 |         srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign() | 
 |         srshr           v3.8h,   v3.8h,   #6 | 
 |         srshr           v4.8h,   v4.8h,   #6 | 
 |         srshr           v5.8h,   v5.8h,   #6 | 
 |         add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign() | 
 |         add             v3.8h,   v3.8h,   v0.8h | 
 |         add             v4.8h,   v4.8h,   v0.8h | 
 |         add             v5.8h,   v5.8h,   v0.8h | 
 |         sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign()) | 
 |         sqxtun          v3.8b,   v3.8h | 
 |         sqxtun          v4.8b,   v4.8h | 
 |         sqxtun          v5.8b,   v5.8h | 
 |         subs            w3,  w3,  #16 | 
 |         st1             {v2.8b, v3.8b},  [x0], #16 | 
 |         st1             {v4.8b, v5.8b},  [x6], #16 | 
 |         b.gt            1b | 
 |         subs            w4,  w4,  #2 | 
 |         add             x5,  x5,  w9, uxtw #1 | 
 |         add             x7,  x7,  w9, uxtw #1 | 
 |         add             x0,  x0,  x1 | 
 |         add             x6,  x6,  x1 | 
 |         mov             w3,  w9 | 
 |         b.gt            1b | 
 |         ret | 
 |  | 
 | L(ipred_cfl_128_tbl): | 
 | L(ipred_cfl_splat_tbl): | 
 |         .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) | 
 |         .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) | 
 |         .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) | 
 |         .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) | 
 | endfunc | 
 |  | 
 | // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                              const pixel *const topleft, | 
 | //                              const int width, const int height, | 
 | //                              const int16_t *ac, const int alpha); | 
 | function ipred_cfl_top_8bpc_neon, export=1 | 
 |         clz             w9,  w3 | 
 |         adr             x7,  L(ipred_cfl_top_tbl) | 
 |         sub             w9,  w9,  #26 | 
 |         ldrh            w9,  [x7, w9, uxtw #1] | 
 |         dup             v1.8h,   w6   // alpha | 
 |         add             x2,  x2,  #1 | 
 |         sub             x7,  x7,  w9, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x7 | 
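        // dc = rounded average of the top pixels. For w == 4 the four
        // pixels are replicated with ld1r so that an 8 byte uaddlv plus
        // urshr #3 (dividing the doubled sum by 8) gives the same result
        // as (sum + 2) >> 2.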
 | 4: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1r            {v0.2s},  [x2] | 
 |         uaddlv          h0,      v0.8b | 
 |         urshr           v0.4h,   v0.4h,   #3 | 
 |         dup             v0.8h,   v0.h[0] | 
 |         b               L(ipred_cfl_splat_w4) | 
 | 8: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.8b},  [x2] | 
 |         uaddlv          h0,      v0.8b | 
 |         urshr           v0.4h,   v0.4h,   #3 | 
 |         dup             v0.8h,   v0.h[0] | 
 |         b               L(ipred_cfl_splat_w8) | 
 | 16: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b}, [x2] | 
 |         uaddlv          h0,      v0.16b | 
 |         urshr           v0.4h,   v0.4h,   #4 | 
 |         dup             v0.8h,   v0.h[0] | 
 |         b               L(ipred_cfl_splat_w16) | 
 | 32: | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v2.16b, v3.16b}, [x2] | 
 |         uaddlv          h2,      v2.16b | 
 |         uaddlv          h3,      v3.16b | 
 |         add             v2.4h,   v2.4h,   v3.4h | 
 |         urshr           v2.4h,   v2.4h,   #5 | 
 |         dup             v0.8h,   v2.h[0] | 
 |         b               L(ipred_cfl_splat_w16) | 
 |  | 
 | L(ipred_cfl_top_tbl): | 
 |         .hword L(ipred_cfl_top_tbl) - 32b | 
 |         .hword L(ipred_cfl_top_tbl) - 16b | 
 |         .hword L(ipred_cfl_top_tbl) -  8b | 
 |         .hword L(ipred_cfl_top_tbl) -  4b | 
 | endfunc | 
 |  | 
 | // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                               const pixel *const topleft, | 
 | //                               const int width, const int height, | 
 | //                               const int16_t *ac, const int alpha); | 
 | function ipred_cfl_left_8bpc_neon, export=1 | 
 |         sub             x2,  x2,  w4, uxtw | 
 |         clz             w9,  w3 | 
 |         clz             w8,  w4 | 
 |         adr             x10, L(ipred_cfl_splat_tbl) | 
 |         adr             x7,  L(ipred_cfl_left_tbl) | 
 |         sub             w9,  w9,  #26 | 
 |         sub             w8,  w8,  #26 | 
 |         ldrh            w9,  [x10, w9, uxtw #1] | 
 |         ldrh            w8,  [x7,  w8, uxtw #1] | 
 |         dup             v1.8h,   w6   // alpha | 
 |         sub             x9,  x10, w9, uxtw | 
 |         sub             x7,  x7,  w8, uxtw | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x7 | 
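        // Two-level dispatch: the height-indexed entries below compute
        // dc from the left pixels, then jump via x9 (indexed by width)
        // into the shared L(ipred_cfl_splat_w*) code above.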
 |  | 
 | L(ipred_cfl_left_h4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1r            {v0.2s},  [x2] | 
 |         uaddlv          h0,      v0.8b | 
 |         urshr           v0.4h,   v0.4h,   #3 | 
 |         dup             v0.8h,   v0.h[0] | 
 |         br              x9 | 
 |  | 
 | L(ipred_cfl_left_h8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.8b},  [x2] | 
 |         uaddlv          h0,      v0.8b | 
 |         urshr           v0.4h,   v0.4h,   #3 | 
 |         dup             v0.8h,   v0.h[0] | 
 |         br              x9 | 
 |  | 
 | L(ipred_cfl_left_h16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b}, [x2] | 
 |         uaddlv          h0,      v0.16b | 
 |         urshr           v0.4h,   v0.4h,   #4 | 
 |         dup             v0.8h,   v0.h[0] | 
 |         br              x9 | 
 |  | 
 | L(ipred_cfl_left_h32): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v2.16b, v3.16b}, [x2] | 
 |         uaddlv          h2,      v2.16b | 
 |         uaddlv          h3,      v3.16b | 
 |         add             v2.4h,   v2.4h,   v3.4h | 
 |         urshr           v2.4h,   v2.4h,   #5 | 
 |         dup             v0.8h,   v2.h[0] | 
 |         br              x9 | 
 |  | 
 | L(ipred_cfl_left_tbl): | 
 |         .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) | 
 |         .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) | 
 |         .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) | 
 |         .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) | 
 | endfunc | 
 |  | 
 | // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, | 
 | //                          const pixel *const topleft, | 
 | //                          const int width, const int height, | 
 | //                          const int16_t *ac, const int alpha); | 
 | function ipred_cfl_8bpc_neon, export=1 | 
 |         sub             x2,  x2,  w4, uxtw | 
 |         add             w8,  w3,  w4             // width + height | 
 |         dup             v1.8h,   w6              // alpha | 
 |         clz             w9,  w3 | 
 |         clz             w6,  w4 | 
 |         dup             v16.8h, w8               // width + height | 
 |         adr             x7,  L(ipred_cfl_tbl) | 
 |         rbit            w8,  w8                  // rbit(width + height) | 
 |         sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4 | 
 |         sub             w6,  w6,  #26 | 
 |         clz             w8,  w8                  // ctz(width + height) | 
 |         ldrh            w9,  [x7, w9, uxtw #1] | 
 |         ldrh            w6,  [x7, w6, uxtw #1] | 
 |         neg             w8,  w8                  // -ctz(width + height) | 
 |         sub             x9,  x7,  w9, uxtw | 
 |         sub             x7,  x7,  w6, uxtw | 
 |         ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1 | 
 |         dup             v17.8h,  w8              // -ctz(width + height) | 
 |         add             x6,  x0,  x1 | 
 |         lsl             x1,  x1,  #1 | 
 |         br              x7 | 
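        // dc = (topsum + leftsum + ((w + h) >> 1)) / (w + h). The power
        // of two part of the division is an ushl by -ctz(w+h) (v17);
        // when w + h is 12, 20, 24, 40 or 48, the remaining division by
        // 3 or 5 is done with a Q15 reciprocal and sqdmulh in the
        // w-indexed handlers below.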
 |  | 
 | L(ipred_cfl_h4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.s}[0],  [x2], #4 | 
 |         ins             v0.s[1], wzr | 
 |         add             x2,  x2,  #1 | 
 |         uaddlv          h0,      v0.8b | 
 |         br              x9 | 
 | L(ipred_cfl_w4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v2.s}[0],  [x2] | 
 |         ins             v2.s[1], wzr | 
 |         add             v0.4h,   v0.4h,   v16.4h | 
 |         uaddlv          h2,      v2.8b | 
 |         cmp             w4,  #4 | 
 |         add             v0.4h,   v0.4h,   v2.4h | 
 |         ushl            v0.4h,   v0.4h,   v17.4h | 
 |         b.eq            1f | 
 |         // h = 8/16 | 
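        // sqdmulh returns (2*a*b) >> 16, so b = 0x5556/2 multiplies by
        // ~1/3 and b = 0x3334/2 by ~1/5. Both constants are packed into
        // one register; lsr by 2*h (16 or 32, the latter a no-op since
        // w-register shifts are taken mod 32) selects the right half.
        // E.g. w4/h8: w + h = 12, the ushl above divided by 4, and the
        // 1/3 factor is picked here.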
 |         mov             w16, #(0x3334/2) | 
 |         movk            w16, #(0x5556/2), lsl #16 | 
 |         add             w17, w4,  w4  // w17 = 2*h = 16 or 32 | 
 |         lsr             w16, w16, w17 | 
 |         dup             v16.4h,  w16 | 
 |         sqdmulh         v0.4h,   v0.4h,   v16.4h | 
 | 1: | 
 |         dup             v0.8h,   v0.h[0] | 
 |         b               L(ipred_cfl_splat_w4) | 
 |  | 
 | L(ipred_cfl_h8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.8b},  [x2], #8 | 
 |         uaddlv          h0,      v0.8b | 
 |         add             x2,  x2,  #1 | 
 |         br              x9 | 
 | L(ipred_cfl_w8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v2.8b},  [x2] | 
 |         add             v0.4h,   v0.4h,   v16.4h | 
 |         uaddlv          h2,      v2.8b | 
 |         cmp             w4,  #8 | 
 |         add             v0.4h,   v0.4h,   v2.4h | 
 |         ushl            v0.4h,   v0.4h,   v17.4h | 
 |         b.eq            1f | 
 |         // h = 4/16/32 | 
 |         cmp             w4,  #32 | 
 |         mov             w16, #(0x3334/2) | 
 |         mov             w17, #(0x5556/2) | 
 |         csel            w16, w16, w17, eq | 
 |         dup             v16.4h,  w16 | 
 |         sqdmulh         v0.4h,   v0.4h,   v16.4h | 
 | 1: | 
 |         dup             v0.8h,   v0.h[0] | 
 |         b               L(ipred_cfl_splat_w8) | 
 |  | 
 | L(ipred_cfl_h16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v0.16b}, [x2], #16 | 
 |         uaddlv          h0,      v0.16b | 
 |         add             x2,  x2,  #1 | 
 |         br              x9 | 
 | L(ipred_cfl_w16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v2.16b}, [x2] | 
 |         add             v0.4h,   v0.4h,   v16.4h | 
 |         uaddlv          h2,      v2.16b | 
 |         cmp             w4,  #16 | 
 |         add             v0.4h,   v0.4h,   v2.4h | 
 |         ushl            v0.4h,   v0.4h,   v17.4h | 
 |         b.eq            1f | 
 |         // h = 4/8/32 | 
 |         cmp             w4,  #4 | 
 |         mov             w16, #(0x3334/2) | 
 |         mov             w17, #(0x5556/2) | 
 |         csel            w16, w16, w17, eq | 
 |         dup             v16.4h,  w16 | 
 |         sqdmulh         v0.4h,   v0.4h,   v16.4h | 
 | 1: | 
 |         dup             v0.8h,   v0.h[0] | 
 |         b               L(ipred_cfl_splat_w16) | 
 |  | 
 | L(ipred_cfl_h32): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v2.16b, v3.16b}, [x2], #32 | 
 |         uaddlv          h2,      v2.16b | 
 |         uaddlv          h3,      v3.16b | 
 |         add             x2,  x2,  #1 | 
 |         add             v0.4h,   v2.4h,   v3.4h | 
 |         br              x9 | 
 | L(ipred_cfl_w32): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         ld1             {v2.16b, v3.16b}, [x2] | 
 |         add             v0.4h,   v0.4h,   v16.4h | 
 |         uaddlv          h2,      v2.16b | 
 |         uaddlv          h3,      v3.16b | 
 |         cmp             w4,  #32 | 
 |         add             v0.4h,   v0.4h,   v2.4h | 
 |         add             v0.4h,   v0.4h,   v3.4h | 
 |         ushl            v0.4h,   v0.4h,   v17.4h | 
 |         b.eq            1f | 
 |         // h = 8/16 | 
 |         mov             w16, #(0x5556/2) | 
 |         movk            w16, #(0x3334/2), lsl #16 | 
 |         add             w17, w4,  w4  // w17 = 2*h = 16 or 32 | 
 |         lsr             w16, w16, w17 | 
 |         dup             v16.4h,  w16 | 
 |         sqdmulh         v0.4h,   v0.4h,   v16.4h | 
 | 1: | 
 |         dup             v0.8h,   v0.h[0] | 
 |         b               L(ipred_cfl_splat_w16) | 
 |  | 
 | L(ipred_cfl_tbl): | 
 |         .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) | 
 |         .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) | 
 |         .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) | 
 |         .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) | 
 |         .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) | 
 |         .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) | 
 |         .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) | 
 |         .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) | 
 | endfunc | 
 |  | 
 | // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, | 
 | //                           const ptrdiff_t stride, const int w_pad, | 
 | //                           const int h_pad, const int cw, const int ch); | 
 | function ipred_cfl_ac_420_8bpc_neon, export=1 | 
 |         clz             w8,  w5 | 
 |         lsl             w4,  w4,  #2 | 
 |         adr             x7,  L(ipred_cfl_ac_420_tbl) | 
 |         sub             w8,  w8,  #27 | 
 |         ldrh            w8,  [x7, w8, uxtw #1] | 
 |         movi            v16.8h,  #0 | 
 |         movi            v17.8h,  #0 | 
 |         movi            v18.8h,  #0 | 
 |         movi            v19.8h,  #0 | 
 |         sub             x7,  x7,  w8, uxtw | 
 |         sub             w8,  w6,  w4         // height - h_pad | 
 |         rbit            w9,  w5              // rbit(width) | 
 |         rbit            w10, w6              // rbit(height) | 
 |         clz             w9,  w9              // ctz(width) | 
 |         clz             w10, w10             // ctz(height) | 
 |         add             w9,  w9,  w10        // log2sz | 
 |         add             x10, x1,  x2 | 
 |         dup             v31.4s,  w9 | 
 |         lsl             x2,  x2,  #1 | 
 |         neg             v31.4s,  v31.4s      // -log2sz | 
 |         br              x7 | 
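        // The 420 path 2x2-box-sums the luma and shifts left by 1, so
        // every chroma layout stores ac at 8x the pixel scale (422 uses
        // pair sums << 2, 444 uses pixels << 3). v16-v19 accumulate
        // running column sums for the dc, which is computed at the end
        // as a rounding shift by log2(w*h) (urshl by v31 = -log2sz) and
        // subtracted from every ac entry.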
 |  | 
 | L(ipred_cfl_ac_420_w4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input | 
 |         ld1             {v0.8b},   [x1],  x2 | 
 |         ld1             {v1.8b},   [x10], x2 | 
 |         ld1             {v0.d}[1], [x1],  x2 | 
 |         ld1             {v1.d}[1], [x10], x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         uaddlp          v1.8h,   v1.16b | 
 |         add             v0.8h,   v0.8h,   v1.8h | 
 |         shl             v0.8h,   v0.8h,   #1 | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h}, [x0], #16 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         b.gt            1b | 
 |         trn2            v1.2d,   v0.2d,   v0.2d | 
 |         trn2            v0.2d,   v0.2d,   v0.2d | 
 | L(ipred_cfl_ac_420_w4_hpad): | 
 |         cbz             w4,  3f | 
 | 2:      // Vertical padding (h_pad > 0) | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.8h, v1.8h}, [x0], #32 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         b.gt            2b | 
 | 3: | 
 |         // Aggregate the sums | 
 |         add             v0.8h,   v16.8h,  v17.8h | 
 |         uaddlv          s0,  v0.8h                // sum | 
 |         sub             x0,  x0,  w6, uxtw #3 | 
 |         urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz | 
 |         dup             v4.8h,   v4.h[0] | 
 | 6:      // Subtract dc from ac | 
 |         ld1             {v0.8h, v1.8h}, [x0] | 
 |         subs            w6,  w6,  #4 | 
 |         sub             v0.8h,   v0.8h,   v4.8h | 
 |         sub             v1.8h,   v1.8h,   v4.8h | 
 |         st1             {v0.8h, v1.8h}, [x0], #32 | 
 |         b.gt            6b | 
 |         ret | 
 |  | 
 | L(ipred_cfl_ac_420_w8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad) | 
 | 1:      // Copy and subsample input, without padding | 
 |         ld1             {v0.16b}, [x1],  x2 | 
 |         ld1             {v1.16b}, [x10], x2 | 
 |         ld1             {v2.16b}, [x1],  x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         ld1             {v3.16b}, [x10], x2 | 
 |         uaddlp          v1.8h,   v1.16b | 
 |         uaddlp          v2.8h,   v2.16b | 
 |         uaddlp          v3.8h,   v3.16b | 
 |         add             v0.8h,   v0.8h,   v1.8h | 
 |         add             v2.8h,   v2.8h,   v3.8h | 
 |         shl             v0.8h,   v0.8h,   #1 | 
 |         shl             v1.8h,   v2.8h,   #1 | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h}, [x0], #32 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v1.16b | 
 |         b               L(ipred_cfl_ac_420_w8_hpad) | 
 |  | 
 | L(ipred_cfl_ac_420_w8_wpad): | 
 | 1:      // Copy and subsample input, padding 4 | 
 |         ld1             {v0.8b},   [x1],  x2 | 
 |         ld1             {v1.8b},   [x10], x2 | 
 |         ld1             {v0.d}[1], [x1],  x2 | 
 |         ld1             {v1.d}[1], [x10], x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         uaddlp          v1.8h,   v1.16b | 
 |         add             v0.8h,   v0.8h,   v1.8h | 
 |         shl             v0.8h,   v0.8h,   #1 | 
 |         dup             v1.4h,   v0.h[3] | 
 |         dup             v3.4h,   v0.h[7] | 
 |         trn2            v2.2d,   v0.2d,   v0.2d | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 | 
 |         add             v16.4h,  v16.4h,  v0.4h | 
 |         add             v17.4h,  v17.4h,  v1.4h | 
 |         add             v18.4h,  v18.4h,  v2.4h | 
 |         add             v19.4h,  v19.4h,  v3.4h | 
 |         b.gt            1b | 
 |         trn1            v0.2d,   v2.2d,   v3.2d | 
 |         trn1            v1.2d,   v2.2d,   v3.2d | 
 |  | 
 | L(ipred_cfl_ac_420_w8_hpad): | 
 |         cbz             w4,  3f | 
 | 2:      // Vertical padding (h_pad > 0) | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.8h, v1.8h}, [x0], #32 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         st1             {v0.8h, v1.8h}, [x0], #32 | 
 |         add             v18.8h,  v18.8h,  v0.8h | 
 |         add             v19.8h,  v19.8h,  v1.8h | 
 |         b.gt            2b | 
 | 3: | 
 |  | 
 | L(ipred_cfl_ac_420_w8_calc_subtract_dc): | 
 |         // Aggregate the sums | 
 |         add             v0.8h,   v16.8h,  v17.8h | 
 |         add             v2.8h,   v18.8h,  v19.8h | 
 |         uaddlp          v0.4s,   v0.8h | 
 |         uaddlp          v2.4s,   v2.8h | 
 |         add             v0.4s,   v0.4s,   v2.4s | 
 |         addv            s0,  v0.4s                // sum | 
 |         sub             x0,  x0,  w6, uxtw #4 | 
 |         urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz | 
 |         dup             v4.8h,   v4.h[0] | 
 | L(ipred_cfl_ac_420_w8_subtract_dc): | 
 | 6:      // Subtract dc from ac | 
 |         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] | 
 |         subs            w6,  w6,  #4 | 
 |         sub             v0.8h,   v0.8h,   v4.8h | 
 |         sub             v1.8h,   v1.8h,   v4.8h | 
 |         sub             v2.8h,   v2.8h,   v4.8h | 
 |         sub             v3.8h,   v3.8h,   v4.8h | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         b.gt            6b | 
 |         ret | 
 |  | 
 | L(ipred_cfl_ac_420_w16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         adr             x7,  L(ipred_cfl_ac_420_w16_tbl) | 
 |         ldrh            w3,  [x7, w3, uxtw #1] | 
 |         sub             x7,  x7,  w3, uxtw | 
 |         br              x7 | 
 |  | 
 | L(ipred_cfl_ac_420_w16_wpad0): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input, without padding | 
 |         ld1             {v0.16b, v1.16b}, [x1],  x2 | 
 |         ld1             {v2.16b, v3.16b}, [x10], x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         ld1             {v4.16b, v5.16b}, [x1],  x2 | 
 |         uaddlp          v1.8h,   v1.16b | 
 |         ld1             {v6.16b, v7.16b}, [x10], x2 | 
 |         uaddlp          v2.8h,   v2.16b | 
 |         uaddlp          v3.8h,   v3.16b | 
 |         uaddlp          v4.8h,   v4.16b | 
 |         uaddlp          v5.8h,   v5.16b | 
 |         uaddlp          v6.8h,   v6.16b | 
 |         uaddlp          v7.8h,   v7.16b | 
 |         add             v0.8h,   v0.8h,   v2.8h | 
 |         add             v1.8h,   v1.8h,   v3.8h | 
 |         add             v4.8h,   v4.8h,   v6.8h | 
 |         add             v5.8h,   v5.8h,   v7.8h | 
 |         shl             v0.8h,   v0.8h,   #1 | 
 |         shl             v1.8h,   v1.8h,   #1 | 
 |         shl             v2.8h,   v4.8h,   #1 | 
 |         shl             v3.8h,   v5.8h,   #1 | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v2.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w16_hpad) | 
 |  | 
 | L(ipred_cfl_ac_420_w16_wpad1): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input, padding 4 | 
 |         ldr             d1,  [x1,  #16] | 
 |         ld1             {v0.16b}, [x1],  x2 | 
 |         ldr             d3,  [x10, #16] | 
 |         ld1             {v2.16b}, [x10], x2 | 
 |         uaddlp          v1.4h,   v1.8b | 
 |         ldr             d5,  [x1,  #16] | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         ld1             {v4.16b}, [x1],  x2 | 
 |         uaddlp          v3.4h,   v3.8b | 
 |         ldr             d7,  [x10, #16] | 
 |         uaddlp          v2.8h,   v2.16b | 
 |         ld1             {v6.16b}, [x10], x2 | 
 |         uaddlp          v5.4h,   v5.8b | 
 |         uaddlp          v4.8h,   v4.16b | 
 |         uaddlp          v7.4h,   v7.8b | 
 |         uaddlp          v6.8h,   v6.16b | 
 |         add             v1.4h,   v1.4h,   v3.4h | 
 |         add             v0.8h,   v0.8h,   v2.8h | 
 |         add             v5.4h,   v5.4h,   v7.4h | 
 |         add             v4.8h,   v4.8h,   v6.8h | 
 |         shl             v1.4h,   v1.4h,   #1 | 
 |         shl             v0.8h,   v0.8h,   #1 | 
 |         shl             v3.4h,   v5.4h,   #1 | 
 |         shl             v2.8h,   v4.8h,   #1 | 
 |         dup             v4.4h,   v1.h[3] | 
 |         dup             v5.4h,   v3.h[3] | 
 |         trn1            v1.2d,   v1.2d,   v4.2d | 
 |         trn1            v3.2d,   v3.2d,   v5.2d | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v2.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w16_hpad) | 
 |  | 
 | L(ipred_cfl_ac_420_w16_wpad2): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input, padding 8 | 
 |         ld1             {v0.16b}, [x1],  x2 | 
 |         ld1             {v2.16b}, [x10], x2 | 
 |         ld1             {v4.16b}, [x1],  x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         ld1             {v6.16b}, [x10], x2 | 
 |         uaddlp          v2.8h,   v2.16b | 
 |         uaddlp          v4.8h,   v4.16b | 
 |         uaddlp          v6.8h,   v6.16b | 
 |         add             v0.8h,   v0.8h,   v2.8h | 
 |         add             v4.8h,   v4.8h,   v6.8h | 
 |         shl             v0.8h,   v0.8h,   #1 | 
 |         shl             v2.8h,   v4.8h,   #1 | 
 |         dup             v1.8h,   v0.h[7] | 
 |         dup             v3.8h,   v2.h[7] | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v2.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w16_hpad) | 
 |  | 
 | L(ipred_cfl_ac_420_w16_wpad3): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input, padding 12 | 
 |         ld1             {v0.8b}, [x1],  x2 | 
 |         ld1             {v2.8b}, [x10], x2 | 
 |         ld1             {v4.8b}, [x1],  x2 | 
 |         uaddlp          v0.4h,   v0.8b | 
 |         ld1             {v6.8b}, [x10], x2 | 
 |         uaddlp          v2.4h,   v2.8b | 
 |         uaddlp          v4.4h,   v4.8b | 
 |         uaddlp          v6.4h,   v6.8b | 
 |         add             v0.4h,   v0.4h,   v2.4h | 
 |         add             v4.4h,   v4.4h,   v6.4h | 
 |         shl             v0.4h,   v0.4h,   #1 | 
 |         shl             v2.4h,   v4.4h,   #1 | 
 |         dup             v1.8h,   v0.h[3] | 
 |         dup             v3.8h,   v2.h[3] | 
 |         trn1            v0.2d,   v0.2d,   v1.2d | 
 |         trn1            v2.2d,   v2.2d,   v3.2d | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v2.16b | 
 |         mov             v1.16b,  v3.16b | 
 |  | 
 | L(ipred_cfl_ac_420_w16_hpad): | 
 |         cbz             w4,  3f | 
 | 2:      // Vertical padding (h_pad > 0) | 
 |         subs            w4,  w4,  #4 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            2b | 
 | 3: | 
 |  | 
 |         // Double the height and reuse the w8 summing/subtracting | 
 |         lsl             w6,  w6,  #1 | 
 |         b               L(ipred_cfl_ac_420_w8_calc_subtract_dc) | 
 |  | 
 | L(ipred_cfl_ac_420_tbl): | 
 |         .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) | 
 |         .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) | 
 |         .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) | 
 |         .hword 0 | 
 |  | 
 | L(ipred_cfl_ac_420_w16_tbl): | 
 |         .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) | 
 |         .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) | 
 |         .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) | 
 |         .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) | 
 | endfunc | 
 |  | 
 | // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, | 
 | //                           const ptrdiff_t stride, const int w_pad, | 
 | //                           const int h_pad, const int cw, const int ch); | 
 | function ipred_cfl_ac_422_8bpc_neon, export=1 | 
 |         clz             w8,  w5 | 
 |         lsl             w4,  w4,  #2 | 
 |         adr             x7,  L(ipred_cfl_ac_422_tbl) | 
 |         sub             w8,  w8,  #27 | 
 |         ldrh            w8,  [x7, w8, uxtw #1] | 
 |         movi            v16.8h,  #0 | 
 |         movi            v17.8h,  #0 | 
 |         movi            v18.8h,  #0 | 
 |         movi            v19.8h,  #0 | 
 |         sub             x7,  x7,  w8, uxtw | 
 |         sub             w8,  w6,  w4         // height - h_pad | 
 |         rbit            w9,  w5              // rbit(width) | 
 |         rbit            w10, w6              // rbit(height) | 
 |         clz             w9,  w9              // ctz(width) | 
 |         clz             w10, w10             // ctz(height) | 
 |         add             w9,  w9,  w10        // log2sz | 
 |         add             x10, x1,  x2 | 
 |         dup             v31.4s,  w9 | 
 |         lsl             x2,  x2,  #1 | 
 |         neg             v31.4s,  v31.4s      // -log2sz | 
 |         br              x7 | 
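        // 4:2:2: only horizontal pairs are averaged, one input row per
        // output row; sums are shifted left by 2 to match the 420 ac
        // scale, and the shared 420 hpad/dc-subtraction tails are
        // reused.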
 |  | 
 | L(ipred_cfl_ac_422_w4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input | 
 |         ld1             {v0.8b},   [x1],  x2 | 
 |         ld1             {v0.d}[1], [x10], x2 | 
 |         ld1             {v1.8b},   [x1],  x2 | 
 |         ld1             {v1.d}[1], [x10], x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         uaddlp          v1.8h,   v1.16b | 
 |         shl             v0.8h,   v0.8h,   #2 | 
 |         shl             v1.8h,   v1.8h,   #2 | 
 |         subs            w8,  w8,  #4 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         st1             {v0.8h, v1.8h}, [x0], #32 | 
 |         b.gt            1b | 
 |         trn2            v0.2d,   v1.2d,   v1.2d | 
 |         trn2            v1.2d,   v1.2d,   v1.2d | 
 |         b               L(ipred_cfl_ac_420_w4_hpad) | 
 |  | 
 | L(ipred_cfl_ac_422_w8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad) | 
 | 1:      // Copy and subsample input, without padding | 
 |         ld1             {v0.16b}, [x1],  x2 | 
 |         ld1             {v1.16b}, [x10], x2 | 
 |         ld1             {v2.16b}, [x1],  x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         ld1             {v3.16b}, [x10], x2 | 
 |         uaddlp          v1.8h,   v1.16b | 
 |         uaddlp          v2.8h,   v2.16b | 
 |         uaddlp          v3.8h,   v3.16b | 
 |         shl             v0.8h,   v0.8h,   #2 | 
 |         shl             v1.8h,   v1.8h,   #2 | 
 |         shl             v2.8h,   v2.8h,   #2 | 
 |         shl             v3.8h,   v3.8h,   #2 | 
 |         subs            w8,  w8,  #4 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v3.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w8_hpad) | 
 |  | 
 | L(ipred_cfl_ac_422_w8_wpad): | 
 | 1:      // Copy and subsample input, padding 4 | 
 |         ld1             {v0.8b},   [x1],  x2 | 
 |         ld1             {v0.d}[1], [x10], x2 | 
 |         ld1             {v2.8b},   [x1],  x2 | 
 |         ld1             {v2.d}[1], [x10], x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         uaddlp          v2.8h,   v2.16b | 
 |         shl             v0.8h,   v0.8h,   #2 | 
 |         shl             v2.8h,   v2.8h,   #2 | 
 |         dup             v4.4h,   v0.h[3] | 
 |         dup             v5.8h,   v0.h[7] | 
 |         dup             v6.4h,   v2.h[3] | 
 |         dup             v7.8h,   v2.h[7] | 
 |         trn2            v1.2d,   v0.2d,   v5.2d | 
 |         trn1            v0.2d,   v0.2d,   v4.2d | 
 |         trn2            v3.2d,   v2.2d,   v7.2d | 
 |         trn1            v2.2d,   v2.2d,   v6.2d | 
 |         subs            w8,  w8,  #4 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v3.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w8_hpad) | 
 |  | 
 | L(ipred_cfl_ac_422_w16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         adr             x7,  L(ipred_cfl_ac_422_w16_tbl) | 
 |         ldrh            w3,  [x7, w3, uxtw #1] | 
 |         sub             x7,  x7,  w3, uxtw | 
 |         br              x7 | 
 |  | 
 | L(ipred_cfl_ac_422_w16_wpad0): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input, without padding | 
 |         ld1             {v0.16b, v1.16b}, [x1],  x2 | 
 |         ld1             {v2.16b, v3.16b}, [x10], x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         uaddlp          v1.8h,   v1.16b | 
 |         uaddlp          v2.8h,   v2.16b | 
 |         uaddlp          v3.8h,   v3.16b | 
 |         shl             v0.8h,   v0.8h,   #2 | 
 |         shl             v1.8h,   v1.8h,   #2 | 
 |         shl             v2.8h,   v2.8h,   #2 | 
 |         shl             v3.8h,   v3.8h,   #2 | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v2.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w16_hpad) | 
 |  | 
 | L(ipred_cfl_ac_422_w16_wpad1): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input, padding 4 | 
 |         ldr             d1,  [x1,  #16] | 
 |         ld1             {v0.16b}, [x1],  x2 | 
 |         ldr             d3,  [x10, #16] | 
 |         ld1             {v2.16b}, [x10], x2 | 
 |         uaddlp          v1.4h,   v1.8b | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         uaddlp          v3.4h,   v3.8b | 
 |         uaddlp          v2.8h,   v2.16b | 
 |         shl             v1.4h,   v1.4h,   #2 | 
 |         shl             v0.8h,   v0.8h,   #2 | 
 |         shl             v3.4h,   v3.4h,   #2 | 
 |         shl             v2.8h,   v2.8h,   #2 | 
 |         dup             v4.4h,   v1.h[3] | 
 |         dup             v5.4h,   v3.h[3] | 
 |         trn1            v1.2d,   v1.2d,   v4.2d | 
 |         trn1            v3.2d,   v3.2d,   v5.2d | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v2.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w16_hpad) | 
 |  | 
 | L(ipred_cfl_ac_422_w16_wpad2): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input, padding 8 | 
 |         ld1             {v0.16b}, [x1],  x2 | 
 |         ld1             {v2.16b}, [x10], x2 | 
 |         uaddlp          v0.8h,   v0.16b | 
 |         uaddlp          v2.8h,   v2.16b | 
 |         shl             v0.8h,   v0.8h,   #2 | 
 |         shl             v2.8h,   v2.8h,   #2 | 
 |         dup             v1.8h,   v0.h[7] | 
 |         dup             v3.8h,   v2.h[7] | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v2.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w16_hpad) | 
 |  | 
 | L(ipred_cfl_ac_422_w16_wpad3): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and subsample input, padding 12 | 
 |         ld1             {v0.8b}, [x1],  x2 | 
 |         ld1             {v2.8b}, [x10], x2 | 
 |         uaddlp          v0.4h,   v0.8b | 
 |         uaddlp          v2.4h,   v2.8b | 
 |         shl             v0.4h,   v0.4h,   #2 | 
 |         shl             v2.4h,   v2.4h,   #2 | 
 |         dup             v1.8h,   v0.h[3] | 
 |         dup             v3.8h,   v2.h[3] | 
 |         trn1            v0.2d,   v0.2d,   v1.2d | 
 |         trn1            v2.2d,   v2.2d,   v3.2d | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v2.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w16_hpad) | 
 |  | 
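| // The jump tables hold 16-bit backwards offsets from the table label to | 
| // each handler; the dispatch code subtracts the loaded entry from the | 
| // table's address, so only PC-relative arithmetic is needed. The trailing | 
| // .hword 0 fills the unused fourth slot of the 422 table. | 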
 | L(ipred_cfl_ac_422_tbl): | 
 |         .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) | 
 |         .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) | 
 |         .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) | 
 |         .hword 0 | 
 |  | 
 | L(ipred_cfl_ac_422_w16_tbl): | 
 |         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) | 
 |         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) | 
 |         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) | 
 |         .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) | 
 | endfunc | 
 |  | 
| // void ipred_cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, | 
 | //                           const ptrdiff_t stride, const int w_pad, | 
 | //                           const int h_pad, const int cw, const int ch); | 
 | function ipred_cfl_ac_444_8bpc_neon, export=1 | 
 |         clz             w8,  w5 | 
 |         lsl             w4,  w4,  #2 | 
 |         adr             x7,  L(ipred_cfl_ac_444_tbl) | 
 |         sub             w8,  w8,  #26 | 
 |         ldrh            w8,  [x7, w8, uxtw #1] | 
 |         movi            v16.8h,  #0 | 
 |         movi            v17.8h,  #0 | 
 |         movi            v18.8h,  #0 | 
 |         movi            v19.8h,  #0 | 
 |         sub             x7,  x7,  w8, uxtw | 
|         sub             w8,  w6,  w4         // height - h_pad (w4 = padded rows) | 
 |         rbit            w9,  w5              // rbit(width) | 
 |         rbit            w10, w6              // rbit(height) | 
 |         clz             w9,  w9              // ctz(width) | 
 |         clz             w10, w10             // ctz(height) | 
 |         add             w9,  w9,  w10        // log2sz | 
 |         add             x10, x1,  x2 | 
 |         dup             v31.4s,  w9 | 
 |         lsl             x2,  x2,  #1 | 
 |         neg             v31.4s,  v31.4s      // -log2sz | 
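|         // rbit+clz computes ctz (AArch64 has no ctz instruction); width | 
|         // and height are powers of two, so ctz(w) + ctz(h) == log2(w*h). | 
|         // v31 = -log2sz: urshl with a negative per-lane shift performs a | 
|         // rounding right shift, used for the final DC average. | 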
 |         br              x7 | 
 |  | 
 | L(ipred_cfl_ac_444_w4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and expand input | 
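|         // 4:4:4 needs no subsampling: ushll widens u8 to u16 and scales | 
|         // by 8 (<< 3), matching the subsampled layouts. Each q register | 
|         // holds two 4-pixel rows. | 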
 |         ld1             {v0.s}[0], [x1],  x2 | 
 |         ld1             {v0.s}[1], [x10], x2 | 
 |         ld1             {v1.s}[0], [x1],  x2 | 
 |         ld1             {v1.s}[1], [x10], x2 | 
 |         ushll           v0.8h,   v0.8b,   #3 | 
 |         ushll           v1.8h,   v1.8b,   #3 | 
 |         subs            w8,  w8,  #4 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         st1             {v0.8h, v1.8h}, [x0], #32 | 
 |         b.gt            1b | 
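|         // trn2 on .2d lanes copies the high 64-bit half of v1 (the last | 
|         // 4-pixel row) into both halves of v0 and v1 for the hpad code. | 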
 |         trn2            v0.2d,   v1.2d,   v1.2d | 
 |         trn2            v1.2d,   v1.2d,   v1.2d | 
 |         b               L(ipred_cfl_ac_420_w4_hpad) | 
 |  | 
 | L(ipred_cfl_ac_444_w8): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and expand input | 
 |         ld1             {v0.8b}, [x1],  x2 | 
 |         ld1             {v1.8b}, [x10], x2 | 
 |         ld1             {v2.8b}, [x1],  x2 | 
 |         ushll           v0.8h,   v0.8b,   #3 | 
 |         ld1             {v3.8b}, [x10], x2 | 
 |         ushll           v1.8h,   v1.8b,   #3 | 
 |         ushll           v2.8h,   v2.8b,   #3 | 
 |         ushll           v3.8h,   v3.8b,   #3 | 
 |         subs            w8,  w8,  #4 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v3.16b | 
 |         mov             v1.16b,  v3.16b | 
 |         b               L(ipred_cfl_ac_420_w8_hpad) | 
 |  | 
 | L(ipred_cfl_ac_444_w16): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad) | 
 | 1:      // Copy and expand input, without padding | 
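|         // ushll/ushll2 expand the low/high 8 bytes of each 16-pixel row. | 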
 |         ld1             {v0.16b}, [x1],  x2 | 
 |         ld1             {v2.16b}, [x10], x2 | 
 |         ld1             {v4.16b}, [x1],  x2 | 
 |         ushll2          v1.8h,   v0.16b,  #3 | 
 |         ushll           v0.8h,   v0.8b,   #3 | 
 |         ld1             {v6.16b}, [x10], x2 | 
 |         ushll2          v3.8h,   v2.16b,  #3 | 
 |         ushll           v2.8h,   v2.8b,   #3 | 
 |         ushll2          v5.8h,   v4.16b,  #3 | 
 |         ushll           v4.8h,   v4.8b,   #3 | 
 |         ushll2          v7.8h,   v6.16b,  #3 | 
 |         ushll           v6.8h,   v6.8b,   #3 | 
 |         subs            w8,  w8,  #4 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v4.8h | 
 |         add             v17.8h,  v17.8h,  v5.8h | 
 |         add             v18.8h,  v18.8h,  v6.8h | 
 |         add             v19.8h,  v19.8h,  v7.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v6.16b | 
 |         mov             v1.16b,  v7.16b | 
 |         mov             v2.16b,  v6.16b | 
 |         mov             v3.16b,  v7.16b | 
 |         b               L(ipred_cfl_ac_420_w16_hpad) | 
 |  | 
 | L(ipred_cfl_ac_444_w16_wpad): | 
 | 1:      // Copy and expand input, padding 8 | 
 |         ld1             {v0.8b}, [x1],  x2 | 
 |         ld1             {v2.8b}, [x10], x2 | 
 |         ld1             {v4.8b}, [x1],  x2 | 
 |         ld1             {v6.8b}, [x10], x2 | 
 |         ushll           v0.8h,   v0.8b,   #3 | 
 |         ushll           v2.8h,   v2.8b,   #3 | 
 |         ushll           v4.8h,   v4.8b,   #3 | 
 |         ushll           v6.8h,   v6.8b,   #3 | 
 |         dup             v1.8h,   v0.h[7] | 
 |         dup             v3.8h,   v2.h[7] | 
 |         dup             v5.8h,   v4.h[7] | 
 |         dup             v7.8h,   v6.h[7] | 
 |         subs            w8,  w8,  #4 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v4.8h | 
 |         add             v17.8h,  v17.8h,  v5.8h | 
 |         add             v18.8h,  v18.8h,  v6.8h | 
 |         add             v19.8h,  v19.8h,  v7.8h | 
 |         b.gt            1b | 
 |         mov             v0.16b,  v6.16b | 
 |         mov             v1.16b,  v7.16b | 
 |         mov             v2.16b,  v6.16b | 
 |         mov             v3.16b,  v7.16b | 
 |         b               L(ipred_cfl_ac_420_w16_hpad) | 
 |  | 
 | L(ipred_cfl_ac_444_w32): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 |         adr             x7,  L(ipred_cfl_ac_444_w32_tbl) | 
|         // w_pad is even here, so w3 already equals the byte offset of the | 
|         // 2-byte table entry: (w3 >> 1) << 1 == w3. | 
|         ldrh            w3,  [x7, w3, uxtw] | 
 |         sub             x7,  x7,  w3, uxtw | 
 |         br              x7 | 
 |  | 
 | L(ipred_cfl_ac_444_w32_wpad0): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and expand input, without padding | 
 |         ld1             {v2.16b, v3.16b}, [x1],  x2 | 
 |         ld1             {v6.16b, v7.16b}, [x10], x2 | 
 |         ushll           v0.8h,   v2.8b,   #3 | 
 |         ushll2          v1.8h,   v2.16b,  #3 | 
 |         ushll           v2.8h,   v3.8b,   #3 | 
 |         ushll2          v3.8h,   v3.16b,  #3 | 
 |         ushll           v4.8h,   v6.8b,   #3 | 
 |         ushll2          v5.8h,   v6.16b,  #3 | 
 |         ushll           v6.8h,   v7.8b,   #3 | 
 |         ushll2          v7.8h,   v7.16b,  #3 | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v4.8h | 
 |         add             v17.8h,  v17.8h,  v5.8h | 
 |         add             v18.8h,  v18.8h,  v6.8h | 
 |         add             v19.8h,  v19.8h,  v7.8h | 
 |         b.gt            1b | 
 |         b               L(ipred_cfl_ac_444_w32_hpad) | 
 |  | 
 | L(ipred_cfl_ac_444_w32_wpad2): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and expand input, padding 8 | 
 |         ldr             d2,  [x1,  #16] | 
 |         ld1             {v1.16b}, [x1],  x2 | 
 |         ldr             d6,  [x10, #16] | 
 |         ld1             {v5.16b}, [x10], x2 | 
 |         ushll           v2.8h,   v2.8b,   #3 | 
 |         ushll           v0.8h,   v1.8b,   #3 | 
 |         ushll2          v1.8h,   v1.16b,  #3 | 
 |         ushll           v6.8h,   v6.8b,   #3 | 
 |         ushll           v4.8h,   v5.8b,   #3 | 
 |         ushll2          v5.8h,   v5.16b,  #3 | 
 |         dup             v3.8h,   v2.h[7] | 
 |         dup             v7.8h,   v6.h[7] | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v4.8h | 
 |         add             v17.8h,  v17.8h,  v5.8h | 
 |         add             v18.8h,  v18.8h,  v6.8h | 
 |         add             v19.8h,  v19.8h,  v7.8h | 
 |         b.gt            1b | 
 |         b               L(ipred_cfl_ac_444_w32_hpad) | 
 |  | 
 | L(ipred_cfl_ac_444_w32_wpad4): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and expand input, padding 16 | 
 |         ld1             {v1.16b}, [x1],  x2 | 
 |         ld1             {v5.16b}, [x10], x2 | 
 |         ushll           v0.8h,   v1.8b,   #3 | 
 |         ushll2          v1.8h,   v1.16b,  #3 | 
 |         ushll           v4.8h,   v5.8b,   #3 | 
 |         ushll2          v5.8h,   v5.16b,  #3 | 
 |         dup             v2.8h,   v1.h[7] | 
 |         dup             v3.8h,   v1.h[7] | 
 |         dup             v6.8h,   v5.h[7] | 
 |         dup             v7.8h,   v5.h[7] | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v4.8h | 
 |         add             v17.8h,  v17.8h,  v5.8h | 
 |         add             v18.8h,  v18.8h,  v6.8h | 
 |         add             v19.8h,  v19.8h,  v7.8h | 
 |         b.gt            1b | 
 |         b               L(ipred_cfl_ac_444_w32_hpad) | 
 |  | 
 | L(ipred_cfl_ac_444_w32_wpad6): | 
 |         AARCH64_VALID_JUMP_TARGET | 
 | 1:      // Copy and expand input, padding 24 | 
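|         // Only 8 valid pixels per row; outputs 8-31 all repeat output 7. | 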
 |         ld1             {v0.8b}, [x1],  x2 | 
 |         ld1             {v4.8b}, [x10], x2 | 
 |         ushll           v0.8h,   v0.8b,   #3 | 
 |         ushll           v4.8h,   v4.8b,   #3 | 
 |         dup             v1.8h,   v0.h[7] | 
 |         dup             v2.8h,   v0.h[7] | 
 |         dup             v3.8h,   v0.h[7] | 
 |         dup             v5.8h,   v4.h[7] | 
 |         dup             v6.8h,   v4.h[7] | 
 |         dup             v7.8h,   v4.h[7] | 
 |         subs            w8,  w8,  #2 | 
 |         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v0.8h | 
 |         add             v17.8h,  v17.8h,  v1.8h | 
 |         add             v18.8h,  v18.8h,  v2.8h | 
 |         add             v19.8h,  v19.8h,  v3.8h | 
 |         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v4.8h | 
 |         add             v17.8h,  v17.8h,  v5.8h | 
 |         add             v18.8h,  v18.8h,  v6.8h | 
 |         add             v19.8h,  v19.8h,  v7.8h | 
 |         b.gt            1b | 
 |  | 
 | L(ipred_cfl_ac_444_w32_hpad): | 
 |         cbz             w4,  3f | 
 | 2:      // Vertical padding (h_pad > 0) | 
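|         // v4-v7 still hold the last real row; store it for each padded | 
|         // row and keep accumulating so the DC average includes padding. | 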
 |         subs            w4,  w4,  #2 | 
 |         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v4.8h | 
 |         add             v17.8h,  v17.8h,  v5.8h | 
 |         add             v18.8h,  v18.8h,  v6.8h | 
 |         add             v19.8h,  v19.8h,  v7.8h | 
 |         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 | 
 |         add             v16.8h,  v16.8h,  v4.8h | 
 |         add             v17.8h,  v17.8h,  v5.8h | 
 |         add             v18.8h,  v18.8h,  v6.8h | 
 |         add             v19.8h,  v19.8h,  v7.8h | 
 |         b.gt            2b | 
 | 3: | 
 |  | 
|         // Quadruple the height and reuse the w8 subtract_dc code; each | 
|         // 32-wide row counts as four 8-wide rows there. | 
 |         lsl             w6,  w6,  #2 | 
|         // Aggregate the sums, widening the intermediate sums to 32 bits | 
|         // earlier than in ipred_cfl_ac_420_w8_calc_subtract_dc. | 
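|         // Worst case per 16-bit lane: 32 rows * (255 << 3) = 65280, which | 
|         // just fits in 16 bits, but adding lanes together could overflow, | 
|         // so uaddlp widens to 32 bits before the cross-lane additions. | 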
 |         uaddlp          v0.4s,   v16.8h | 
 |         uaddlp          v1.4s,   v17.8h | 
 |         uaddlp          v2.4s,   v18.8h | 
 |         uaddlp          v3.4s,   v19.8h | 
 |         add             v0.4s,   v0.4s,   v1.4s | 
 |         add             v2.4s,   v2.4s,   v3.4s | 
 |         add             v0.4s,   v0.4s,   v2.4s | 
 |         addv            s0,  v0.4s                // sum | 
 |         sub             x0,  x0,  w6, uxtw #4 | 
|         urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >> log2sz | 
 |         dup             v4.8h,   v4.h[0] | 
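|         // v4 = the rounded DC in every lane; the shared w8 loop subtracts | 
|         // it from all of the AC values written above. | 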
 |         b               L(ipred_cfl_ac_420_w8_subtract_dc) | 
 |  | 
 | L(ipred_cfl_ac_444_tbl): | 
 |         .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) | 
 |         .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) | 
 |         .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) | 
 |         .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) | 
 |  | 
 | L(ipred_cfl_ac_444_w32_tbl): | 
 |         .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) | 
 |         .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) | 
 |         .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) | 
 |         .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) | 
 | endfunc |