| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| EXPORT |vp8_loop_filter_horizontal_edge_armv6| |
| EXPORT |vp8_mbloop_filter_horizontal_edge_armv6| |
| EXPORT |vp8_loop_filter_vertical_edge_armv6| |
| EXPORT |vp8_mbloop_filter_vertical_edge_armv6| |
| |
| AREA |.text|, CODE, READONLY ; name this block of code |
| |
| MACRO |
| TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 |
| ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 |
| ; a0: 03 02 01 00 |
| ; a1: 13 12 11 10 |
| ; a2: 23 22 21 20 |
| ; a3: 33 32 31 30 |
| ; b3 b2 b1 b0 |
| |
| uxtb16 $b1, $a1 ; xx 12 xx 10 |
| uxtb16 $b0, $a0 ; xx 02 xx 00 |
| uxtb16 $b3, $a3 ; xx 32 xx 30 |
| uxtb16 $b2, $a2 ; xx 22 xx 20 |
| orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 |
| orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 |
| |
| uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 |
| uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 |
| uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 |
| uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 |
| orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 |
| orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 |
| |
| pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 |
| pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 |
| |
| pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 |
| pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 |
| MEND |
| |
| |
| src RN r0 |
| pstep RN r1 |
| count RN r5 |
| |
| ;r0 unsigned char *src_ptr, |
| ;r1 int src_pixel_step, |
| ;r2 const char *blimit, |
| ;r3 const char *limit, |
| ;stack const char *thresh, |
| ;stack int count |
| |
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| |vp8_loop_filter_horizontal_edge_armv6| PROC |
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| stmdb sp!, {r4 - r11, lr} |
| |
| sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines |
| ldr count, [sp, #40] ; count for 8-in-parallel |
| ldr r6, [sp, #36] ; load thresh address |
| sub sp, sp, #16 ; create temp buffer |
| |
| ldr r9, [src], pstep ; p3 |
| ldrb r4, [r2] ; blimit |
| ldr r10, [src], pstep ; p2 |
| ldrb r2, [r3] ; limit |
| ldr r11, [src], pstep ; p1 |
| orr r4, r4, r4, lsl #8 |
| ldrb r3, [r6] ; thresh |
| orr r2, r2, r2, lsl #8 |
| mov count, count, lsl #1 ; 4-in-parallel |
| orr r4, r4, r4, lsl #16 |
| orr r3, r3, r3, lsl #8 |
| orr r2, r2, r2, lsl #16 |
| orr r3, r3, r3, lsl #16 |
| |
| |Hnext8| |
| ; vp8_filter_mask() function |
| ; calculate breakout conditions |
| ldr r12, [src], pstep ; p0 |
| |
| uqsub8 r6, r9, r10 ; p3 - p2 |
| uqsub8 r7, r10, r9 ; p2 - p3 |
| uqsub8 r8, r10, r11 ; p2 - p1 |
| uqsub8 r10, r11, r10 ; p1 - p2 |
| |
| orr r6, r6, r7 ; abs (p3-p2) |
| orr r8, r8, r10 ; abs (p2-p1) |
| uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask |
| uqsub8 r8, r8, r2 ; compare to limit |
| uqsub8 r6, r11, r12 ; p1 - p0 |
| orr lr, lr, r8 |
| uqsub8 r7, r12, r11 ; p0 - p1 |
| ldr r9, [src], pstep ; q0 |
| ldr r10, [src], pstep ; q1 |
| orr r6, r6, r7 ; abs (p1-p0) |
| uqsub8 r7, r6, r2 ; compare to limit |
| uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later |
| orr lr, lr, r7 |
| |
| uqsub8 r6, r11, r10 ; p1 - q1 |
| uqsub8 r7, r10, r11 ; q1 - p1 |
| uqsub8 r11, r12, r9 ; p0 - q0 |
| uqsub8 r12, r9, r12 ; q0 - p0 |
| orr r6, r6, r7 ; abs (p1-q1) |
| ldr r7, c0x7F7F7F7F |
| orr r12, r11, r12 ; abs (p0-q0) |
| ldr r11, [src], pstep ; q2 |
| uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 |
| and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 |
| uqsub8 r7, r9, r10 ; q0 - q1 |
| uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 |
| uqsub8 r6, r10, r9 ; q1 - q0 |
| uqsub8 r12, r12, r4 ; compare to flimit |
| uqsub8 r9, r11, r10 ; q2 - q1 |
| |
| orr lr, lr, r12 |
| |
| ldr r12, [src], pstep ; q3 |
| uqsub8 r10, r10, r11 ; q1 - q2 |
| orr r6, r7, r6 ; abs (q1-q0) |
| orr r10, r9, r10 ; abs (q2-q1) |
| uqsub8 r7, r6, r2 ; compare to limit |
| uqsub8 r10, r10, r2 ; compare to limit |
| uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later |
| orr lr, lr, r7 |
| orr lr, lr, r10 |
| |
| uqsub8 r10, r12, r11 ; q3 - q2 |
| uqsub8 r9, r11, r12 ; q2 - q3 |
| |
| mvn r11, #0 ; r11 == -1 |
| |
| orr r10, r10, r9 ; abs (q3-q2) |
| uqsub8 r10, r10, r2 ; compare to limit |
| |
| mov r12, #0 |
| orr lr, lr, r10 |
| sub src, src, pstep, lsl #2 |
| |
| usub8 lr, r12, lr ; use usub8 instead of ssub8 |
| sel lr, r11, r12 ; filter mask: lr |
| |
| cmp lr, #0 |
| beq hskip_filter ; skip filtering |
| |
| sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines |
| |
| ;vp8_hevmask() function |
| ;calculate high edge variance |
| orr r10, r6, r8 ; calculate vp8_hevmask |
| |
| ldr r7, [src], pstep ; p1 |
| |
| usub8 r10, r12, r10 ; use usub8 instead of ssub8 |
| sel r6, r12, r11 ; obtain vp8_hevmask: r6 |
| |
| ;vp8_filter() function |
| ldr r8, [src], pstep ; p0 |
| ldr r12, c0x80808080 |
| ldr r9, [src], pstep ; q0 |
| ldr r10, [src], pstep ; q1 |
| |
| eor r7, r7, r12 ; p1 offset to convert to a signed value |
| eor r8, r8, r12 ; p0 offset to convert to a signed value |
| eor r9, r9, r12 ; q0 offset to convert to a signed value |
| eor r10, r10, r12 ; q1 offset to convert to a signed value |
| |
| str r9, [sp] ; store qs0 temporarily |
| str r8, [sp, #4] ; store ps0 temporarily |
| str r10, [sp, #8] ; store qs1 temporarily |
| str r7, [sp, #12] ; store ps1 temporarily |
| |
| qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) |
| qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) |
| |
| and r7, r7, r6 ; vp8_filter (r7) &= hev |
| |
| qadd8 r7, r7, r8 |
| ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 |
| |
| qadd8 r7, r7, r8 |
| ldr r10, c0x04040404 |
| |
| qadd8 r7, r7, r8 |
| and r7, r7, lr ; vp8_filter &= mask; |
| |
| ;modify code for vp8 -- Filter1 = vp8_filter (r7) |
| qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) |
| qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) |
| |
| mov r9, #0 |
| shadd8 r8 , r8 , r9 ; Filter2 >>= 3 |
| shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 |
| shadd8 r8 , r8 , r9 |
| shadd8 r7 , r7 , r9 |
| shadd8 lr , r8 , r9 ; lr: Filter2 |
| shadd8 r7 , r7 , r9 ; r7: filter |
| |
| ;usub8 lr, r8, r10 ; s = (s==4)*-1 |
| ;sel lr, r11, r9 |
| ;usub8 r8, r10, r8 |
| ;sel r8, r11, r9 |
| ;and r8, r8, lr ; -1 for each element that equals 4 |
| |
| ;calculate output |
| ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) |
| |
| ldr r8, [sp] ; load qs0 |
| ldr r9, [sp, #4] ; load ps0 |
| |
| ldr r10, c0x01010101 |
| |
| qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) |
| qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) |
| |
| ;end of modification for vp8 |
| |
| mov lr, #0 |
| sadd8 r7, r7 , r10 ; vp8_filter += 1 |
| shadd8 r7, r7, lr ; vp8_filter >>= 1 |
| |
| ldr r11, [sp, #12] ; load ps1 |
| ldr r10, [sp, #8] ; load qs1 |
| |
| bic r7, r7, r6 ; vp8_filter &= ~hev |
| sub src, src, pstep, lsl #2 |
| |
| qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) |
| qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) |
| |
| eor r11, r11, r12 ; *op1 = u^0x80 |
| str r11, [src], pstep ; store op1 |
| eor r9, r9, r12 ; *op0 = u^0x80 |
| str r9, [src], pstep ; store op0 result |
| eor r8, r8, r12 ; *oq0 = u^0x80 |
| str r8, [src], pstep ; store oq0 result |
| eor r10, r10, r12 ; *oq1 = u^0x80 |
| str r10, [src], pstep ; store oq1 |
| |
| sub src, src, pstep, lsl #1 |
| |
| |hskip_filter| |
| add src, src, #4 |
| sub src, src, pstep, lsl #2 |
| |
| subs count, count, #1 |
| |
| ldrne r9, [src], pstep ; p3 |
| ldrne r10, [src], pstep ; p2 |
| ldrne r11, [src], pstep ; p1 |
| |
| bne Hnext8 |
| |
| add sp, sp, #16 |
| ldmia sp!, {r4 - r11, pc} |
| ENDP ; |vp8_loop_filter_horizontal_edge_armv6| |
| |
| |
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| |vp8_mbloop_filter_horizontal_edge_armv6| PROC |
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| stmdb sp!, {r4 - r11, lr} |
| |
| sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines |
| ldr count, [sp, #40] ; count for 8-in-parallel |
| ldr r6, [sp, #36] ; load thresh address |
| sub sp, sp, #16 ; create temp buffer |
| |
| ldr r9, [src], pstep ; p3 |
| ldrb r4, [r2] ; blimit |
| ldr r10, [src], pstep ; p2 |
| ldrb r2, [r3] ; limit |
| ldr r11, [src], pstep ; p1 |
| orr r4, r4, r4, lsl #8 |
| ldrb r3, [r6] ; thresh |
| orr r2, r2, r2, lsl #8 |
| mov count, count, lsl #1 ; 4-in-parallel |
| orr r4, r4, r4, lsl #16 |
| orr r3, r3, r3, lsl #8 |
| orr r2, r2, r2, lsl #16 |
| orr r3, r3, r3, lsl #16 |
| |
| |MBHnext8| |
| |
| ; vp8_filter_mask() function |
| ; calculate breakout conditions |
| ldr r12, [src], pstep ; p0 |
| |
| uqsub8 r6, r9, r10 ; p3 - p2 |
| uqsub8 r7, r10, r9 ; p2 - p3 |
| uqsub8 r8, r10, r11 ; p2 - p1 |
| uqsub8 r10, r11, r10 ; p1 - p2 |
| |
| orr r6, r6, r7 ; abs (p3-p2) |
| orr r8, r8, r10 ; abs (p2-p1) |
| uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask |
| uqsub8 r8, r8, r2 ; compare to limit |
| |
| uqsub8 r6, r11, r12 ; p1 - p0 |
| orr lr, lr, r8 |
| uqsub8 r7, r12, r11 ; p0 - p1 |
| ldr r9, [src], pstep ; q0 |
| ldr r10, [src], pstep ; q1 |
| orr r6, r6, r7 ; abs (p1-p0) |
| uqsub8 r7, r6, r2 ; compare to limit |
| uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later |
| orr lr, lr, r7 |
| |
| uqsub8 r6, r11, r10 ; p1 - q1 |
| uqsub8 r7, r10, r11 ; q1 - p1 |
| uqsub8 r11, r12, r9 ; p0 - q0 |
| uqsub8 r12, r9, r12 ; q0 - p0 |
| orr r6, r6, r7 ; abs (p1-q1) |
| ldr r7, c0x7F7F7F7F |
| orr r12, r11, r12 ; abs (p0-q0) |
| ldr r11, [src], pstep ; q2 |
| uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 |
| and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 |
| uqsub8 r7, r9, r10 ; q0 - q1 |
| uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 |
| uqsub8 r6, r10, r9 ; q1 - q0 |
| uqsub8 r12, r12, r4 ; compare to flimit |
| uqsub8 r9, r11, r10 ; q2 - q1 |
| |
| orr lr, lr, r12 |
| |
| ldr r12, [src], pstep ; q3 |
| |
| uqsub8 r10, r10, r11 ; q1 - q2 |
| orr r6, r7, r6 ; abs (q1-q0) |
| orr r10, r9, r10 ; abs (q2-q1) |
| uqsub8 r7, r6, r2 ; compare to limit |
| uqsub8 r10, r10, r2 ; compare to limit |
| uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later |
| orr lr, lr, r7 |
| orr lr, lr, r10 |
| |
| uqsub8 r10, r12, r11 ; q3 - q2 |
| uqsub8 r9, r11, r12 ; q2 - q3 |
| |
| mvn r11, #0 ; r11 == -1 |
| |
| orr r10, r10, r9 ; abs (q3-q2) |
| uqsub8 r10, r10, r2 ; compare to limit |
| |
| mov r12, #0 |
| |
| orr lr, lr, r10 |
| |
| usub8 lr, r12, lr ; use usub8 instead of ssub8 |
| sel lr, r11, r12 ; filter mask: lr |
| |
| cmp lr, #0 |
| beq mbhskip_filter ; skip filtering |
| |
| ;vp8_hevmask() function |
| ;calculate high edge variance |
| sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines |
| sub src, src, pstep, lsl #1 |
| |
| orr r10, r6, r8 |
| ldr r7, [src], pstep ; p1 |
| |
| usub8 r10, r12, r10 |
| sel r6, r12, r11 ; hev mask: r6 |
| |
| ;vp8_mbfilter() function |
| ;p2, q2 are only needed at the end. Don't need to load them in now. |
| ldr r8, [src], pstep ; p0 |
| ldr r12, c0x80808080 |
| ldr r9, [src], pstep ; q0 |
| ldr r10, [src] ; q1 |
| |
| eor r7, r7, r12 ; ps1 |
| eor r8, r8, r12 ; ps0 |
| eor r9, r9, r12 ; qs0 |
| eor r10, r10, r12 ; qs1 |
| |
| qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) |
| str r7, [sp, #12] ; store ps1 temporarily |
| qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) |
| str r10, [sp, #8] ; store qs1 temporarily |
| qadd8 r7, r7, r12 |
| str r9, [sp] ; store qs0 temporarily |
| qadd8 r7, r7, r12 |
| str r8, [sp, #4] ; store ps0 temporarily |
| qadd8 r7, r7, r12 ; vp8_filter: r7 |
| |
| ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 |
| ldr r9, c0x04040404 |
| |
| and r7, r7, lr ; vp8_filter &= mask (lr is free) |
| |
| mov r12, r7 ; Filter2: r12 |
| and r12, r12, r6 ; Filter2 &= hev |
| |
| ;modify code for vp8 |
| ;save bottom 3 bits so that we round one side +4 and the other +3 |
| qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) |
| qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) |
| |
| mov r10, #0 |
| shadd8 r8 , r8 , r10 ; Filter1 >>= 3 |
| shadd8 r12 , r12 , r10 ; Filter2 >>= 3 |
| shadd8 r8 , r8 , r10 |
| shadd8 r12 , r12 , r10 |
| shadd8 r8 , r8 , r10 ; r8: Filter1 |
| shadd8 r12 , r12 , r10 ; r12: Filter2 |
| |
| ldr r9, [sp] ; load qs0 |
| ldr r11, [sp, #4] ; load ps0 |
| |
| qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) |
| qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) |
| |
| ;save bottom 3 bits so that we round one side +4 and the other +3 |
| ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) |
| ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) |
| ;mov r10, #0 |
| ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 |
| ;usub8 lr, r8, r9 ; s = (s==4)*-1 |
| ;sel lr, r11, r10 |
| ;shadd8 r12 , r12 , r10 |
| ;usub8 r8, r9, r8 |
| ;sel r8, r11, r10 |
| ;ldr r9, [sp] ; load qs0 |
| ;ldr r11, [sp, #4] ; load ps0 |
| ;shadd8 r12 , r12 , r10 |
| ;and r8, r8, lr ; -1 for each element that equals 4 |
| ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) |
| ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) |
| ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) |
| |
| ;end of modification for vp8 |
| |
| bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free) |
| ;mov r12, r7 |
| |
| ;roughly 3/7th difference across boundary |
| mov lr, #0x1b ; 27 |
| mov r7, #0x3f ; 63 |
| |
| sxtb16 r6, r12 |
| sxtb16 r10, r12, ror #8 |
| smlabb r8, r6, lr, r7 |
| smlatb r6, r6, lr, r7 |
| smlabb r7, r10, lr, r7 |
| smultb r10, r10, lr |
| ssat r8, #8, r8, asr #7 |
| ssat r6, #8, r6, asr #7 |
| add r10, r10, #63 |
| ssat r7, #8, r7, asr #7 |
| ssat r10, #8, r10, asr #7 |
| |
| ldr lr, c0x80808080 |
| |
| pkhbt r6, r8, r6, lsl #16 |
| pkhbt r10, r7, r10, lsl #16 |
| uxtb16 r6, r6 |
| uxtb16 r10, r10 |
| |
| sub src, src, pstep |
| |
| orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) |
| |
| qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) |
| qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) |
| eor r8, r8, lr ; *oq0 = s^0x80 |
| str r8, [src] ; store *oq0 |
| sub src, src, pstep |
| eor r10, r10, lr ; *op0 = s^0x80 |
| str r10, [src] ; store *op0 |
| |
| ;roughly 2/7th difference across boundary |
| mov lr, #0x12 ; 18 |
| mov r7, #0x3f ; 63 |
| |
| sxtb16 r6, r12 |
| sxtb16 r10, r12, ror #8 |
| smlabb r8, r6, lr, r7 |
| smlatb r6, r6, lr, r7 |
| smlabb r9, r10, lr, r7 |
| smlatb r10, r10, lr, r7 |
| ssat r8, #8, r8, asr #7 |
| ssat r6, #8, r6, asr #7 |
| ssat r9, #8, r9, asr #7 |
| ssat r10, #8, r10, asr #7 |
| |
| ldr lr, c0x80808080 |
| |
| pkhbt r6, r8, r6, lsl #16 |
| pkhbt r10, r9, r10, lsl #16 |
| |
| ldr r9, [sp, #8] ; load qs1 |
| ldr r11, [sp, #12] ; load ps1 |
| |
| uxtb16 r6, r6 |
| uxtb16 r10, r10 |
| |
| sub src, src, pstep |
| |
| orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) |
| |
| qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) |
| qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) |
| eor r11, r11, lr ; *op1 = s^0x80 |
| str r11, [src], pstep ; store *op1 |
| eor r8, r8, lr ; *oq1 = s^0x80 |
| add src, src, pstep, lsl #1 |
| |
| mov r7, #0x3f ; 63 |
| |
| str r8, [src], pstep ; store *oq1 |
| |
| ;roughly 1/7th difference across boundary |
| mov lr, #0x9 ; 9 |
| ldr r9, [src] ; load q2 |
| |
| sxtb16 r6, r12 |
| sxtb16 r10, r12, ror #8 |
| smlabb r8, r6, lr, r7 |
| smlatb r6, r6, lr, r7 |
| smlabb r12, r10, lr, r7 |
| smlatb r10, r10, lr, r7 |
| ssat r8, #8, r8, asr #7 |
| ssat r6, #8, r6, asr #7 |
| ssat r12, #8, r12, asr #7 |
| ssat r10, #8, r10, asr #7 |
| |
| sub src, src, pstep, lsl #2 |
| |
| pkhbt r6, r8, r6, lsl #16 |
| pkhbt r10, r12, r10, lsl #16 |
| |
| sub src, src, pstep |
| ldr lr, c0x80808080 |
| |
| ldr r11, [src] ; load p2 |
| |
| uxtb16 r6, r6 |
| uxtb16 r10, r10 |
| |
| eor r9, r9, lr |
| eor r11, r11, lr |
| |
| orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) |
| |
| qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) |
| qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) |
| eor r8, r8, lr ; *op2 = s^0x80 |
| str r8, [src], pstep, lsl #2 ; store *op2 |
| add src, src, pstep |
| eor r10, r10, lr ; *oq2 = s^0x80 |
| str r10, [src], pstep, lsl #1 ; store *oq2 |
| |
| |mbhskip_filter| |
| add src, src, #4 |
| sub src, src, pstep, lsl #3 |
| subs count, count, #1 |
| |
| ldrne r9, [src], pstep ; p3 |
| ldrne r10, [src], pstep ; p2 |
| ldrne r11, [src], pstep ; p1 |
| |
| bne MBHnext8 |
| |
| add sp, sp, #16 |
| ldmia sp!, {r4 - r11, pc} |
| ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| |
| |
| |
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| |vp8_loop_filter_vertical_edge_armv6| PROC |
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| stmdb sp!, {r4 - r11, lr} |
| |
| sub src, src, #4 ; move src pointer down by 4 |
| ldr count, [sp, #40] ; count for 8-in-parallel |
| ldr r12, [sp, #36] ; load thresh address |
| sub sp, sp, #16 ; create temp buffer |
| |
| ldr r6, [src], pstep ; load source data |
| ldrb r4, [r2] ; blimit |
| ldr r7, [src], pstep |
| ldrb r2, [r3] ; limit |
| ldr r8, [src], pstep |
| orr r4, r4, r4, lsl #8 |
| ldrb r3, [r12] ; thresh |
| orr r2, r2, r2, lsl #8 |
| ldr lr, [src], pstep |
| mov count, count, lsl #1 ; 4-in-parallel |
| orr r4, r4, r4, lsl #16 |
| orr r3, r3, r3, lsl #8 |
| orr r2, r2, r2, lsl #16 |
| orr r3, r3, r3, lsl #16 |
| |
| |Vnext8| |
| |
| ; vp8_filter_mask() function |
| ; calculate breakout conditions |
| ; transpose the source data for 4-in-parallel operation |
| TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 |
| |
| uqsub8 r7, r9, r10 ; p3 - p2 |
| uqsub8 r8, r10, r9 ; p2 - p3 |
| uqsub8 r9, r10, r11 ; p2 - p1 |
| uqsub8 r10, r11, r10 ; p1 - p2 |
| orr r7, r7, r8 ; abs (p3-p2) |
| orr r10, r9, r10 ; abs (p2-p1) |
| uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask |
| uqsub8 r10, r10, r2 ; compare to limit |
| |
| sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines |
| |
| orr lr, lr, r10 |
| |
| uqsub8 r6, r11, r12 ; p1 - p0 |
| uqsub8 r7, r12, r11 ; p0 - p1 |
| add src, src, #4 ; move src pointer up by 4 |
| orr r6, r6, r7 ; abs (p1-p0) |
| str r11, [sp, #12] ; save p1 |
| uqsub8 r10, r6, r2 ; compare to limit |
| uqsub8 r11, r6, r3 ; compare to thresh |
| orr lr, lr, r10 |
| |
| ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now |
| ; transpose the source data for 4-in-parallel operation |
| ldr r6, [src], pstep ; load source data |
| str r11, [sp] ; push r11 to stack |
| ldr r7, [src], pstep |
| str r12, [sp, #4] ; save current reg before load q0 - q3 data |
| ldr r8, [src], pstep |
| str lr, [sp, #8] |
| ldr lr, [src], pstep |
| |
| TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 |
| |
| ldr lr, [sp, #8] ; load back (f)limit accumulator |
| |
| uqsub8 r6, r12, r11 ; q3 - q2 |
| uqsub8 r7, r11, r12 ; q2 - q3 |
| uqsub8 r12, r11, r10 ; q2 - q1 |
| uqsub8 r11, r10, r11 ; q1 - q2 |
| orr r6, r6, r7 ; abs (q3-q2) |
| orr r7, r12, r11 ; abs (q2-q1) |
| uqsub8 r6, r6, r2 ; compare to limit |
| uqsub8 r7, r7, r2 ; compare to limit |
| ldr r11, [sp, #4] ; load back p0 |
| ldr r12, [sp, #12] ; load back p1 |
| orr lr, lr, r6 |
| orr lr, lr, r7 |
| |
| uqsub8 r6, r11, r9 ; p0 - q0 |
| uqsub8 r7, r9, r11 ; q0 - p0 |
| uqsub8 r8, r12, r10 ; p1 - q1 |
| uqsub8 r11, r10, r12 ; q1 - p1 |
| orr r6, r6, r7 ; abs (p0-q0) |
| ldr r7, c0x7F7F7F7F |
| orr r8, r8, r11 ; abs (p1-q1) |
| uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 |
| and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 |
| uqsub8 r11, r10, r9 ; q1 - q0 |
| uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 |
| uqsub8 r12, r9, r10 ; q0 - q1 |
| uqsub8 r6, r6, r4 ; compare to flimit |
| |
| orr r9, r11, r12 ; abs (q1-q0) |
| uqsub8 r8, r9, r2 ; compare to limit |
| uqsub8 r10, r9, r3 ; compare to thresh |
| orr lr, lr, r6 |
| orr lr, lr, r8 |
| |
| mvn r11, #0 ; r11 == -1 |
| mov r12, #0 |
| |
| usub8 lr, r12, lr |
| ldr r9, [sp] ; load the compared result |
| sel lr, r11, r12 ; filter mask: lr |
| |
| cmp lr, #0 |
| beq vskip_filter ; skip filtering |
| |
| ;vp8_hevmask() function |
| ;calculate high edge variance |
| |
| sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines |
| |
| orr r9, r9, r10 |
| |
| ldrh r7, [src, #-2] |
| ldrh r8, [src], pstep |
| |
| usub8 r9, r12, r9 |
| sel r6, r12, r11 ; hev mask: r6 |
| |
| ;vp8_filter() function |
| ; load soure data to r6, r11, r12, lr |
| ldrh r9, [src, #-2] |
| ldrh r10, [src], pstep |
| |
| pkhbt r12, r7, r8, lsl #16 |
| |
| ldrh r7, [src, #-2] |
| ldrh r8, [src], pstep |
| |
| pkhbt r11, r9, r10, lsl #16 |
| |
| ldrh r9, [src, #-2] |
| ldrh r10, [src], pstep |
| |
| ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first |
| str r6, [sp] |
| str lr, [sp, #4] |
| |
| pkhbt r6, r7, r8, lsl #16 |
| pkhbt lr, r9, r10, lsl #16 |
| |
| ;transpose r12, r11, r6, lr to r7, r8, r9, r10 |
| TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 |
| |
| ;load back hev_mask r6 and filter_mask lr |
| ldr r12, c0x80808080 |
| ldr r6, [sp] |
| ldr lr, [sp, #4] |
| |
| eor r7, r7, r12 ; p1 offset to convert to a signed value |
| eor r8, r8, r12 ; p0 offset to convert to a signed value |
| eor r9, r9, r12 ; q0 offset to convert to a signed value |
| eor r10, r10, r12 ; q1 offset to convert to a signed value |
| |
| str r9, [sp] ; store qs0 temporarily |
| str r8, [sp, #4] ; store ps0 temporarily |
| str r10, [sp, #8] ; store qs1 temporarily |
| str r7, [sp, #12] ; store ps1 temporarily |
| |
| qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) |
| qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) |
| |
| and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter) |
| |
| qadd8 r7, r7, r8 |
| ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 |
| |
| qadd8 r7, r7, r8 |
| ldr r10, c0x04040404 |
| |
| qadd8 r7, r7, r8 |
| ;mvn r11, #0 ; r11 == -1 |
| |
| and r7, r7, lr ; vp8_filter &= mask |
| |
| ;modify code for vp8 -- Filter1 = vp8_filter (r7) |
| qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) |
| qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) |
| |
| mov r9, #0 |
| shadd8 r8 , r8 , r9 ; Filter2 >>= 3 |
| shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 |
| shadd8 r8 , r8 , r9 |
| shadd8 r7 , r7 , r9 |
| shadd8 lr , r8 , r9 ; lr: filter2 |
| shadd8 r7 , r7 , r9 ; r7: filter |
| |
| ;usub8 lr, r8, r10 ; s = (s==4)*-1 |
| ;sel lr, r11, r9 |
| ;usub8 r8, r10, r8 |
| ;sel r8, r11, r9 |
| ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s |
| |
| ;calculate output |
| ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) |
| |
| ldr r8, [sp] ; load qs0 |
| ldr r9, [sp, #4] ; load ps0 |
| |
| ldr r10, c0x01010101 |
| |
| qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) |
| qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) |
| ;end of modification for vp8 |
| |
| eor r8, r8, r12 |
| eor r9, r9, r12 |
| |
| mov lr, #0 |
| |
| sadd8 r7, r7, r10 |
| shadd8 r7, r7, lr |
| |
| ldr r10, [sp, #8] ; load qs1 |
| ldr r11, [sp, #12] ; load ps1 |
| |
| bic r7, r7, r6 ; r7: vp8_filter |
| |
| qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) |
| qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) |
| eor r10, r10, r12 |
| eor r11, r11, r12 |
| |
| sub src, src, pstep, lsl #2 |
| |
| ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 |
| ;output is b0, b1, b2, b3 |
| ;b0: 03 02 01 00 |
| ;b1: 13 12 11 10 |
| ;b2: 23 22 21 20 |
| ;b3: 33 32 31 30 |
| ; p1 p0 q0 q1 |
| ; (a3 a2 a1 a0) |
| TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr |
| |
| strh r6, [src, #-2] ; store the result |
| mov r6, r6, lsr #16 |
| strh r6, [src], pstep |
| |
| strh r7, [src, #-2] |
| mov r7, r7, lsr #16 |
| strh r7, [src], pstep |
| |
| strh r12, [src, #-2] |
| mov r12, r12, lsr #16 |
| strh r12, [src], pstep |
| |
| strh lr, [src, #-2] |
| mov lr, lr, lsr #16 |
| strh lr, [src], pstep |
| |
| |vskip_filter| |
| sub src, src, #4 |
| subs count, count, #1 |
| |
| ldrne r6, [src], pstep ; load source data |
| ldrne r7, [src], pstep |
| ldrne r8, [src], pstep |
| ldrne lr, [src], pstep |
| |
| bne Vnext8 |
| |
| add sp, sp, #16 |
| |
| ldmia sp!, {r4 - r11, pc} |
| ENDP ; |vp8_loop_filter_vertical_edge_armv6| |
| |
| |
| |
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| |vp8_mbloop_filter_vertical_edge_armv6| PROC |
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| stmdb sp!, {r4 - r11, lr} |
| |
| sub src, src, #4 ; move src pointer down by 4 |
| ldr count, [sp, #40] ; count for 8-in-parallel |
| ldr r12, [sp, #36] ; load thresh address |
| pld [src, #23] ; preload for next block |
| sub sp, sp, #16 ; create temp buffer |
| |
| ldr r6, [src], pstep ; load source data |
| ldrb r4, [r2] ; blimit |
| pld [src, #23] |
| ldr r7, [src], pstep |
| ldrb r2, [r3] ; limit |
| pld [src, #23] |
| ldr r8, [src], pstep |
| orr r4, r4, r4, lsl #8 |
| ldrb r3, [r12] ; thresh |
| orr r2, r2, r2, lsl #8 |
| pld [src, #23] |
| ldr lr, [src], pstep |
| mov count, count, lsl #1 ; 4-in-parallel |
| orr r4, r4, r4, lsl #16 |
| orr r3, r3, r3, lsl #8 |
| orr r2, r2, r2, lsl #16 |
| orr r3, r3, r3, lsl #16 |
| |
| |MBVnext8| |
| ; vp8_filter_mask() function |
| ; calculate breakout conditions |
| ; transpose the source data for 4-in-parallel operation |
| TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 |
| |
| uqsub8 r7, r9, r10 ; p3 - p2 |
| uqsub8 r8, r10, r9 ; p2 - p3 |
| uqsub8 r9, r10, r11 ; p2 - p1 |
| uqsub8 r10, r11, r10 ; p1 - p2 |
| orr r7, r7, r8 ; abs (p3-p2) |
| orr r10, r9, r10 ; abs (p2-p1) |
| uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask |
| uqsub8 r10, r10, r2 ; compare to limit |
| |
| sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines |
| |
| orr lr, lr, r10 |
| |
| uqsub8 r6, r11, r12 ; p1 - p0 |
| uqsub8 r7, r12, r11 ; p0 - p1 |
| add src, src, #4 ; move src pointer up by 4 |
| orr r6, r6, r7 ; abs (p1-p0) |
| str r11, [sp, #12] ; save p1 |
| uqsub8 r10, r6, r2 ; compare to limit |
| uqsub8 r11, r6, r3 ; compare to thresh |
| orr lr, lr, r10 |
| |
| ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now |
| ; transpose the source data for 4-in-parallel operation |
| ldr r6, [src], pstep ; load source data |
| str r11, [sp] ; push r11 to stack |
| ldr r7, [src], pstep |
| str r12, [sp, #4] ; save current reg before load q0 - q3 data |
| ldr r8, [src], pstep |
| str lr, [sp, #8] |
| ldr lr, [src], pstep |
| |
| |
| TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 |
| |
| ldr lr, [sp, #8] ; load back (f)limit accumulator |
| |
| uqsub8 r6, r12, r11 ; q3 - q2 |
| uqsub8 r7, r11, r12 ; q2 - q3 |
| uqsub8 r12, r11, r10 ; q2 - q1 |
| uqsub8 r11, r10, r11 ; q1 - q2 |
| orr r6, r6, r7 ; abs (q3-q2) |
| orr r7, r12, r11 ; abs (q2-q1) |
| uqsub8 r6, r6, r2 ; compare to limit |
| uqsub8 r7, r7, r2 ; compare to limit |
| ldr r11, [sp, #4] ; load back p0 |
| ldr r12, [sp, #12] ; load back p1 |
| orr lr, lr, r6 |
| orr lr, lr, r7 |
| |
| uqsub8 r6, r11, r9 ; p0 - q0 |
| uqsub8 r7, r9, r11 ; q0 - p0 |
| uqsub8 r8, r12, r10 ; p1 - q1 |
| uqsub8 r11, r10, r12 ; q1 - p1 |
| orr r6, r6, r7 ; abs (p0-q0) |
| ldr r7, c0x7F7F7F7F |
| orr r8, r8, r11 ; abs (p1-q1) |
| uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 |
| and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 |
| uqsub8 r11, r10, r9 ; q1 - q0 |
| uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 |
| uqsub8 r12, r9, r10 ; q0 - q1 |
| uqsub8 r6, r6, r4 ; compare to flimit |
| |
| orr r9, r11, r12 ; abs (q1-q0) |
| uqsub8 r8, r9, r2 ; compare to limit |
| uqsub8 r10, r9, r3 ; compare to thresh |
| orr lr, lr, r6 |
| orr lr, lr, r8 |
| |
| mvn r11, #0 ; r11 == -1 |
| mov r12, #0 |
| |
| usub8 lr, r12, lr |
| ldr r9, [sp] ; load the compared result |
| sel lr, r11, r12 ; filter mask: lr |
| |
| cmp lr, #0 |
| beq mbvskip_filter ; skip filtering |
| |
| |
| |
| ;vp8_hevmask() function |
| ;calculate high edge variance |
| |
| sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines |
| |
| orr r9, r9, r10 |
| |
| ldrh r7, [src, #-2] |
| ldrh r8, [src], pstep |
| |
| usub8 r9, r12, r9 |
| sel r6, r12, r11 ; hev mask: r6 |
| |
| |
| ; vp8_mbfilter() function |
| ; p2, q2 are only needed at the end. Don't need to load them in now. |
| ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first |
| ; load soure data to r6, r11, r12, lr |
| ldrh r9, [src, #-2] |
| ldrh r10, [src], pstep |
| |
| pkhbt r12, r7, r8, lsl #16 |
| |
| ldrh r7, [src, #-2] |
| ldrh r8, [src], pstep |
| |
| pkhbt r11, r9, r10, lsl #16 |
| |
| ldrh r9, [src, #-2] |
| ldrh r10, [src], pstep |
| |
| str r6, [sp] ; save r6 |
| str lr, [sp, #4] ; save lr |
| |
| pkhbt r6, r7, r8, lsl #16 |
| pkhbt lr, r9, r10, lsl #16 |
| |
| ;transpose r12, r11, r6, lr to p1, p0, q0, q1 |
| TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 |
| |
| ;load back hev_mask r6 and filter_mask lr |
| ldr r12, c0x80808080 |
| ldr r6, [sp] |
| ldr lr, [sp, #4] |
| |
| eor r7, r7, r12 ; ps1 |
| eor r8, r8, r12 ; ps0 |
| eor r9, r9, r12 ; qs0 |
| eor r10, r10, r12 ; qs1 |
| |
| qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) |
| str r7, [sp, #12] ; store ps1 temporarily |
| qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) |
| str r10, [sp, #8] ; store qs1 temporarily |
| qadd8 r7, r7, r12 |
| str r9, [sp] ; store qs0 temporarily |
| qadd8 r7, r7, r12 |
| str r8, [sp, #4] ; store ps0 temporarily |
| qadd8 r7, r7, r12 ; vp8_filter: r7 |
| |
| ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 |
| ldr r9, c0x04040404 |
| ;mvn r11, #0 ; r11 == -1 |
| |
| and r7, r7, lr ; vp8_filter &= mask (lr is free) |
| |
| mov r12, r7 ; Filter2: r12 |
| and r12, r12, r6 ; Filter2 &= hev |
| |
| ;modify code for vp8 |
| ;save bottom 3 bits so that we round one side +4 and the other +3 |
| qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) |
| qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) |
| |
| mov r10, #0 |
| shadd8 r8 , r8 , r10 ; Filter1 >>= 3 |
| shadd8 r12 , r12 , r10 ; Filter2 >>= 3 |
| shadd8 r8 , r8 , r10 |
| shadd8 r12 , r12 , r10 |
| shadd8 r8 , r8 , r10 ; r8: Filter1 |
| shadd8 r12 , r12 , r10 ; r12: Filter2 |
| |
| ldr r9, [sp] ; load qs0 |
| ldr r11, [sp, #4] ; load ps0 |
| |
| qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) |
| qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) |
| |
| ;save bottom 3 bits so that we round one side +4 and the other +3 |
| ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) |
| ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) |
| ;mov r10, #0 |
| ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 |
| ;usub8 lr, r8, r9 ; s = (s==4)*-1 |
| ;sel lr, r11, r10 |
| ;shadd8 r12 , r12 , r10 |
| ;usub8 r8, r9, r8 |
| ;sel r8, r11, r10 |
| ;ldr r9, [sp] ; load qs0 |
| ;ldr r11, [sp, #4] ; load ps0 |
| ;shadd8 r12 , r12 , r10 |
| ;and r8, r8, lr ; -1 for each element that equals 4 |
| ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) |
| ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) |
| ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) |
| |
| ;end of modification for vp8 |
| |
| bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free) |
| ;mov r12, r7 |
| |
| ;roughly 3/7th difference across boundary |
| mov lr, #0x1b ; 27 |
| mov r7, #0x3f ; 63 |
| |
| sxtb16 r6, r12 |
| sxtb16 r10, r12, ror #8 |
| smlabb r8, r6, lr, r7 |
| smlatb r6, r6, lr, r7 |
| smlabb r7, r10, lr, r7 |
| smultb r10, r10, lr |
| ssat r8, #8, r8, asr #7 |
| ssat r6, #8, r6, asr #7 |
| add r10, r10, #63 |
| ssat r7, #8, r7, asr #7 |
| ssat r10, #8, r10, asr #7 |
| |
| ldr lr, c0x80808080 |
| |
| pkhbt r6, r8, r6, lsl #16 |
| pkhbt r10, r7, r10, lsl #16 |
| uxtb16 r6, r6 |
| uxtb16 r10, r10 |
| |
| sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines |
| |
| orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) |
| |
| qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) |
| qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) |
| eor r8, r8, lr ; *oq0 = s^0x80 |
| eor r10, r10, lr ; *op0 = s^0x80 |
| |
| strb r10, [src, #-1] ; store op0 result |
| strb r8, [src], pstep ; store oq0 result |
| mov r10, r10, lsr #8 |
| mov r8, r8, lsr #8 |
| strb r10, [src, #-1] |
| strb r8, [src], pstep |
| mov r10, r10, lsr #8 |
| mov r8, r8, lsr #8 |
| strb r10, [src, #-1] |
| strb r8, [src], pstep |
| mov r10, r10, lsr #8 |
| mov r8, r8, lsr #8 |
| strb r10, [src, #-1] |
| strb r8, [src], pstep |
| |
| ;roughly 2/7th difference across boundary |
| mov lr, #0x12 ; 18 |
| mov r7, #0x3f ; 63 |
| |
| sxtb16 r6, r12 |
| sxtb16 r10, r12, ror #8 |
| smlabb r8, r6, lr, r7 |
| smlatb r6, r6, lr, r7 |
| smlabb r9, r10, lr, r7 |
| |
| smlatb r10, r10, lr, r7 |
| ssat r8, #8, r8, asr #7 |
| ssat r6, #8, r6, asr #7 |
| ssat r9, #8, r9, asr #7 |
| ssat r10, #8, r10, asr #7 |
| |
| sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines |
| |
| pkhbt r6, r8, r6, lsl #16 |
| pkhbt r10, r9, r10, lsl #16 |
| |
| ldr r9, [sp, #8] ; load qs1 |
| ldr r11, [sp, #12] ; load ps1 |
| ldr lr, c0x80808080 |
| |
| uxtb16 r6, r6 |
| uxtb16 r10, r10 |
| |
| add src, src, #2 |
| |
| orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) |
| |
| qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) |
| qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) |
| eor r8, r8, lr ; *oq1 = s^0x80 |
| eor r10, r10, lr ; *op1 = s^0x80 |
| |
| ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary |
| strb r10, [src, #-4] ; store op1 |
| strb r8, [src, #-1] ; store oq1 |
| ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary |
| |
| mov r10, r10, lsr #8 |
| mov r8, r8, lsr #8 |
| |
| ldrb r6, [src, #-5] |
| strb r10, [src, #-4] |
| strb r8, [src, #-1] |
| ldrb r7, [src], pstep |
| |
| mov r10, r10, lsr #8 |
| mov r8, r8, lsr #8 |
| orr r11, r11, r6, lsl #8 |
| orr r9, r9, r7, lsl #8 |
| |
| ldrb r6, [src, #-5] |
| strb r10, [src, #-4] |
| strb r8, [src, #-1] |
| ldrb r7, [src], pstep |
| |
| mov r10, r10, lsr #8 |
| mov r8, r8, lsr #8 |
| orr r11, r11, r6, lsl #16 |
| orr r9, r9, r7, lsl #16 |
| |
| ldrb r6, [src, #-5] |
| strb r10, [src, #-4] |
| strb r8, [src, #-1] |
| ldrb r7, [src], pstep |
| orr r11, r11, r6, lsl #24 |
| orr r9, r9, r7, lsl #24 |
| |
| ;roughly 1/7th difference across boundary |
| eor r9, r9, lr |
| eor r11, r11, lr |
| |
| mov lr, #0x9 ; 9 |
| mov r7, #0x3f ; 63 |
| |
| sxtb16 r6, r12 |
| sxtb16 r10, r12, ror #8 |
| smlabb r8, r6, lr, r7 |
| smlatb r6, r6, lr, r7 |
| smlabb r12, r10, lr, r7 |
| smlatb r10, r10, lr, r7 |
| ssat r8, #8, r8, asr #7 |
| ssat r6, #8, r6, asr #7 |
| ssat r12, #8, r12, asr #7 |
| ssat r10, #8, r10, asr #7 |
| |
| sub src, src, pstep, lsl #2 |
| |
| pkhbt r6, r8, r6, lsl #16 |
| pkhbt r10, r12, r10, lsl #16 |
| |
| uxtb16 r6, r6 |
| uxtb16 r10, r10 |
| |
| ldr lr, c0x80808080 |
| |
| orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) |
| |
| qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) |
| qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) |
| eor r8, r8, lr ; *op2 = s^0x80 |
| eor r10, r10, lr ; *oq2 = s^0x80 |
| |
| strb r8, [src, #-5] ; store *op2 |
| strb r10, [src], pstep ; store *oq2 |
| mov r8, r8, lsr #8 |
| mov r10, r10, lsr #8 |
| strb r8, [src, #-5] |
| strb r10, [src], pstep |
| mov r8, r8, lsr #8 |
| mov r10, r10, lsr #8 |
| strb r8, [src, #-5] |
| strb r10, [src], pstep |
| mov r8, r8, lsr #8 |
| mov r10, r10, lsr #8 |
| strb r8, [src, #-5] |
| strb r10, [src], pstep |
| |
| ;adjust src pointer for next loop |
| sub src, src, #2 |
| |
| |mbvskip_filter| |
| sub src, src, #4 |
| subs count, count, #1 |
| |
| pld [src, #23] ; preload for next block |
| ldrne r6, [src], pstep ; load source data |
| pld [src, #23] |
| ldrne r7, [src], pstep |
| pld [src, #23] |
| ldrne r8, [src], pstep |
| pld [src, #23] |
| ldrne lr, [src], pstep |
| |
| bne MBVnext8 |
| |
| add sp, sp, #16 |
| |
| ldmia sp!, {r4 - r11, pc} |
| ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| |
| |
| ; Constant Pool |
| c0x80808080 DCD 0x80808080 |
| c0x03030303 DCD 0x03030303 |
| c0x04040404 DCD 0x04040404 |
| c0x01010101 DCD 0x01010101 |
| c0x7F7F7F7F DCD 0x7F7F7F7F |
| |
| END |