| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| EXPORT |vp8_short_idct4x4llm_v6_dual| |
| |
| AREA |.text|, CODE, READONLY |
| |
| |
| ; void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch, |
| ; unsigned char *dst, int stride) |
| ; r0 short* input - 16 coefficients, element index 4*row+col, overwritten with the pass-1 results |
| ; r1 unsigned char* pred - prediction block, 4 bytes per row, rows pitch bytes apart |
| ; r2 int pitch - byte distance between pred rows |
| ; r3 unsigned char* dst - receives clip8(pred + ((idct + 4) >> 3)), 4 bytes per row |
| ; sp int stride - byte distance between dst rows, fifth argument, read from the stack |
| |
| |vp8_short_idct4x4llm_v6_dual| PROC |
| stmdb sp!, {r4-r11, lr} |
| |
| sub sp, sp, #4 |
| |
| mov r4, #0x00008A00 ; sin: r4 is assembled in two steps to 0x8A8C (35468) |
| orr r4, r4, #0x0000008C ; sinpi8sqrt2, only ever multiplied with smulw* (0x8A8C would read as negative in a signed halfword) |
| |
| mov r5, #0x00004E00 ; cos: low half of r5 is assembled to 0x4E7B (20091) |
| orr r5, r5, #0x0000007B ; cospi8sqrt2minus1, multiplied with smulb* which reads only the low halfword of r5 |
| orr r5, r5, #1<<31 ; loop counter on top bit: each of the two loops below runs exactly twice |
| |
| loop1_dual |
| ldr r6, [r0, #(4*2)] ; pass 1 (vertical), two columns per iteration: i5 | i4 |
| ldr r12, [r0, #(12*2)] ; i13|i12 |
| ldr r14, [r0, #(8*2)] ; i9 | i8 |
| |
| smulbt r9, r5, r6 ; ip[5] * cospi8sqrt2minus1 (the >> 16 is applied by the pkhtb below) |
| smulbb r7, r5, r6 ; ip[4] * cospi8sqrt2minus1 (the >> 16 is applied by the pkhtb below) |
| smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 |
| smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 |
| |
| smulbt r11, r5, r12 ; ip[13] * cospi8sqrt2minus1 (the >> 16 is applied when packed below) |
| pkhtb r7, r9, r7, asr #16 ; 5c | 4c (upper 16 bits of each 32-bit product) |
| pkhbt r8, r8, r10, lsl #16 ; 5s | 4s |
| uadd16 r6, r6, r7 ; 5c+5 | 4c+4 |
| |
| smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 |
| smulbb r9, r5, r12 ; ip[12] * cospi8sqrt2minus1 (the >> 16 is applied when packed below) |
| smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 |
| |
| subs r5, r5, #1<<31 ; i--: 1st time clears bit 31 with C=1, 2nd time borrows (C=0) and sets bit 31 again. C is not written again before the bcs |
| |
| pkhtb r9, r11, r9, asr #16 ; 13c | 12c |
| ldr r11, [r0] ; i1 | i0 |
| pkhbt r10, r10, r7, lsl #16 ; 13s | 12s |
| uadd16 r7, r12, r9 ; 13c+13 | 12c+12 |
| |
| usub16 r7, r8, r7 ; c = (5s | 4s) - (13c+13 | 12c+12) |
| uadd16 r6, r6, r10 ; d = (5c+5 | 4c+4) + (13s | 12s) |
| uadd16 r10, r11, r14 ; a = (i1 | i0) + (i9 | i8) |
| usub16 r8, r11, r14 ; b = (i1 | i0) - (i9 | i8) |
| |
| uadd16 r9, r10, r6 ; a+d |
| usub16 r10, r10, r6 ; a-d |
| uadd16 r6, r8, r7 ; b+c |
| usub16 r7, r8, r7 ; b-c |
| |
| ; use input buffer to store intermediate results (the caller's coefficients are overwritten in place) |
| str r6, [r0, #(4*2)] ; o5 | o4 |
| str r7, [r0, #(8*2)] ; o9 | o8 |
| str r10,[r0, #(12*2)] ; o13|o12 |
| str r9, [r0], #4 ; o1 | o0, then advance r0 by 4 bytes to the next pair of columns |
| |
| bcs loop1_dual |
| |
| sub r0, r0, #8 ; reset input/output (undo the two 4-byte post-increments), the pointer is then saved at [sp] |
| str r0, [sp] |
| |
| loop2_dual |
| |
| ldr r6, [r0, #(4*2)] ; pass 2 (horizontal), two rows per iteration: i5 | i4 |
| ldr r12,[r0, #(2*2)] ; i3 | i2 |
| ldr r14,[r0, #(6*2)] ; i7 | i6 |
| ldr r0, [r0, #(0*2)] ; i1 | i0 (r0 now holds data, the pointer is reloaded from [sp] further down) |
| |
| smulbt r9, r5, r6 ; ip[5] * cospi8sqrt2minus1 (the >> 16 is applied by the pkhtb below) |
| smulbt r7, r5, r0 ; ip[1] * cospi8sqrt2minus1 (the >> 16 is applied by the pkhtb below) |
| smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 |
| smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16 |
| |
| pkhbt r11, r6, r0, lsl #16 ; i0 | i4 (first row in the upper halfword, second row in the lower) |
| pkhtb r7, r7, r9, asr #16 ; 1c | 5c |
| pkhtb r0, r0, r6, asr #16 ; i1 | i5 |
| pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 |
| |
| uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2 |
| pkhbt r9, r14, r12, lsl #16 ; i2 | i6 |
| uadd16 r10, r11, r9 ; a = (i0 | i4) + (i2 | i6) |
| usub16 r9, r11, r9 ; b = (i0 | i4) - (i2 | i6) |
| pkhtb r6, r12, r14, asr #16 ; i3 | i7 |
| |
| subs r5, r5, #1<<31 ; i--: sets C for the bcs at the end of the loop, no instruction in between writes C |
| |
| smulbt r7, r5, r6 ; ip[3] * cospi8sqrt2minus1 (the >> 16 is applied by the pkhtb below) |
| smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 |
| smulbb r12, r5, r6 ; ip[7] * cospi8sqrt2minus1 (the >> 16 is applied by the pkhtb below) |
| smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 |
| |
| pkhtb r7, r7, r12, asr #16 ; 3c | 7c |
| pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 |
| |
| uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 |
| usub16 r12, r8, r6 ; c = (1s | 5s) - (3c+3 | 7c+7) |
| uadd16 r6, r11, r0 ; d = (3s | 7s) + (1c+1 | 5c+5) |
| uadd16 r7, r10, r6 ; a+d |
| |
| mov r8, #4 ; set up 4's (rounding term for the final >> 3) |
| orr r8, r8, #0x40000 ; 4|4 |
| |
| usub16 r6, r10, r6 ; a-d |
| uadd16 r6, r6, r8 ; a-d+4, 3|7 |
| uadd16 r7, r7, r8 ; a+d+4, 0|4 |
| uadd16 r10, r9, r12 ; b+c |
| usub16 r0, r9, r12 ; b-c |
| uadd16 r10, r10, r8 ; b+c+4, 1|5 |
| uadd16 r8, r0, r8 ; b-c+4, 2|6 |
| |
| ldr lr, [sp, #40] ; dst stride: the stack argument sits 40 bytes up (36 bytes pushed by stmdb + 4 bytes of scratch) |
| |
| ldrb r0, [r1] ; pred p0 |
| ldrb r11, [r1, #1] ; pred p1 |
| ldrb r12, [r1, #2] ; pred p2 |
| |
| add r0, r0, r7, asr #19 ; p0 + o0 (asr #19 = signed upper halfword >> 3) |
| add r11, r11, r10, asr #19 ; p1 + o1 |
| add r12, r12, r8, asr #19 ; p2 + o2 |
| |
| usat r0, #8, r0 ; d0 = clip8(p0 + o0) |
| usat r11, #8, r11 ; d1 = clip8(p1 + o1) |
| usat r12, #8, r12 ; d2 = clip8(p2 + o2) |
| |
| add r0, r0, r11, lsl #8 ; |--|--|d1|d0| |
| |
| ldrb r11, [r1, #3] ; pred p3 |
| |
| add r0, r0, r12, lsl #16 ; |--|d2|d1|d0| |
| |
| add r11, r11, r6, asr #19 ; p3 + o3 |
| |
| sxth r7, r7 ; sign-extend the lower halfword: a+d+4 for output 4 |
| sxth r10, r10 ; sign-extend the lower halfword: b+c+4 for output 5 |
| |
| usat r11, #8, r11 ; d3 = clip8(p3 + o3) |
| |
| sxth r8, r8 ; sign-extend the lower halfword: b-c+4 for output 6 |
| sxth r6, r6 ; sign-extend the lower halfword: a-d+4 for output 7 |
| |
| add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|, written as one word to dst, which then advances by the stride |
| |
| ldrb r12, [r1, r2]! ; pred p4 (writeback: r1 += pitch, so r1 now addresses the second row) |
| str r0, [r3], lr |
| ldrb r11, [r1, #1] ; pred p5 |
| |
| add r12, r12, r7, asr #3 ; p4 + o4 |
| add r11, r11, r10, asr #3 ; p5 + o5 |
| |
| usat r12, #8, r12 ; d4 = clip8(p4 + o4) |
| usat r11, #8, r11 ; d5 = clip8(p5 + o5) |
| |
| ldrb r7, [r1, #2] ; pred p6 |
| ldrb r10, [r1, #3] ; pred p7 |
| |
| add r12, r12, r11, lsl #8 ; |--|--|d5|d4| |
| |
| add r7, r7, r8, asr #3 ; p6 + o6 |
| add r10, r10, r6, asr #3 ; p7 + o7 |
| |
| ldr r0, [sp] ; reload the input pointer saved before loop2_dual, it is advanced by 16 bytes (two rows) below |
| |
| usat r7, #8, r7 ; d6 = clip8(p6 + o6) |
| usat r10, #8, r10 ; d7 = clip8(p7 + o7) |
| |
| add r12, r12, r7, lsl #16 ; |--|d6|d5|d4| |
| add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|, written as one word to dst, which then advances by the stride |
| |
| str r12, [r3], lr |
| add r0, r0, #16 |
| add r1, r1, r2 ; pred + pitch (r1 now addresses the next pair of rows) |
| |
| bcs loop2_dual |
| |
| add sp, sp, #4 ; release the 4-byte scratch slot that held the input pointer |
| ldmia sp!, {r4 - r11, pc} |
| |
| ENDP |
| |
| END |