| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| EXPORT |vp8_filter_block2d_bil_first_pass_armv6| |
| EXPORT |vp8_filter_block2d_bil_second_pass_armv6| |
| |
| AREA |.text|, CODE, READONLY ; name this block of code |
| |
| ;------------------------------------- |
| ; r0 unsigned char *src_ptr, |
| ; r1 unsigned short *dst_ptr, |
| ; r2 unsigned int src_pitch, |
| ; r3 unsigned int height, |
| ; stack unsigned int width, |
| ; stack const short *vp8_filter |
| ;------------------------------------- |
| ; The output is transposed stroed in output array to make it easy for second pass filtering. |
| |vp8_filter_block2d_bil_first_pass_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| |
| ldr r11, [sp, #40] ; vp8_filter address |
| ldr r4, [sp, #36] ; width |
| |
| mov r12, r3 ; outer-loop counter |
| |
| add r7, r2, r4 ; preload next row |
| pld [r0, r7] |
| |
| sub r2, r2, r4 ; src increment for height loop |
| |
| ldr r5, [r11] ; load up filter coefficients |
| |
| mov r3, r3, lsl #1 ; height*2 |
| add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) |
| |
| mov r11, r1 ; save dst_ptr for each row |
| |
| cmp r5, #128 ; if filter coef = 128, then skip the filter |
| beq bil_null_1st_filter |
| |
| |bil_height_loop_1st_v6| |
| ldrb r6, [r0] ; load source data |
| ldrb r7, [r0, #1] |
| ldrb r8, [r0, #2] |
| mov lr, r4, lsr #2 ; 4-in-parellel loop counter |
| |
| |bil_width_loop_1st_v6| |
| ldrb r9, [r0, #3] |
| ldrb r10, [r0, #4] |
| |
| pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] |
| pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] |
| |
| smuad r6, r6, r5 ; apply the filter |
| pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] |
| smuad r7, r7, r5 |
| pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] |
| |
| smuad r8, r8, r5 |
| smuad r9, r9, r5 |
| |
| add r0, r0, #4 |
| subs lr, lr, #1 |
| |
| add r6, r6, #0x40 ; round_shift_and_clamp |
| add r7, r7, #0x40 |
| usat r6, #16, r6, asr #7 |
| usat r7, #16, r7, asr #7 |
| |
| strh r6, [r1], r3 ; result is transposed and stored |
| |
| add r8, r8, #0x40 ; round_shift_and_clamp |
| strh r7, [r1], r3 |
| add r9, r9, #0x40 |
| usat r8, #16, r8, asr #7 |
| usat r9, #16, r9, asr #7 |
| |
| strh r8, [r1], r3 ; result is transposed and stored |
| |
| ldrneb r6, [r0] ; load source data |
| strh r9, [r1], r3 |
| |
| ldrneb r7, [r0, #1] |
| ldrneb r8, [r0, #2] |
| |
| bne bil_width_loop_1st_v6 |
| |
| add r0, r0, r2 ; move to next input row |
| subs r12, r12, #1 |
| |
| add r9, r2, r4, lsl #1 ; adding back block width |
| pld [r0, r9] ; preload next row |
| |
| add r11, r11, #2 ; move over to next column |
| mov r1, r11 |
| |
| bne bil_height_loop_1st_v6 |
| |
| ldmia sp!, {r4 - r11, pc} |
| |
| |bil_null_1st_filter| |
| |bil_height_loop_null_1st| |
| mov lr, r4, lsr #2 ; loop counter |
| |
| |bil_width_loop_null_1st| |
| ldrb r6, [r0] ; load data |
| ldrb r7, [r0, #1] |
| ldrb r8, [r0, #2] |
| ldrb r9, [r0, #3] |
| |
| strh r6, [r1], r3 ; store it to immediate buffer |
| add r0, r0, #4 |
| strh r7, [r1], r3 |
| subs lr, lr, #1 |
| strh r8, [r1], r3 |
| strh r9, [r1], r3 |
| |
| bne bil_width_loop_null_1st |
| |
| subs r12, r12, #1 |
| add r0, r0, r2 ; move to next input line |
| add r11, r11, #2 ; move over to next column |
| mov r1, r11 |
| |
| bne bil_height_loop_null_1st |
| |
| ldmia sp!, {r4 - r11, pc} |
| |
| ENDP ; |vp8_filter_block2d_bil_first_pass_armv6| |
| |
| |
| ;--------------------------------- |
| ; r0 unsigned short *src_ptr, |
| ; r1 unsigned char *dst_ptr, |
| ; r2 int dst_pitch, |
| ; r3 unsigned int height, |
| ; stack unsigned int width, |
| ; stack const short *vp8_filter |
| ;--------------------------------- |
| |vp8_filter_block2d_bil_second_pass_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| |
| ldr r11, [sp, #40] ; vp8_filter address |
| ldr r4, [sp, #36] ; width |
| |
| ldr r5, [r11] ; load up filter coefficients |
| mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix |
| mov r11, r1 |
| |
| cmp r5, #128 ; if filter coef = 128, then skip the filter |
| beq bil_null_2nd_filter |
| |
| |bil_height_loop_2nd| |
| ldr r6, [r0] ; load the data |
| ldr r8, [r0, #4] |
| ldrh r10, [r0, #8] |
| mov lr, r3, lsr #2 ; loop counter |
| |
| |bil_width_loop_2nd| |
| pkhtb r7, r6, r8 ; src[1] | src[2] |
| pkhtb r9, r8, r10 ; src[3] | src[4] |
| |
| smuad r6, r6, r5 ; apply filter |
| smuad r8, r8, r5 ; apply filter |
| |
| subs lr, lr, #1 |
| |
| smuadx r7, r7, r5 ; apply filter |
| smuadx r9, r9, r5 ; apply filter |
| |
| add r0, r0, #8 |
| |
| add r6, r6, #0x40 ; round_shift_and_clamp |
| add r7, r7, #0x40 |
| usat r6, #8, r6, asr #7 |
| usat r7, #8, r7, asr #7 |
| strb r6, [r1], r2 ; the result is transposed back and stored |
| |
| add r8, r8, #0x40 ; round_shift_and_clamp |
| strb r7, [r1], r2 |
| add r9, r9, #0x40 |
| usat r8, #8, r8, asr #7 |
| usat r9, #8, r9, asr #7 |
| strb r8, [r1], r2 ; the result is transposed back and stored |
| |
| ldrne r6, [r0] ; load data |
| strb r9, [r1], r2 |
| ldrne r8, [r0, #4] |
| ldrneh r10, [r0, #8] |
| |
| bne bil_width_loop_2nd |
| |
| subs r12, r12, #1 |
| add r0, r0, #4 ; update src for next row |
| add r11, r11, #1 |
| mov r1, r11 |
| |
| bne bil_height_loop_2nd |
| ldmia sp!, {r4 - r11, pc} |
| |
| |bil_null_2nd_filter| |
| |bil_height_loop_null_2nd| |
| mov lr, r3, lsr #2 |
| |
| |bil_width_loop_null_2nd| |
| ldr r6, [r0], #4 ; load data |
| subs lr, lr, #1 |
| ldr r8, [r0], #4 |
| |
| strb r6, [r1], r2 ; store data |
| mov r7, r6, lsr #16 |
| strb r7, [r1], r2 |
| mov r9, r8, lsr #16 |
| strb r8, [r1], r2 |
| strb r9, [r1], r2 |
| |
| bne bil_width_loop_null_2nd |
| |
| subs r12, r12, #1 |
| add r0, r0, #4 |
| add r11, r11, #1 |
| mov r1, r11 |
| |
| bne bil_height_loop_null_2nd |
| |
| ldmia sp!, {r4 - r11, pc} |
| ENDP ; |vp8_filter_block2d_second_pass_armv6| |
| |
| END |