| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| EXPORT |vp8_filter_block2d_first_pass_armv6| |
| EXPORT |vp8_filter_block2d_first_pass_16x16_armv6| |
| EXPORT |vp8_filter_block2d_first_pass_8x8_armv6| |
| EXPORT |vp8_filter_block2d_second_pass_armv6| |
| EXPORT |vp8_filter4_block2d_second_pass_armv6| |
| EXPORT |vp8_filter_block2d_first_pass_only_armv6| |
| EXPORT |vp8_filter_block2d_second_pass_only_armv6| |
| |
| AREA |.text|, CODE, READONLY ; name this block of code |
| ;------------------------------------- |
| ; r0 unsigned char *src_ptr |
| ; r1 short *output_ptr |
| ; r2 unsigned int src_pixels_per_line |
| ; r3 unsigned int output_width |
| ; stack unsigned int output_height |
| ; stack const short *vp8_filter |
| ;------------------------------------- |
| ; Filter the input and put the result in the output array. Applies the 6 tap FIR filter to |
| ; consecutive source bytes; the input is a 1 byte value and the output a 2 byte value (already rounded and clamped to 0..255). |
| |vp8_filter_block2d_first_pass_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| |
| ldr r11, [sp, #40] ; vp8_filter address (second stack argument, above the 36 bytes just pushed) |
| ldr r7, [sp, #36] ; output height (first stack argument) |
| |
| sub r2, r2, r3 ; the width loop advances the input pointer by the width, |
| ; so the height loop only needs to add |
| ; (src_pixels_per_line - width) to reach the next input line |
| |
| mov r3, r3, lsl #1 ; multiply width by 2 because the outputs are shorts |
| add r12, r3, #16 ; square off the output: r12 = 2*width + 16 is the byte step applied to r1 after every store |
| sub sp, sp, #4 |
| |
| ldr r4, [r11] ; load up packed filter coefficients (three words, each used as a pair of 16-bit taps) |
| ldr r5, [r11, #4] |
| ldr r6, [r11, #8] |
| |
| str r1, [sp] ; keep the destination start address for this line on the stack |
| mov r7, r7, lsl #16 ; height is the top 16 bits of the loop counter |
| |
| ; six tap filter |
| |height_loop_1st_6| |
| ldrb r8, [r0, #-2] ; load the first three source bytes of this line |
| ldrb r9, [r0, #-1] |
| ldrb r10, [r0], #2 |
| orr r7, r7, r3, lsr #2 ; construct loop counter: low bits = width/2, one pass per two outputs |
| |
| |width_loop_1st_6| |
| ldrb r11, [r0, #-1] |
| |
| pkhbt lr, r8, r9, lsl #16 ; r9 | r8 |
| pkhbt r8, r9, r10, lsl #16 ; r10 | r9 |
| |
| ldrb r9, [r0] |
| |
| smuad lr, lr, r4 ; apply the filter: lr accumulates the first output, r8 the second |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| smuad r8, r8, r4 |
| pkhbt r11, r11, r9, lsl #16 ; r9 | r11 |
| |
| smlad lr, r10, r5, lr |
| ldrb r10, [r0, #1] |
| smlad r8, r11, r5, r8 |
| ldrb r11, [r0, #2] |
| |
| sub r7, r7, #1 |
| |
| pkhbt r9, r9, r10, lsl #16 ; r10 | r9 |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| |
| smlad lr, r9, r6, lr |
| smlad r11, r10, r6, r8 |
| |
| ands r10, r7, #0xff ; test loop counter; the flags set here drive the ldrne loads and the bne below |
| |
| add lr, lr, #0x40 ; round_shift_and_clamp: add 0x40, shift right by 7, saturate to 8 bits unsigned |
| ldrneb r8, [r0, #-2] ; load data for next loop (skipped after the last pass) |
| usat lr, #8, lr, asr #7 |
| add r11, r11, #0x40 |
| ldrneb r9, [r0, #-1] |
| usat r11, #8, r11, asr #7 |
| |
| strh lr, [r1], r12 ; result is transposed and stored (r1 steps by r12 after each store), which |
| ; will make second pass filtering easier. |
| ldrneb r10, [r0], #2 |
| strh r11, [r1], r12 |
| |
| bne width_loop_1st_6 |
| |
| ldr r1, [sp] ; load and update dst address |
| subs r7, r7, #0x10000 |
| add r0, r0, r2 ; move to next input line |
| |
| add r1, r1, #2 ; move over to next column (one short further along) |
| str r1, [sp] |
| |
| bne height_loop_1st_6 |
| |
| add sp, sp, #4 |
| ldmia sp!, {r4 - r11, pc} |
| |
| ENDP |
| |
| ; -------------------------- |
| ; 16x16 version: same register/stack arguments and filter loop as vp8_filter_block2d_first_pass_armv6, plus pld prefetch hints for upcoming source lines |
| ; ----------------------------- |
| |vp8_filter_block2d_first_pass_16x16_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| |
| ldr r11, [sp, #40] ; vp8_filter address (second stack argument, above the 36 bytes just pushed) |
| ldr r7, [sp, #36] ; output height (first stack argument) |
| |
| add r4, r2, #18 ; preload next line: prefetch hint at src_ptr + src_pixels_per_line + 18 |
| pld [r0, r4] |
| |
| sub r2, r2, r3 ; the width loop advances the input pointer by the width, |
| ; so the height loop only needs to add |
| ; (src_pixels_per_line - width) to reach the next input line |
| |
| mov r3, r3, lsl #1 ; multiply width by 2 because the outputs are shorts |
| add r12, r3, #16 ; square off the output: r12 = 2*width + 16 is the byte step applied to r1 after every store |
| sub sp, sp, #4 |
| |
| ldr r4, [r11] ; load up packed filter coefficients (three words, each used as a pair of 16-bit taps) |
| ldr r5, [r11, #4] |
| ldr r6, [r11, #8] |
| |
| str r1, [sp] ; keep the destination start address for this line on the stack |
| mov r7, r7, lsl #16 ; height is the top 16 bits of the loop counter |
| |
| ; six tap filter |
| |height_loop_1st_16_6| |
| ldrb r8, [r0, #-2] ; load the first three source bytes of this line |
| ldrb r9, [r0, #-1] |
| ldrb r10, [r0], #2 |
| orr r7, r7, r3, lsr #2 ; construct loop counter: low bits = width/2, one pass per two outputs |
| |
| |width_loop_1st_16_6| |
| ldrb r11, [r0, #-1] |
| |
| pkhbt lr, r8, r9, lsl #16 ; r9 | r8 |
| pkhbt r8, r9, r10, lsl #16 ; r10 | r9 |
| |
| ldrb r9, [r0] |
| |
| smuad lr, lr, r4 ; apply the filter: lr accumulates the first output, r8 the second |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| smuad r8, r8, r4 |
| pkhbt r11, r11, r9, lsl #16 ; r9 | r11 |
| |
| smlad lr, r10, r5, lr |
| ldrb r10, [r0, #1] |
| smlad r8, r11, r5, r8 |
| ldrb r11, [r0, #2] |
| |
| sub r7, r7, #1 |
| |
| pkhbt r9, r9, r10, lsl #16 ; r10 | r9 |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| |
| smlad lr, r9, r6, lr |
| smlad r11, r10, r6, r8 |
| |
| ands r10, r7, #0xff ; test loop counter; the flags set here drive the ldrne loads and the bne below |
| |
| add lr, lr, #0x40 ; round_shift_and_clamp: add 0x40, shift right by 7, saturate to 8 bits unsigned |
| ldrneb r8, [r0, #-2] ; load data for next loop (skipped after the last pass) |
| usat lr, #8, lr, asr #7 |
| add r11, r11, #0x40 |
| ldrneb r9, [r0, #-1] |
| usat r11, #8, r11, asr #7 |
| |
| strh lr, [r1], r12 ; result is transposed and stored (r1 steps by r12 after each store), which |
| ; will make second pass filtering easier. |
| ldrneb r10, [r0], #2 |
| strh r11, [r1], r12 |
| |
| bne width_loop_1st_16_6 |
| |
| ldr r1, [sp] ; load and update dst address |
| subs r7, r7, #0x10000 |
| add r0, r0, r2 ; move to next input line |
| |
| add r11, r2, #34 ; r2 now holds src_pixels_per_line - width, so add back the block width (=16): 34 = 18 + 16 (assumes output_width is 16) |
| pld [r0, r11] ; preload next line (prefetch hint only) |
| |
| add r1, r1, #2 ; move over to next column (one short further along) |
| str r1, [sp] |
| |
| bne height_loop_1st_16_6 |
| |
| add sp, sp, #4 |
| ldmia sp!, {r4 - r11, pc} |
| |
| ENDP |
| |
| ; -------------------------- |
| ; 8x8 version: same register/stack arguments and filter loop as vp8_filter_block2d_first_pass_armv6, plus pld prefetch hints for upcoming source lines |
| ; ----------------------------- |
| |vp8_filter_block2d_first_pass_8x8_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| |
| ldr r11, [sp, #40] ; vp8_filter address (second stack argument, above the 36 bytes just pushed) |
| ldr r7, [sp, #36] ; output height (first stack argument) |
| |
| add r4, r2, #10 ; preload next line: prefetch hint at src_ptr + src_pixels_per_line + 10 |
| pld [r0, r4] |
| |
| sub r2, r2, r3 ; the width loop advances the input pointer by the width, |
| ; so the height loop only needs to add |
| ; (src_pixels_per_line - width) to reach the next input line |
| |
| mov r3, r3, lsl #1 ; multiply width by 2 because the outputs are shorts |
| add r12, r3, #16 ; square off the output: r12 = 2*width + 16 is the byte step applied to r1 after every store |
| sub sp, sp, #4 |
| |
| ldr r4, [r11] ; load up packed filter coefficients (three words, each used as a pair of 16-bit taps) |
| ldr r5, [r11, #4] |
| ldr r6, [r11, #8] |
| |
| str r1, [sp] ; keep the destination start address for this line on the stack |
| mov r7, r7, lsl #16 ; height is the top 16 bits of the loop counter |
| |
| ; six tap filter |
| |height_loop_1st_8_6| |
| ldrb r8, [r0, #-2] ; load the first three source bytes of this line |
| ldrb r9, [r0, #-1] |
| ldrb r10, [r0], #2 |
| orr r7, r7, r3, lsr #2 ; construct loop counter: low bits = width/2, one pass per two outputs |
| |
| |width_loop_1st_8_6| |
| ldrb r11, [r0, #-1] |
| |
| pkhbt lr, r8, r9, lsl #16 ; r9 | r8 |
| pkhbt r8, r9, r10, lsl #16 ; r10 | r9 |
| |
| ldrb r9, [r0] |
| |
| smuad lr, lr, r4 ; apply the filter: lr accumulates the first output, r8 the second |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| smuad r8, r8, r4 |
| pkhbt r11, r11, r9, lsl #16 ; r9 | r11 |
| |
| smlad lr, r10, r5, lr |
| ldrb r10, [r0, #1] |
| smlad r8, r11, r5, r8 |
| ldrb r11, [r0, #2] |
| |
| sub r7, r7, #1 |
| |
| pkhbt r9, r9, r10, lsl #16 ; r10 | r9 |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| |
| smlad lr, r9, r6, lr |
| smlad r11, r10, r6, r8 |
| |
| ands r10, r7, #0xff ; test loop counter; the flags set here drive the ldrne loads and the bne below |
| |
| add lr, lr, #0x40 ; round_shift_and_clamp: add 0x40, shift right by 7, saturate to 8 bits unsigned |
| ldrneb r8, [r0, #-2] ; load data for next loop (skipped after the last pass) |
| usat lr, #8, lr, asr #7 |
| add r11, r11, #0x40 |
| ldrneb r9, [r0, #-1] |
| usat r11, #8, r11, asr #7 |
| |
| strh lr, [r1], r12 ; result is transposed and stored (r1 steps by r12 after each store), which |
| ; will make second pass filtering easier. |
| ldrneb r10, [r0], #2 |
| strh r11, [r1], r12 |
| |
| bne width_loop_1st_8_6 |
| |
| ldr r1, [sp] ; load and update dst address |
| subs r7, r7, #0x10000 |
| add r0, r0, r2 ; move to next input line |
| |
| add r11, r2, #18 ; r2 now holds src_pixels_per_line - width, so add back the block width (=8): 18 = 10 + 8 (assumes output_width is 8) |
| pld [r0, r11] ; preload next line (prefetch hint only) |
| |
| add r1, r1, #2 ; move over to next column (one short further along) |
| str r1, [sp] |
| |
| bne height_loop_1st_8_6 |
| |
| add sp, sp, #4 |
| ldmia sp!, {r4 - r11, pc} |
| |
| ENDP |
| |
| ;--------------------------------- |
| ; r0 short *src_ptr, |
| ; r1 unsigned char *output_ptr, |
| ; r2 unsigned int output_pitch, |
| ; r3 unsigned int cnt, |
| ; stack const short *vp8_filter |
| ;--- 6 tap second pass: reads 16-bit values from src_ptr and stores rounded, clamped bytes output_pitch apart --- |
| |vp8_filter_block2d_second_pass_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| |
| ldr r11, [sp, #36] ; vp8_filter address (first stack argument, above the 36 bytes just pushed) |
| sub sp, sp, #4 |
| mov r7, r3, lsl #16 ; height is the top 16 bits of the loop counter |
| str r1, [sp] ; keep the destination start address for this outer pass on the stack |
| |
| ldr r4, [r11] ; load up packed filter coefficients (three words, each used as a pair of 16-bit taps) |
| ldr r5, [r11, #4] |
| ldr r6, [r11, #8] |
| |
| pkhbt r12, r5, r4 ; pack the filter differently: r12 = low half of r5 | high half of r4 (r11 likewise from r6/r5), used with smladx for the second output |
| pkhbt r11, r6, r5 |
| |
| sub r0, r0, #4 ; offset input buffer back by two shorts |
| |
| |height_loop_2nd| |
| ldr r8, [r0] ; load the data (two shorts per word) |
| ldr r9, [r0, #4] |
| orr r7, r7, r3, lsr #1 ; loop counter: low bits = cnt/2, one pass per two outputs |
| |
| |width_loop_2nd| |
| smuad lr, r4, r8 ; apply filter: lr accumulates the first output, r8 then r10 the second (input shifted by one short) |
| sub r7, r7, #1 |
| smulbt r8, r4, r8 |
| |
| ldr r10, [r0, #8] |
| |
| smlad lr, r5, r9, lr |
| smladx r8, r12, r9, r8 |
| |
| ldrh r9, [r0, #12] |
| |
| smlad lr, r6, r10, lr |
| smladx r8, r11, r10, r8 |
| |
| add r0, r0, #4 |
| smlatb r10, r6, r9, r8 |
| |
| add lr, lr, #0x40 ; round_shift_and_clamp: add 0x40, shift right by 7, saturate to 8 bits unsigned |
| ands r8, r7, #0xff |
| usat lr, #8, lr, asr #7 |
| add r10, r10, #0x40 |
| strb lr, [r1], r2 ; the result is transposed back and stored (r1 steps by output_pitch) |
| usat r10, #8, r10, asr #7 |
| |
| ldrne r8, [r0] ; load data for next loop (skipped after the last pass) |
| ldrne r9, [r0, #4] |
| strb r10, [r1], r2 |
| |
| bne width_loop_2nd |
| |
| ldr r1, [sp] ; update dst for next loop |
| subs r7, r7, #0x10000 |
| add r0, r0, #16 ; update src for next loop (skip 16 bytes) |
| add r1, r1, #1 |
| str r1, [sp] |
| |
| bne height_loop_2nd |
| |
| add sp, sp, #4 |
| ldmia sp!, {r4 - r11, pc} |
| |
| ENDP |
| |
| ;--------------------------------- |
| ; r0 short *src_ptr, |
| ; r1 unsigned char *output_ptr, |
| ; r2 unsigned int output_pitch, |
| ; r3 unsigned int cnt, |
| ; stack const short *vp8_filter |
| ;--- 4 tap second pass: only the re-paired coefficient registers r12 and r11 are used as multiplicands; output bytes are stored output_pitch apart --- |
| |vp8_filter4_block2d_second_pass_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| |
| ldr r11, [sp, #36] ; vp8_filter address (first stack argument, above the 36 bytes just pushed) |
| mov r7, r3, lsl #16 ; height is the top 16 bits of the loop counter |
| |
| ldr r4, [r11] ; load up packed filter coefficients (three words, each a pair of 16-bit values) |
| add lr, r1, r3 ; save final destination pointer: lr = output_ptr + cnt, used to recompute dst after each outer pass |
| ldr r5, [r11, #4] |
| ldr r6, [r11, #8] |
| |
| pkhbt r12, r5, r4 ; pack the filter differently: r12 = low half of r5 | high half of r4 (r11 likewise from r6/r5) |
| pkhbt r11, r6, r5 |
| mov r4, #0x40 ; rounding factor (for smlad{x}), used as the initial accumulator value |
| |
| |height_loop_2nd_4| |
| ldrd r8, r9, [r0, #-4] ; load the data: r8 = word at r0 - 4, r9 = word at r0 |
| orr r7, r7, r3, lsr #1 ; loop counter: low bits = cnt/2, one pass per two outputs |
| |
| |width_loop_2nd_4| |
| ldr r10, [r0, #4]! |
| smladx r6, r9, r12, r4 ; apply filter: r6 and r5 accumulate the two outputs of this pass |
| pkhbt r8, r9, r8 |
| smlad r5, r8, r12, r4 |
| pkhbt r8, r10, r9 |
| smladx r6, r10, r11, r6 |
| sub r7, r7, #1 |
| smlad r5, r8, r11, r5 |
| |
| mov r8, r9 ; shift the data for the next loop |
| mov r9, r10 |
| |
| usat r6, #8, r6, asr #7 ; shift and clamp: shift right by 7, saturate to 8 bits unsigned (rounding already added) |
| usat r5, #8, r5, asr #7 |
| |
| strb r5, [r1], r2 ; the result is transposed back and stored (r1 steps by output_pitch) |
| tst r7, #0xff |
| strb r6, [r1], r2 |
| |
| bne width_loop_2nd_4 |
| |
| subs r7, r7, #0x10000 |
| add r0, r0, #16 ; update src for next loop (skip 16 bytes) |
| sub r1, lr, r7, lsr #16 ; update dst for next loop: (output_ptr + cnt) - outer passes remaining |
| |
| bne height_loop_2nd_4 |
| |
| ldmia sp!, {r4 - r11, pc} |
| |
| ENDP |
| |
| ;------------------------------------ |
| ; r0 unsigned char *src_ptr |
| ; r1 unsigned char *output_ptr, |
| ; r2 unsigned int src_pixels_per_line |
| ; r3 unsigned int cnt, |
| ; stack unsigned int output_pitch, |
| ; stack const short *vp8_filter |
| ;--- 6 tap filter along each source line, writing rounded, clamped bytes straight to output_ptr; cnt is used as both line count and line width --- |
| |vp8_filter_block2d_first_pass_only_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| |
| add r7, r2, r3 ; preload next line: prefetch hint at src_ptr + src_pixels_per_line + cnt + 2 |
| add r7, r7, #2 |
| pld [r0, r7] |
| |
| ldr r4, [sp, #36] ; output pitch (first stack argument, above the 36 bytes just pushed) |
| ldr r11, [sp, #40] ; HFilter address (second stack argument) |
| sub sp, sp, #8 |
| |
| mov r7, r3 |
| sub r2, r2, r3 ; the width loop advances the input pointer by cnt, |
| ; so the height loop only needs to add |
| ; (src_pixels_per_line - cnt) to reach the next input line |
| |
| sub r4, r4, r3 |
| str r4, [sp] ; save modified output pitch (output_pitch - cnt) |
| str r2, [sp, #4] |
| |
| mov r2, #0x40 |
| |
| ldr r4, [r11] ; load up packed filter coefficients (three words, each used as a pair of 16-bit taps) |
| ldr r5, [r11, #4] |
| ldr r6, [r11, #8] |
| |
| ; six tap filter |
| |height_loop_1st_only_6| |
| ldrb r8, [r0, #-2] ; load the first three source bytes of this line |
| ldrb r9, [r0, #-1] |
| ldrb r10, [r0], #2 |
| |
| mov r12, r3, lsr #1 ; loop counter: cnt/2 passes, two outputs each |
| |
| |width_loop_1st_only_6| |
| ldrb r11, [r0, #-1] |
| |
| pkhbt lr, r8, r9, lsl #16 ; r9 | r8 |
| pkhbt r8, r9, r10, lsl #16 ; r10 | r9 |
| |
| ldrb r9, [r0] |
| |
| ;; smuad lr, lr, r4 (replaced by the smlad below, which also adds the 0x40 rounding value held in r2) |
| smlad lr, lr, r4, r2 |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| ;; smuad r8, r8, r4 (replaced by the smlad below, which also adds the 0x40 rounding value held in r2) |
| smlad r8, r8, r4, r2 |
| pkhbt r11, r11, r9, lsl #16 ; r9 | r11 |
| |
| smlad lr, r10, r5, lr |
| ldrb r10, [r0, #1] |
| smlad r8, r11, r5, r8 |
| ldrb r11, [r0, #2] |
| |
| subs r12, r12, #1 |
| |
| pkhbt r9, r9, r10, lsl #16 ; r10 | r9 |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| |
| smlad lr, r9, r6, lr |
| smlad r10, r10, r6, r8 |
| |
| ;; add lr, lr, #0x40 ; round_shift_and_clamp (rounding is already in the accumulator; usat does the shift and clamp) |
| ldrneb r8, [r0, #-2] ; load data for next loop (skipped after the last pass) |
| usat lr, #8, lr, asr #7 |
| ;; add r10, r10, #0x40 (rounding is already in the accumulator) |
| strb lr, [r1], #1 ; store the result |
| usat r10, #8, r10, asr #7 |
| |
| ldrneb r9, [r0, #-1] |
| strb r10, [r1], #1 |
| ldrneb r10, [r0], #2 |
| |
| bne width_loop_1st_only_6 |
| |
| ldr lr, [sp] ; load back modified output pitch (output_pitch - cnt) |
| ldr r12, [sp, #4] ; load back modified source stride (src_pixels_per_line - cnt) |
| subs r7, r7, #1 |
| add r0, r0, r12 ; update src for next loop |
| |
| add r11, r12, r3 ; preload next line (prefetch hint only) |
| add r11, r11, #2 |
| pld [r0, r11] |
| |
| add r1, r1, lr ; update dst for next loop |
| |
| bne height_loop_1st_only_6 |
| |
| add sp, sp, #8 |
| ldmia sp!, {r4 - r11, pc} |
| ENDP ; |vp8_filter_block2d_first_pass_only_armv6| |
| |
| |
| ;------------------------------------ |
| ; r0 unsigned char *src_ptr, |
| ; r1 unsigned char *output_ptr, |
| ; r2 unsigned int src_pixels_per_line |
| ; r3 unsigned int cnt, |
| ; stack unsigned int output_pitch, |
| ; stack const short *vp8_filter |
| ;--- 6 tap filter down each source column (taps are src_pixels_per_line bytes apart), writing rounded, clamped bytes output_pitch apart; one column per outer pass --- |
| |vp8_filter_block2d_second_pass_only_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| |
| ldr r11, [sp, #40] ; VFilter address (second stack argument, above the 36 bytes just pushed) |
| ldr r12, [sp, #36] ; output pitch (first stack argument) |
| |
| mov r7, r3, lsl #16 ; outer (column) count is the top 16 bits of the loop counter |
| sub r0, r0, r2, lsl #1 ; start two lines above src_ptr: need 6 elements for filtering, 2 before, 3 after |
| |
| sub sp, sp, #8 |
| |
| ldr r4, [r11] ; load up packed filter coefficients (three words, each used as a pair of 16-bit taps) |
| ldr r5, [r11, #4] |
| ldr r6, [r11, #8] |
| |
| str r0, [sp] ; save r0 (top of the current source column) to stack |
| str r1, [sp, #4] ; save dst (top of the current output column) to stack |
| |
| ; six tap filter |
| |width_loop_2nd_only_6| |
| ldrb r8, [r0], r2 ; load data: walk down the column, one line (r2 bytes) per load |
| orr r7, r7, r3 ; loop counter: low bits = cnt, decremented by 2 per inner pass |
| ldrb r9, [r0], r2 |
| ldrb r10, [r0], r2 |
| |
| |height_loop_2nd_only_6| |
| ; filter one column in this inner loop, then move to the next column. |
| ldrb r11, [r0], r2 |
| |
| pkhbt lr, r8, r9, lsl #16 ; r9 | r8 |
| pkhbt r8, r9, r10, lsl #16 ; r10 | r9 |
| |
| ldrb r9, [r0], r2 |
| |
| smuad lr, lr, r4 |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| smuad r8, r8, r4 |
| pkhbt r11, r11, r9, lsl #16 ; r9 | r11 |
| |
| smlad lr, r10, r5, lr |
| ldrb r10, [r0], r2 |
| smlad r8, r11, r5, r8 |
| ldrb r11, [r0] |
| |
| sub r7, r7, #2 |
| sub r0, r0, r2, lsl #2 |
| |
| pkhbt r9, r9, r10, lsl #16 ; r10 | r9 |
| pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
| |
| smlad lr, r9, r6, lr |
| smlad r10, r10, r6, r8 |
| |
| ands r9, r7, #0xff |
| |
| add lr, lr, #0x40 ; round_shift_and_clamp: add 0x40, shift right by 7, saturate to 8 bits unsigned |
| ldrneb r8, [r0], r2 ; load data for next loop (skipped after the last pass) |
| usat lr, #8, lr, asr #7 |
| add r10, r10, #0x40 |
| strb lr, [r1], r12 ; store the result for the column (r1 steps by output_pitch) |
| usat r10, #8, r10, asr #7 |
| |
| ldrneb r9, [r0], r2 |
| strb r10, [r1], r12 |
| ldrneb r10, [r0], r2 |
| |
| bne height_loop_2nd_only_6 |
| |
| ldr r0, [sp] |
| ldr r1, [sp, #4] |
| subs r7, r7, #0x10000 |
| add r0, r0, #1 ; move to filter next column |
| str r0, [sp] |
| add r1, r1, #1 |
| str r1, [sp, #4] |
| |
| bne width_loop_2nd_only_6 |
| |
| add sp, sp, #8 |
| |
| ldmia sp!, {r4 - r11, pc} |
| ENDP ; |vp8_filter_block2d_second_pass_only_armv6| |
| |
| END |