third_party/libvpx/vpx_dsp/x86/deblock_sse2.asm - cobalt - Git at Google

 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;


 %include "vpx_ports/x86_abi_support.asm"

 ;macro in deblock functions
 %macro FIRST_2_ROWS 0
         movdqa      xmm4,       xmm0
         movdqa      xmm6,       xmm0
         movdqa      xmm5,       xmm1
         pavgb       xmm5,       xmm3

         ;calculate absolute value
         psubusb     xmm4,       xmm1
         psubusb     xmm1,       xmm0
         psubusb     xmm6,       xmm3
         psubusb     xmm3,       xmm0
         paddusb     xmm4,       xmm1
         paddusb     xmm6,       xmm3

         ;get threshold
         movdqa      xmm2,       flimit
         pxor        xmm1,       xmm1
         movdqa      xmm7,       xmm2

         ;get mask
         psubusb     xmm2,       xmm4
         psubusb     xmm7,       xmm6
         pcmpeqb     xmm2,       xmm1
         pcmpeqb     xmm7,       xmm1
         por         xmm7,       xmm2
 %endmacro

 %macro SECOND_2_ROWS 0
         movdqa      xmm6,       xmm0
         movdqa      xmm4,       xmm0
         movdqa      xmm2,       xmm1
         pavgb       xmm1,       xmm3

         ;calculate absolute value
         psubusb     xmm6,       xmm2
         psubusb     xmm2,       xmm0
         psubusb     xmm4,       xmm3
         psubusb     xmm3,       xmm0
         paddusb     xmm6,       xmm2
         paddusb     xmm4,       xmm3

         pavgb       xmm5,       xmm1

         ;get threshold
         movdqa      xmm2,       flimit
         pxor        xmm1,       xmm1
         movdqa      xmm3,       xmm2

         ;get mask
         psubusb     xmm2,       xmm6
         psubusb     xmm3,       xmm4
         pcmpeqb     xmm2,       xmm1
         pcmpeqb     xmm3,       xmm1

         por         xmm7,       xmm2
         por         xmm7,       xmm3

         pavgb       xmm5,       xmm0

         ;decide if or not to use filtered value
         pand        xmm0,       xmm7
         pandn       xmm7,       xmm5
         paddusb     xmm0,       xmm7
 %endmacro

 %macro UPDATE_FLIMIT 0
         movdqu      xmm2,       XMMWORD PTR [rbx]
         movdqu      [rsp],      xmm2
         add         rbx,        16
 %endmacro

 SECTION .text

 ;void vpx_post_proc_down_and_across_mb_row_sse2
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned char *dst_ptr,
 ;    int src_pixels_per_line,
 ;    int dst_pixels_per_line,
 ;    int cols,
 ;    int *flimits,
 ;    int size
 ;)
 globalsym(vpx_post_proc_down_and_across_mb_row_sse2)
 sym(vpx_post_proc_down_and_across_mb_row_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
     SAVE_XMM 7
     push        rbx
     push        rsi
     push        rdi
     ; end prolog
     ALIGN_STACK 16, rax
     sub         rsp, 16

         ; put flimit on stack
         mov         rbx,        arg(5)           ;flimits ptr
         UPDATE_FLIMIT

 %define flimit [rsp]

         mov         rsi,        arg(0)           ;src_ptr
         mov         rdi,        arg(1)           ;dst_ptr

         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
         movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
 .nextrow:
         xor         rdx,        rdx              ;col
 .nextcol:
         ;load current and next 2 rows
         movdqu      xmm0,       XMMWORD PTR [rsi]
         movdqu      xmm1,       XMMWORD PTR [rsi + rax]
         movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

         FIRST_2_ROWS

         ;load above 2 rows
         neg         rax
         movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
         movdqu      xmm3,       XMMWORD PTR [rsi + rax]

         SECOND_2_ROWS

         movdqu      XMMWORD PTR [rdi], xmm0

         neg         rax                          ; positive stride
         add         rsi,        16
         add         rdi,        16

         add         rdx,        16
         cmp         edx,        dword arg(4)     ;cols
         jge         .downdone
         UPDATE_FLIMIT
         jmp         .nextcol

 .downdone:
         ; done with the all cols, start the across filtering in place
         sub         rsi,        rdx
         sub         rdi,        rdx

         mov         rbx,        arg(5) ; flimits
         UPDATE_FLIMIT

         ; dup the first byte into the left border 8 times
         movq        mm1,   [rdi]
         punpcklbw   mm1,   mm1
         punpcklwd   mm1,   mm1
         punpckldq   mm1,   mm1
         mov         rdx,    -8
         movq        [rdi+rdx], mm1

         ; dup the last byte into the right border
         movsxd      rdx,    dword arg(4)
         movq        mm1,   [rdi + rdx + -1]
         punpcklbw   mm1,   mm1
         punpcklwd   mm1,   mm1
         punpckldq   mm1,   mm1
         movq        [rdi+rdx], mm1

         xor         rdx,        rdx
         movq        mm0,        QWORD PTR [rdi-16];
         movq        mm1,        QWORD PTR [rdi-8];

 .acrossnextcol:
         movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
         movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
         movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

         FIRST_2_ROWS

         movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
         movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

         SECOND_2_ROWS

         movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
         movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
         movdq2q     mm0,        xmm0
         psrldq      xmm0,       8
         movdq2q     mm1,        xmm0

         add         rdx,        16
         cmp         edx,        dword arg(4)     ;cols
         jge         .acrossdone
         UPDATE_FLIMIT
         jmp         .acrossnextcol

 .acrossdone:
         ; last 16 pixels
         movq        QWORD PTR [rdi+rdx-16], mm0

         cmp         edx,        dword arg(4)
         jne         .throw_last_8
         movq        QWORD PTR [rdi+rdx-8], mm1
 .throw_last_8:
         ; done with this rwo
         add         rsi,rax                      ;next src line
         mov         eax, dword arg(3)            ;dst_pixels_per_line
         add         rdi,rax                      ;next destination
         mov         eax, dword arg(2)            ;src_pixels_per_line

         mov         rbx,        arg(5)           ;flimits
         UPDATE_FLIMIT

         dec         rcx                          ;decrement count
         jnz         .nextrow                     ;next row

     add rsp, 16
     pop rsp
     ; begin epilog
     pop rdi
     pop rsi
     pop rbx
     RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
 %undef flimit


 ;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
 ;                                    int pitch, int rows, int cols,int flimit)
 globalsym(vpx_mbpost_proc_across_ip_sse2)
 sym(vpx_mbpost_proc_across_ip_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
     SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog

     ALIGN_STACK 16, rax
     sub         rsp, 16

     ; create flimit4 at [rsp]
     mov         eax, dword ptr arg(4) ;flimit
     mov         [rsp], eax
     mov         [rsp+4], eax
     mov         [rsp+8], eax
     mov         [rsp+12], eax
 %define flimit4 [rsp]


     ;for(r=0;r<rows;r++)
 .ip_row_loop:

         xor         rdx,    rdx ;sumsq=0;
         xor         rcx,    rcx ;sum=0;
         mov         rsi,    arg(0); s


         ; dup the first byte into the left border 8 times
         movq        mm1,   [rsi]
         punpcklbw   mm1,   mm1
         punpcklwd   mm1,   mm1
         punpckldq   mm1,   mm1

         mov         rdi,    -8
         movq        [rsi+rdi], mm1

         ; dup the last byte into the right border
         movsxd      rdx,    dword arg(3)
         movq        mm1,   [rsi + rdx + -1]
         punpcklbw   mm1,   mm1
         punpcklwd   mm1,   mm1
         punpckldq   mm1,   mm1
         movq        [rsi+rdx], mm1

 .ip_var_loop:
         ;for(i=-8;i<=6;i++)
         ;{
         ;    sumsq += s[i]*s[i];
         ;    sum   += s[i];
         ;}
         movzx       eax, byte [rsi+rdi]
         add         ecx, eax
         mul         al
         add         edx, eax
         add         rdi, 1
         cmp         rdi, 6
         jle         .ip_var_loop


             ;mov         rax,    sumsq
             ;movd        xmm7,   rax
             movd        xmm7,   edx

             ;mov         rax,    sum
             ;movd        xmm6,   rax
             movd        xmm6,   ecx

             mov         rsi,    arg(0) ;s
             xor         rcx,    rcx

             movsxd      rdx,    dword arg(3) ;cols
             add         rdx,    8
             pxor        mm0,    mm0
             pxor        mm1,    mm1

             pxor        xmm0,   xmm0
 .nextcol4:

             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10

             punpcklbw   xmm1,   xmm0                    ; expanding
             punpcklbw   xmm2,   xmm0                    ; expanding

             punpcklwd   xmm1,   xmm0                    ; expanding to dwords
             punpcklwd   xmm2,   xmm0                    ; expanding to dwords

             psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
             paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2

             paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
             pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5

             paddd       xmm6,   xmm2
             paddd       xmm7,   xmm1

             pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
             pshufd      xmm7,   xmm7,   0               ; duplicate the last ones

             psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
             psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000

             pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
             pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared

             paddd       xmm6,   xmm4
             paddd       xmm7,   xmm3

             pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
             pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared

             paddd       xmm7,   xmm3
             paddd       xmm6,   xmm4

             pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
             pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared

             paddd       xmm7,   xmm3
             paddd       xmm6,   xmm4

             movdqa      xmm3,   xmm6
             pmaddwd     xmm3,   xmm3

             movdqa      xmm5,   xmm7
             pslld       xmm5,   4

             psubd       xmm5,   xmm7
             psubd       xmm5,   xmm3

             psubd       xmm5,   flimit4
             psrad       xmm5,   31

             packssdw    xmm5,   xmm0
             packsswb    xmm5,   xmm0

             movd        xmm1,   DWORD PTR [rsi+rcx]
             movq        xmm2,   xmm1

             punpcklbw   xmm1,   xmm0
             punpcklwd   xmm1,   xmm0

             paddd       xmm1,   xmm6
             paddd       xmm1,   [GLOBAL(four8s)]

             psrad       xmm1,   4
             packssdw    xmm1,   xmm0

             packuswb    xmm1,   xmm0
             pand        xmm1,   xmm5

             pandn       xmm5,   xmm2
             por         xmm5,   xmm1

             movd        [rsi+rcx-8],  mm0
             movq        mm0,    mm1

             movdq2q     mm1,    xmm5
             psrldq      xmm7,   12

             psrldq      xmm6,   12
             add         rcx,    4

             cmp         rcx,    rdx
             jl          .nextcol4

         ;s+=pitch;
         movsxd rax, dword arg(1)
         add    arg(0), rax

         sub dword arg(2), 1 ;rows-=1
         cmp dword arg(2), 0
         jg .ip_row_loop

     add         rsp, 16
     pop         rsp

     ; begin epilog
     pop rdi
     pop rsi
     RESTORE_GOT
     RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
 %undef flimit4


 SECTION_RODATA
 align 16
 four8s:
     times 4 dd 8
	;
	; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
	;
	; Use of this source code is governed by a BSD-style license
	; that can be found in the LICENSE file in the root of the source
	; tree. An additional intellectual property rights grant can be found
	; in the file PATENTS. All contributing project authors may
	; be found in the AUTHORS file in the root of the source tree.
	;


	%include "vpx_ports/x86_abi_support.asm"

	;macro in deblock functions
	%macro FIRST_2_ROWS 0
	movdqa xmm4, xmm0
	movdqa xmm6, xmm0
	movdqa xmm5, xmm1
	pavgb xmm5, xmm3

	;calculate absolute value
	psubusb xmm4, xmm1
	psubusb xmm1, xmm0
	psubusb xmm6, xmm3
	psubusb xmm3, xmm0
	paddusb xmm4, xmm1
	paddusb xmm6, xmm3

	;get threshold
	movdqa xmm2, flimit
	pxor xmm1, xmm1
	movdqa xmm7, xmm2

	;get mask
	psubusb xmm2, xmm4
	psubusb xmm7, xmm6
	pcmpeqb xmm2, xmm1
	pcmpeqb xmm7, xmm1
	por xmm7, xmm2
	%endmacro

	%macro SECOND_2_ROWS 0
	movdqa xmm6, xmm0
	movdqa xmm4, xmm0
	movdqa xmm2, xmm1
	pavgb xmm1, xmm3

	;calculate absolute value
	psubusb xmm6, xmm2
	psubusb xmm2, xmm0
	psubusb xmm4, xmm3
	psubusb xmm3, xmm0
	paddusb xmm6, xmm2
	paddusb xmm4, xmm3

	pavgb xmm5, xmm1

	;get threshold
	movdqa xmm2, flimit
	pxor xmm1, xmm1
	movdqa xmm3, xmm2

	;get mask
	psubusb xmm2, xmm6
	psubusb xmm3, xmm4
	pcmpeqb xmm2, xmm1
	pcmpeqb xmm3, xmm1

	por xmm7, xmm2
	por xmm7, xmm3

	pavgb xmm5, xmm0

	;decide if or not to use filtered value
	pand xmm0, xmm7
	pandn xmm7, xmm5
	paddusb xmm0, xmm7
	%endmacro

	%macro UPDATE_FLIMIT 0
	movdqu xmm2, XMMWORD PTR [rbx]
	movdqu [rsp], xmm2
	add rbx, 16
	%endmacro

	SECTION .text

	;void vpx_post_proc_down_and_across_mb_row_sse2
	;(
	; unsigned char *src_ptr,
	; unsigned char *dst_ptr,
	; int src_pixels_per_line,
	; int dst_pixels_per_line,
	; int cols,
	; int *flimits,
	; int size
	;)
	globalsym(vpx_post_proc_down_and_across_mb_row_sse2)
	sym(vpx_post_proc_down_and_across_mb_row_sse2):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 7
	SAVE_XMM 7
	push rbx
	push rsi
	push rdi
	; end prolog
	ALIGN_STACK 16, rax
	sub rsp, 16

	; put flimit on stack
	mov rbx, arg(5) ;flimits ptr
	UPDATE_FLIMIT

	%define flimit [rsp]

	mov rsi, arg(0) ;src_ptr
	mov rdi, arg(1) ;dst_ptr

	movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
	movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
	.nextrow:
	xor rdx, rdx ;col
	.nextcol:
	;load current and next 2 rows
	movdqu xmm0, XMMWORD PTR [rsi]
	movdqu xmm1, XMMWORD PTR [rsi + rax]
	movdqu xmm3, XMMWORD PTR [rsi + 2*rax]

	FIRST_2_ROWS

	;load above 2 rows
	neg rax
	movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
	movdqu xmm3, XMMWORD PTR [rsi + rax]

	SECOND_2_ROWS

	movdqu XMMWORD PTR [rdi], xmm0

	neg rax ; positive stride
	add rsi, 16
	add rdi, 16

	add rdx, 16
	cmp edx, dword arg(4) ;cols
	jge .downdone
	UPDATE_FLIMIT
	jmp .nextcol

	.downdone:
	; done with the all cols, start the across filtering in place
	sub rsi, rdx
	sub rdi, rdx

	mov rbx, arg(5) ; flimits
	UPDATE_FLIMIT

	; dup the first byte into the left border 8 times
	movq mm1, [rdi]
	punpcklbw mm1, mm1
	punpcklwd mm1, mm1
	punpckldq mm1, mm1
	mov rdx, -8
	movq [rdi+rdx], mm1

	; dup the last byte into the right border
	movsxd rdx, dword arg(4)
	movq mm1, [rdi + rdx + -1]
	punpcklbw mm1, mm1
	punpcklwd mm1, mm1
	punpckldq mm1, mm1
	movq [rdi+rdx], mm1

	xor rdx, rdx
	movq mm0, QWORD PTR [rdi-16];
	movq mm1, QWORD PTR [rdi-8];

	.acrossnextcol:
	movdqu xmm0, XMMWORD PTR [rdi + rdx]
	movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
	movdqu xmm3, XMMWORD PTR [rdi + rdx -1]

	FIRST_2_ROWS

	movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
	movdqu xmm3, XMMWORD PTR [rdi + rdx +2]

	SECOND_2_ROWS

	movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
	movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
	movdq2q mm0, xmm0
	psrldq xmm0, 8
	movdq2q mm1, xmm0

	add rdx, 16
	cmp edx, dword arg(4) ;cols
	jge .acrossdone
	UPDATE_FLIMIT
	jmp .acrossnextcol

	.acrossdone:
	; last 16 pixels
	movq QWORD PTR [rdi+rdx-16], mm0

	cmp edx, dword arg(4)
	jne .throw_last_8
	movq QWORD PTR [rdi+rdx-8], mm1
	.throw_last_8:
	; done with this rwo
	add rsi,rax ;next src line
	mov eax, dword arg(3) ;dst_pixels_per_line
	add rdi,rax ;next destination
	mov eax, dword arg(2) ;src_pixels_per_line

	mov rbx, arg(5) ;flimits
	UPDATE_FLIMIT

	dec rcx ;decrement count
	jnz .nextrow ;next row

	add rsp, 16
	pop rsp
	; begin epilog
	pop rdi
	pop rsi
	pop rbx
	RESTORE_XMM
	UNSHADOW_ARGS
	pop rbp
	ret
	%undef flimit


	;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
	; int pitch, int rows, int cols,int flimit)
	globalsym(vpx_mbpost_proc_across_ip_sse2)
	sym(vpx_mbpost_proc_across_ip_sse2):
	push rbp
	mov rbp, rsp
	SHADOW_ARGS_TO_STACK 5
	SAVE_XMM 7
	GET_GOT rbx
	push rsi
	push rdi
	; end prolog

	ALIGN_STACK 16, rax
	sub rsp, 16

	; create flimit4 at [rsp]
	mov eax, dword ptr arg(4) ;flimit
	mov [rsp], eax
	mov [rsp+4], eax
	mov [rsp+8], eax
	mov [rsp+12], eax
	%define flimit4 [rsp]


	;for(r=0;r<rows;r++)
	.ip_row_loop:

	xor rdx, rdx ;sumsq=0;
	xor rcx, rcx ;sum=0;
	mov rsi, arg(0); s


	; dup the first byte into the left border 8 times
	movq mm1, [rsi]
	punpcklbw mm1, mm1
	punpcklwd mm1, mm1
	punpckldq mm1, mm1

	mov rdi, -8
	movq [rsi+rdi], mm1

	; dup the last byte into the right border
	movsxd rdx, dword arg(3)
	movq mm1, [rsi + rdx + -1]
	punpcklbw mm1, mm1
	punpcklwd mm1, mm1
	punpckldq mm1, mm1
	movq [rsi+rdx], mm1

	.ip_var_loop:
	;for(i=-8;i<=6;i++)
	;{
	; sumsq += s[i]*s[i];
	; sum += s[i];
	;}
	movzx eax, byte [rsi+rdi]
	add ecx, eax
	mul al
	add edx, eax
	add rdi, 1
	cmp rdi, 6
	jle .ip_var_loop


	;mov rax, sumsq
	;movd xmm7, rax
	movd xmm7, edx

	;mov rax, sum
	;movd xmm6, rax
	movd xmm6, ecx

	mov rsi, arg(0) ;s
	xor rcx, rcx

	movsxd rdx, dword arg(3) ;cols
	add rdx, 8
	pxor mm0, mm0
	pxor mm1, mm1

	pxor xmm0, xmm0
	.nextcol4:

	movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
	movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10

	punpcklbw xmm1, xmm0 ; expanding
	punpcklbw xmm2, xmm0 ; expanding

	punpcklwd xmm1, xmm0 ; expanding to dwords
	punpcklwd xmm2, xmm0 ; expanding to dwords

	psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
	paddd xmm1, xmm1 ; -82 -72 -62 -52

	paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
	pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5

	paddd xmm6, xmm2
	paddd xmm7, xmm1

	pshufd xmm6, xmm6, 0 ; duplicate the last ones
	pshufd xmm7, xmm7, 0 ; duplicate the last ones

	psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
	psrldq xmm2, 4 ; 8--7 9--6 10--5 0000

	pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
	pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared

	paddd xmm6, xmm4
	paddd xmm7, xmm3

	pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
	pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared

	paddd xmm7, xmm3
	paddd xmm6, xmm4

	pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
	pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared

	paddd xmm7, xmm3
	paddd xmm6, xmm4

	movdqa xmm3, xmm6
	pmaddwd xmm3, xmm3

	movdqa xmm5, xmm7
	pslld xmm5, 4

	psubd xmm5, xmm7
	psubd xmm5, xmm3

	psubd xmm5, flimit4
	psrad xmm5, 31

	packssdw xmm5, xmm0
	packsswb xmm5, xmm0

	movd xmm1, DWORD PTR [rsi+rcx]
	movq xmm2, xmm1

	punpcklbw xmm1, xmm0
	punpcklwd xmm1, xmm0

	paddd xmm1, xmm6
	paddd xmm1, [GLOBAL(four8s)]

	psrad xmm1, 4
	packssdw xmm1, xmm0

	packuswb xmm1, xmm0
	pand xmm1, xmm5

	pandn xmm5, xmm2
	por xmm5, xmm1

	movd [rsi+rcx-8], mm0
	movq mm0, mm1

	movdq2q mm1, xmm5
	psrldq xmm7, 12

	psrldq xmm6, 12
	add rcx, 4

	cmp rcx, rdx
	jl .nextcol4

	;s+=pitch;
	movsxd rax, dword arg(1)
	add arg(0), rax

	sub dword arg(2), 1 ;rows-=1
	cmp dword arg(2), 0
	jg .ip_row_loop

	add rsp, 16
	pop rsp

	; begin epilog
	pop rdi
	pop rsi
	RESTORE_GOT
	RESTORE_XMM
	UNSHADOW_ARGS
	pop rbp
	ret
	%undef flimit4


	SECTION_RODATA
	align 16
	four8s:
	times 4 dd 8