| ; |
| ; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2) |
| ; |
| ; Copyright (C) 2016, 2018, Matthieu Darbois |
| ; |
| ; Based on the x86 SIMD extension for IJG JPEG library |
| ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| ; |
| ; This file should be assembled with NASM (Netwide Assembler) and |
| ; can *not* be assembled with Microsoft's MASM or any compatible |
| ; assembler (including Borland's Turbo Assembler). |
| ; NASM is available from http://nasm.sourceforge.net/ or |
| ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
| ; |
| ; This file contains an SSE2 implementation of data preparation for progressive |
| ; Huffman encoding. See jcphuff.c for more details. |
| ; |
| ; [TAB8] |
| |
| %include "jsimdext.inc" |
| |
| ; -------------------------------------------------------------------------- |
| SECTION SEG_TEXT |
| BITS 32 |
| |
| ; -------------------------------------------------------------------------- |
| ; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and |
| ; jsimd_encode_mcu_AC_refine_prepare_sse2() |
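| ; |
| ; The coefficients are gathered word-by-word through the natural-order |
| ; lookup table, since the scan order makes the loads non-contiguous. |
| ; A rough scalar sketch of what LOAD16 builds (x0/x1 mirror the X0/X1 |
| ; registers defined below): |
| ; |
| ;   for (i = 0; i < 8; i++) { |
| ;     x0[i] = block[jpeg_natural_order_start[i]]; |
| ;     x1[i] = block[jpeg_natural_order_start[i + 8]]; |
| ;   } |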
| |
| %macro LOAD16 0 |
| pxor N0, N0 |
| pxor N1, N1 |
| |
| mov T0, INT [LUT + 0*SIZEOF_INT] |
| mov T1, INT [LUT + 8*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 0 |
| pinsrw X1, word [BLOCK + T1 * 2], 0 |
| |
| mov T0, INT [LUT + 1*SIZEOF_INT] |
| mov T1, INT [LUT + 9*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 1 |
| pinsrw X1, word [BLOCK + T1 * 2], 1 |
| |
| mov T0, INT [LUT + 2*SIZEOF_INT] |
| mov T1, INT [LUT + 10*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 2 |
| pinsrw X1, word [BLOCK + T1 * 2], 2 |
| |
| mov T0, INT [LUT + 3*SIZEOF_INT] |
| mov T1, INT [LUT + 11*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 3 |
| pinsrw X1, word [BLOCK + T1 * 2], 3 |
| |
| mov T0, INT [LUT + 4*SIZEOF_INT] |
| mov T1, INT [LUT + 12*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 4 |
| pinsrw X1, word [BLOCK + T1 * 2], 4 |
| |
| mov T0, INT [LUT + 5*SIZEOF_INT] |
| mov T1, INT [LUT + 13*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 5 |
| pinsrw X1, word [BLOCK + T1 * 2], 5 |
| |
| mov T0, INT [LUT + 6*SIZEOF_INT] |
| mov T1, INT [LUT + 14*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 6 |
| pinsrw X1, word [BLOCK + T1 * 2], 6 |
| |
| mov T0, INT [LUT + 7*SIZEOF_INT] |
| mov T1, INT [LUT + 15*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 7 |
| pinsrw X1, word [BLOCK + T1 * 2], 7 |
| %endmacro |
| |
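| ; Like LOAD16, but for 9..15 coefficients: all eight lanes of X0 are |
| ; loaded, plus the low LENEND lanes of X1; unused lanes of X1 stay zero. |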
| %macro LOAD15 0 |
| pxor N0, N0 |
| pxor N1, N1 |
| pxor X1, X1 |
| |
| mov T0, INT [LUT + 0*SIZEOF_INT] |
| mov T1, INT [LUT + 8*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 0 |
| pinsrw X1, word [BLOCK + T1 * 2], 0 |
| |
| mov T0, INT [LUT + 1*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 1 |
| |
| mov T0, INT [LUT + 2*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 2 |
| |
| mov T0, INT [LUT + 3*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 3 |
| |
| mov T0, INT [LUT + 4*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 4 |
| |
| mov T0, INT [LUT + 5*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 5 |
| |
| mov T0, INT [LUT + 6*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 6 |
| |
| mov T0, INT [LUT + 7*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 7 |
| |
| cmp LENEND, 2 |
| jl %%.ELOAD15 |
| mov T1, INT [LUT + 9*SIZEOF_INT] |
| pinsrw X1, word [BLOCK + T1 * 2], 1 |
| |
| cmp LENEND, 3 |
| jl %%.ELOAD15 |
| mov T1, INT [LUT + 10*SIZEOF_INT] |
| pinsrw X1, word [BLOCK + T1 * 2], 2 |
| |
| cmp LENEND, 4 |
| jl %%.ELOAD15 |
| mov T1, INT [LUT + 11*SIZEOF_INT] |
| pinsrw X1, word [BLOCK + T1 * 2], 3 |
| |
| cmp LENEND, 5 |
| jl %%.ELOAD15 |
| mov T1, INT [LUT + 12*SIZEOF_INT] |
| pinsrw X1, word [BLOCK + T1 * 2], 4 |
| |
| cmp LENEND, 6 |
| jl %%.ELOAD15 |
| mov T1, INT [LUT + 13*SIZEOF_INT] |
| pinsrw X1, word [BLOCK + T1 * 2], 5 |
| |
| cmp LENEND, 7 |
| jl %%.ELOAD15 |
| mov T1, INT [LUT + 14*SIZEOF_INT] |
| pinsrw X1, word [BLOCK + T1 * 2], 6 |
| %%.ELOAD15: |
| %endmacro |
| |
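| ; Load exactly eight coefficients into X0. |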
| %macro LOAD8 0 |
| pxor N0, N0 |
| |
| mov T0, INT [LUT + 0*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 0 |
| |
| mov T0, INT [LUT + 1*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 1 |
| |
| mov T0, INT [LUT + 2*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 2 |
| |
| mov T0, INT [LUT + 3*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 3 |
| |
| mov T0, INT [LUT + 4*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 4 |
| |
| mov T0, INT [LUT + 5*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 5 |
| |
| mov T0, INT [LUT + 6*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 6 |
| |
| mov T0, INT [LUT + 7*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T0 * 2], 7 |
| %endmacro |
| |
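| ; Load LENEND (1..7) coefficients into the low lanes of X0; unused lanes |
| ; stay zero. |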
| %macro LOAD7 0 |
| pxor N0, N0 |
| pxor X0, X0 |
| |
| mov T1, INT [LUT + 0*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T1 * 2], 0 |
| |
| cmp LENEND, 2 |
| jl %%.ELOAD7 |
| mov T1, INT [LUT + 1*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T1 * 2], 1 |
| |
| cmp LENEND, 3 |
| jl %%.ELOAD7 |
| mov T1, INT [LUT + 2*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T1 * 2], 2 |
| |
| cmp LENEND, 4 |
| jl %%.ELOAD7 |
| mov T1, INT [LUT + 3*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T1 * 2], 3 |
| |
| cmp LENEND, 5 |
| jl %%.ELOAD7 |
| mov T1, INT [LUT + 4*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T1 * 2], 4 |
| |
| cmp LENEND, 6 |
| jl %%.ELOAD7 |
| mov T1, INT [LUT + 5*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T1 * 2], 5 |
| |
| cmp LENEND, 7 |
| jl %%.ELOAD7 |
| mov T1, INT [LUT + 6*SIZEOF_INT] |
| pinsrw X0, word [BLOCK + T1 * 2], 6 |
| %%.ELOAD7: |
| %endmacro |
| |
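| ; REDUCE0 builds the 64-bit zero-coefficient bitmap from the 64 words at |
| ; VALUES: bit k of the result is set iff values[k] != 0.  A scalar sketch |
| ; of the computation (mask0/mask1 are illustrative names): |
| ; |
| ;   unsigned int mask0 = 0, mask1 = 0; |
| ;   for (k = 0; k < 32; k++) |
| ;     mask0 |= (unsigned int)(values[k] != 0) << k; |
| ;   for (k = 32; k < 64; k++) |
| ;     mask1 |= (unsigned int)(values[k] != 0) << (k - 32); |
| ;   zerobits[0] = mask0; |
| ;   zerobits[1] = mask1; |
| ; |
| ; The macro clobbers xmm0-xmm7 and eax/ecx/edx/esi/edi, so both callers |
| ; invoke it only after all other work is done. |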
| %macro REDUCE0 0 |
| movdqa xmm0, XMMWORD [VALUES + ( 0*2)] |
| movdqa xmm1, XMMWORD [VALUES + ( 8*2)] |
| movdqa xmm2, XMMWORD [VALUES + (16*2)] |
| movdqa xmm3, XMMWORD [VALUES + (24*2)] |
| movdqa xmm4, XMMWORD [VALUES + (32*2)] |
| movdqa xmm5, XMMWORD [VALUES + (40*2)] |
| movdqa xmm6, XMMWORD [VALUES + (48*2)] |
| |
| pcmpeqw xmm0, ZERO |
| pcmpeqw xmm1, ZERO |
| pcmpeqw xmm2, ZERO |
| pcmpeqw xmm3, ZERO |
| pcmpeqw xmm4, ZERO |
| pcmpeqw xmm5, ZERO |
| pcmpeqw xmm6, ZERO |
| pcmpeqw xmm7, XMMWORD [VALUES + (56*2)] |
| |
| packsswb xmm0, xmm1 |
| packsswb xmm2, xmm3 |
| packsswb xmm4, xmm5 |
| packsswb xmm6, xmm7 |
| |
| pmovmskb eax, xmm0 |
| pmovmskb ecx, xmm2 |
| pmovmskb edx, xmm4 |
| pmovmskb esi, xmm6 |
| |
| shl ecx, 16 |
| shl esi, 16 |
| |
| or eax, ecx |
| or edx, esi |
| |
| not eax |
| not edx |
| |
| mov edi, ZEROBITS |
| |
| mov INT [edi], eax |
| mov INT [edi+SIZEOF_INT], edx |
| %endmacro |
| |
| ; |
| ; Prepare data for jsimd_encode_mcu_AC_first(). |
| ; |
| ; GLOBAL(void) |
| ; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block, |
| ; const int *jpeg_natural_order_start, |
| ; int Sl, int Al, JCOEF *values, |
| ; size_t *zerobits) |
| ; |
| ; eax + 8 = const JCOEF *block |
| ; eax + 12 = const int *jpeg_natural_order_start |
| ; eax + 16 = int Sl |
| ; eax + 20 = int Al |
| ; eax + 24 = JCOEF *values |
| ; eax + 28 = size_t *zerobits |
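| ; |
| ; The per-coefficient computation mirrors encode_mcu_AC_first_prepare() in |
| ; jcphuff.c; roughly, in C: |
| ; |
| ;   temp = block[jpeg_natural_order_start[k]]; |
| ;   temp2 = temp >> (CHAR_BIT * sizeof(int) - 1);  /* sign mask */ |
| ;   temp ^= temp2; |
| ;   temp -= temp2;                 /* temp = abs(coefficient) */ |
| ;   temp >>= Al;                   /* apply the point transform */ |
| ;   temp2 ^= temp;                 /* ~temp if negative, temp otherwise */ |
| ;   values[k] = temp; |
| ;   values[k + DCTSIZE2] = temp2; |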
| |
| %define ZERO xmm7 |
| %define X0 xmm0 |
| %define X1 xmm1 |
| %define N0 xmm2 |
| %define N1 xmm3 |
| %define AL xmm4 |
| %define K eax |
| %define LENEND eax |
| %define LUT ebx |
| %define T0 ecx |
| %define T1 edx |
| %define BLOCK esi |
| %define VALUES edi |
| %define LEN ebp |
| |
| %define ZEROBITS INT [esp + 5 * 4] |
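| ; (ZEROBITS lives in the scratch dword reserved by "sub esp, 4" in the |
| ; prologue, addressed past the five pushed registers.) |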
| |
| align 32 |
| GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2) |
| |
| EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): |
| push ebp |
| mov eax, esp ; eax = original ebp |
| sub esp, byte 4 |
| and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
| mov [esp], eax |
| mov ebp, esp ; ebp = aligned ebp |
| sub esp, 4 |
| push ebx |
| push ecx |
| ; push edx ; need not be preserved |
| push esi |
| push edi |
| push ebp |
| |
| mov BLOCK, INT [eax + 8] |
| mov LUT, INT [eax + 12] |
| mov VALUES, INT [eax + 24] |
| movd AL, INT [eax + 20] |
| mov T0, INT [eax + 28] |
| mov ZEROBITS, T0 |
| mov LEN, INT [eax + 16] |
| pxor ZERO, ZERO |
| mov K, LEN |
| and K, -16 |
| shr K, 4 |
| jz .ELOOP16 |
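| ; Main loop: 16 coefficients per iteration. |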
| .BLOOP16: |
| LOAD16 |
| pcmpgtw N0, X0 ; N0/N1 = sign masks (0xFFFF for negative coefficients) |
| pcmpgtw N1, X1 |
| paddw X0, N0 ; X0/X1 = abs(coefficient): add the sign mask, ... |
| paddw X1, N1 |
| pxor X0, N0 ; ...then XOR it |
| pxor X1, N1 |
| psrlw X0, AL ; apply the point transform (>> Al) |
| psrlw X1, AL |
| pxor N0, X0 ; N0/N1 = shifted value, one's-complemented if negative |
| pxor N1, X1 |
| movdqa XMMWORD [VALUES + (0) * 2], X0 ; values[k] = abs(coef) >> Al |
| movdqa XMMWORD [VALUES + (8) * 2], X1 |
| movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 ; output-bits copy |
| movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 |
| add VALUES, 16*2 |
| add LUT, 16*SIZEOF_INT |
| dec K |
| jnz .BLOOP16 |
| .ELOOP16: |
| mov LENEND, LEN |
| and LENEND, 7 |
| |
| test LEN, 8 |
| jz .TRY7 ; 0..7 coefficients remain |
| test LEN, 7 |
| jz .TRY8 ; exactly 8 coefficients remain |
| ; otherwise 9..15 remain: fall through to LOAD15 |
| |
| LOAD15 |
| pcmpgtw N0, X0 |
| pcmpgtw N1, X1 |
| paddw X0, N0 |
| paddw X1, N1 |
| pxor X0, N0 |
| pxor X1, N1 |
| psrlw X0, AL |
| psrlw X1, AL |
| pxor N0, X0 |
| pxor N1, X1 |
| movdqa XMMWORD [VALUES + (0) * 2], X0 |
| movdqa XMMWORD [VALUES + (8) * 2], X1 |
| movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 |
| movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1 |
| add VALUES, 16*2 |
| jmp .PADDING |
| .TRY8: |
| LOAD8 |
| pcmpgtw N0, X0 |
| paddw X0, N0 |
| pxor X0, N0 |
| psrlw X0, AL |
| pxor N0, X0 |
| movdqa XMMWORD [VALUES + (0) * 2], X0 |
| movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 |
| add VALUES, 8*2 |
| jmp .PADDING |
| .TRY7: |
| LOAD7 |
| pcmpgtw N0, X0 |
| paddw X0, N0 |
| pxor X0, N0 |
| psrlw X0, AL |
| pxor N0, X0 |
| movdqa XMMWORD [VALUES + (0) * 2], X0 |
| movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0 |
| add VALUES, 8*2 |
| .PADDING: |
| mov K, LEN |
| add K, 7 |
| and K, -8 |
| shr K, 3 ; K = ceil(Sl / 8) |
| sub K, DCTSIZE2/8 ; negative count of 8-word groups left to zero-fill |
| jz .EPADDING |
| align 16 |
| .ZEROLOOP: |
| movdqa XMMWORD [VALUES + 0], ZERO |
| add VALUES, 8*2 |
| inc K |
| jnz .ZEROLOOP |
| .EPADDING: |
| sub VALUES, DCTSIZE2*2 |
| |
| REDUCE0 |
| |
| pop ebp |
| pop edi |
| pop esi |
| ; pop edx ; need not be preserved |
| pop ecx |
| pop ebx |
| mov esp, ebp ; esp <- aligned ebp |
| pop esp ; esp <- original ebp |
| pop ebp |
| ret |
| |
| %undef ZERO |
| %undef X0 |
| %undef X1 |
| %undef N0 |
| %undef N1 |
| %undef AL |
| %undef K |
| %undef LUT |
| %undef T0 |
| %undef T1 |
| %undef BLOCK |
| %undef VALUES |
| %undef LEN |
| %undef LENEND |
| %undef ZEROBITS |
| |
| ; |
| ; Prepare data for jsimd_encode_mcu_AC_refine(). |
| ; |
| ; GLOBAL(int) |
| ; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block, |
| ; const int *jpeg_natural_order_start, |
| ; int Sl, int Al, JCOEF *absvalues, |
| ; size_t *bits) |
| ; |
| ; eax + 8 = const JCOEF *block |
| ; eax + 12 = const int *jpeg_natural_order_start |
| ; eax + 16 = int Sl |
| ; eax + 20 = int Al |
| ; eax + 24 = JCOEF *absvalues |
| ; eax + 28 = size_t *bits |
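| ; |
| ; Roughly, in C (cf. encode_mcu_AC_refine_prepare() in jcphuff.c), each |
| ; coefficient becomes its point-transformed absolute value, sign and zero |
| ; bitmaps are collected into bits[], and the index of the last value that |
| ; shifts down to exactly 1 is returned as the EOB position: |
| ; |
| ;   temp = block[jpeg_natural_order_start[k]]; |
| ;   if (temp < 0) temp = -temp;    /* absolute value */ |
| ;   temp >>= Al;                   /* apply the point transform */ |
| ;   absvalues[k] = (JCOEF)temp; |
| ;   if (temp == 1) |
| ;     EOB = k;                     /* index of last newly-nonzero coef */ |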
| |
| %define ZERO xmm7 |
| %define ONE xmm5 |
| %define X0 xmm0 |
| %define X1 xmm1 |
| %define N0 xmm2 |
| %define N1 xmm3 |
| %define AL xmm4 |
| %define K eax |
| %define LENEND eax |
| %define LUT ebx |
| %define T0 ecx |
| %define T0w cx |
| %define T1 edx |
| %define BLOCK esi |
| %define VALUES edi |
| %define KK ebp |
| |
| %define ZEROBITS INT [esp + 5 * 4] |
| %define EOB INT [esp + 5 * 4 + 4] |
| %define LEN INT [esp + 5 * 4 + 8] |
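| ; (These three slots live in the 16-byte scratch area reserved by |
| ; "sub esp, 16" in the prologue, addressed past the five pushed registers.) |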
| |
| align 32 |
| GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2) |
| |
| EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): |
| push ebp |
| mov eax, esp ; eax = original ebp |
| sub esp, byte 4 |
| and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
| mov [esp], eax |
| mov ebp, esp ; ebp = aligned ebp |
| sub esp, 16 |
| push ebx |
| push ecx |
| ; push edx ; need not be preserved |
| push esi |
| push edi |
| push ebp |
| |
| pcmpeqw ONE, ONE ; ONE = 0x0001 in each of the eight words |
| psrlw ONE, 15 |
| mov BLOCK, INT [eax + 8] |
| mov LUT, INT [eax + 12] |
| mov VALUES, INT [eax + 24] |
| movd AL, INT [eax + 20] |
| mov T0, INT [eax + 28] |
| mov K, INT [eax + 16] |
| mov INT [T0 + 2 * SIZEOF_INT], -1 ; initialize the sign bitmap to all-ones |
| mov INT [T0 + 3 * SIZEOF_INT], -1 ;  (unwritten lanes read as non-negative) |
| mov ZEROBITS, T0 |
| mov LEN, K |
| pxor ZERO, ZERO |
| and K, -16 |
| mov EOB, 0 |
| xor KK, KK |
| shr K, 4 |
| jz .ELOOPR16 |
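| ; Main loop: 16 coefficients per iteration, also collecting sign bits |
| ; and tracking the EOB position. |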
| .BLOOPR16: |
| LOAD16 |
| pcmpgtw N0, X0 ; N0/N1 = sign masks (0xFFFF for negative coefficients) |
| pcmpgtw N1, X1 |
| paddw X0, N0 ; X0/X1 = abs(coefficient) >> Al, as in the |
| paddw X1, N1 ;  first-scan function above |
| pxor X0, N0 |
| pxor X1, N1 |
| psrlw X0, AL |
| psrlw X1, AL |
| movdqa XMMWORD [VALUES + (0) * 2], X0 |
| movdqa XMMWORD [VALUES + (8) * 2], X1 |
| pcmpeqw X0, ONE |
| pcmpeqw X1, ONE |
| packsswb N0, N1 |
| packsswb X0, X1 |
| pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); |
| mov T1, ZEROBITS |
| not T0 |
| mov word [T1 + 2 * SIZEOF_INT + KK], T0w |
| pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); |
| bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); |
| jz .CONTINUER16 ; if (idx) { |
| lea T1, [T1+KK*8] |
| mov EOB, T1 ; EOB = k + idx; |
| .CONTINUER16: |
| add VALUES, 16*2 |
| add LUT, 16*SIZEOF_INT |
| add KK, 2 |
| dec K |
| jnz .BLOOPR16 |
| .ELOOPR16: |
| mov LENEND, LEN |
| |
| test LENEND, 8 |
| jz .TRYR7 ; 0..7 coefficients remain |
| test LENEND, 7 |
| jz .TRYR8 ; exactly 8 coefficients remain |
| ; otherwise 9..15 remain: fall through to LOAD15 |
| |
| and LENEND, 7 |
| LOAD15 |
| pcmpgtw N0, X0 |
| pcmpgtw N1, X1 |
| paddw X0, N0 |
| paddw X1, N1 |
| pxor X0, N0 |
| pxor X1, N1 |
| psrlw X0, AL |
| psrlw X1, AL |
| movdqa XMMWORD [VALUES + (0) * 2], X0 |
| movdqa XMMWORD [VALUES + (8) * 2], X1 |
| pcmpeqw X0, ONE |
| pcmpeqw X1, ONE |
| packsswb N0, N1 |
| packsswb X0, X1 |
| pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); |
| mov T1, ZEROBITS |
| not T0 |
| mov word [T1 + 2 * SIZEOF_INT + KK], T0w |
| pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); |
| bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); |
| jz .CONTINUER15 ; if (idx) { |
| lea T1, [T1+KK*8] |
| mov EOB, T1 ; EOB = k + idx; |
| .CONTINUER15: |
| add VALUES, 16*2 |
| jmp .PADDINGR |
| .TRYR8: |
| LOAD8 |
| |
| pcmpgtw N0, X0 |
| paddw X0, N0 |
| pxor X0, N0 |
| psrlw X0, AL |
| movdqa XMMWORD [VALUES + (0) * 2], X0 |
| pcmpeqw X0, ONE |
| packsswb N0, ZERO |
| packsswb X0, ZERO |
| pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); |
| mov T1, ZEROBITS |
| not T0 |
| mov word [T1 + 2 * SIZEOF_INT + KK], T0w |
| pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); |
| bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); |
| jz .CONTINUER8 ; if (idx) { |
| lea T1, [T1+KK*8] |
| mov EOB, T1 ; EOB = k + idx; |
| .CONTINUER8: |
| add VALUES, 8*2 |
| jmp .PADDINGR |
| .TRYR7: |
| and LENEND, 7 |
| LOAD7 |
| |
| pcmpgtw N0, X0 |
| paddw X0, N0 |
| pxor X0, N0 |
| psrlw X0, AL |
| movdqa XMMWORD [VALUES + (0) * 2], X0 |
| pcmpeqw X0, ONE |
| packsswb N0, ZERO |
| packsswb X0, ZERO |
| pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); |
| mov T1, ZEROBITS |
| not T0 |
| mov word [T1 + 2 * SIZEOF_INT + KK], T0w |
| pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); |
| bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); |
| jz .CONTINUER7 ; if (idx) { |
| lea T1, [T1+KK*8] |
| mov EOB, T1 ; EOB = k + idx; |
| .CONTINUER7: |
| add VALUES, 8*2 |
| .PADDINGR: |
| mov K, LEN |
| add K, 7 |
| and K, -8 |
| shr K, 3 ; K = ceil(Sl / 8) |
| sub K, DCTSIZE2/8 ; negative count of 8-word groups left to zero-fill |
| jz .EPADDINGR |
| align 16 |
| .ZEROLOOPR: |
| movdqa XMMWORD [VALUES + 0], ZERO |
| add VALUES, 8*2 |
| inc K |
| jnz .ZEROLOOPR |
| .EPADDINGR: |
| sub VALUES, DCTSIZE2*2 |
| |
| REDUCE0 |
| |
| mov eax, EOB |
| |
| pop ebp |
| pop edi |
| pop esi |
| ; pop edx ; need not be preserved |
| pop ecx |
| pop ebx |
| mov esp, ebp ; esp <- aligned ebp |
| pop esp ; esp <- original ebp |
| pop ebp |
| ret |
| |
| %undef ZERO |
| %undef ONE |
| %undef X0 |
| %undef X1 |
| %undef N0 |
| %undef N1 |
| %undef AL |
| %undef K |
| %undef KK |
| %undef EOB |
| %undef T0w |
| %undef ZEROBITS |
| %undef LUT |
| %undef T0 |
| %undef T1 |
| %undef BLOCK |
| %undef VALUES |
| %undef LEN |
| %undef LENEND |
| |
| ; For some reason, the OS X linker does not honor the request to align the |
| ; segment unless we do this. |
| align 32 |