| ; |
| ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2) |
| ; |
| ; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander. |
| ; Copyright (C) 2015, Matthieu Darbois. |
| ; Copyright (C) 2018, Matthias Räncker. |
| ; |
| ; Based on the x86 SIMD extension for IJG JPEG library |
| ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| ; |
| ; This file should be assembled with NASM (Netwide Assembler), |
| ; can *not* be assembled with Microsoft's MASM or any compatible |
| ; assembler (including Borland's Turbo Assembler). |
| ; NASM is available from http://nasm.sourceforge.net/ or |
| ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
| ; |
| ; This file contains an SSE2 implementation for Huffman coding of one block. |
| ; The following code is based on jchuff.c; see jchuff.c for more details. |
| |
| %include "jsimdext.inc" |
| |
| struc working_state |
| .next_output_byte: resp 1 ; => next byte to write in buffer (not referenced by the code visible in this file) |
| .free_in_buffer: resp 1 ; # of byte spaces remaining in buffer (not referenced by the code visible in this file) |
| .cur.put_buffer.simd resq 1 ; current bit accumulation buffer (one qword; loaded and stored whole by the encoder) |
| .cur.free_bits resd 1 ; # of free bits remaining in the accumulation buffer |
| .cur.last_dc_val resd 4 ; last DC coef for each component (four dwords) |
| .cinfo: resp 1 ; dump_buffer needs access to this (presumably a C-side helper; not used here) |
| endstruc |
| |
| struc c_derived_tbl |
| .ehufco: resd 256 ; code for each symbol (one dword per symbol, indexed by symbol value) |
| .ehufsi: resb 256 ; length of code for each symbol (one byte per symbol, indexed by symbol value) |
| ; If no code has been allocated for a symbol S, ehufsi[S] contains 0 |
| endstruc |
| |
| ; -------------------------------------------------------------------------- |
| SECTION SEG_CONST |
| |
| alignz 32 |
| GLOBAL_DATA(jconst_huff_encode_one_block) |
| |
| EXTN(jconst_huff_encode_one_block): |
| |
| ; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15, one dword per entry. |
| jpeg_mask_bits: |
| %assign HUFF_TBL_N 0 |
| %rep 16 |
| dd (1 << HUFF_TBL_N) - 1 |
| %assign HUFF_TBL_N HUFF_TBL_N + 1 |
| %endrep |
| |
| alignz 32 |
| |
| ; jpeg_nbits_table is indexed by a signed value k in [-32768, 65535]: |
| ; k >= 0: number of significant bits in k |
| ; k < 0: number of significant bits in ~k |
| ; The label sits in the middle of the data so that negative indices work. |
| |
| ; Negative half, emitted from k = -32768 up to k = -2 (lengths 15 down to 1). |
| %assign HUFF_TBL_N 15 |
| %rep 15 |
| times 1 << (HUFF_TBL_N - 1) db HUFF_TBL_N |
| %assign HUFF_TBL_N HUFF_TBL_N - 1 |
| %endrep |
| db 0 ; k = -1 |
| jpeg_nbits_table: |
| db 0 ; k = 0 |
| ; Non-negative half: 2^(n-1) entries of length n, for n = 1..16. |
| %assign HUFF_TBL_N 1 |
| %rep 16 |
| times 1 << (HUFF_TBL_N - 1) db HUFF_TBL_N |
| %assign HUFF_TBL_N HUFF_TBL_N + 1 |
| %endrep |
| %undef HUFF_TBL_N |
| |
| alignz 32 |
| |
| ; Both tables are addressed relative to nbits_base (= &jpeg_nbits_table) so |
| ; that a single base register serves both lookups. |
| %define NBITS(x) nbits_base + x |
| %define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table) |
| |
| ; -------------------------------------------------------------------------- |
| SECTION SEG_TEXT |
| BITS 64 |
| |
| ; Shorthand used to describe SIMD operations: |
| ; wN: xmmN treated as eight signed 16-bit values |
| ; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7 |
| ; bN: xmmN treated as 16 unsigned 8-bit values |
| ; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15 |
| ; Contents of SIMD registers are shown in memory order. |
| |
| ; Fill the bit buffer to capacity with the leading bits from code, then output |
| ; the bit buffer and put the remaining bits from code into the bit buffer. |
| ; |
| ; Usage: |
| ; code - contains the bits to shift into the bit buffer (LSB-aligned) |
| ; %1 - the label to which to jump when the macro completes |
| ; %2 (optional) - extra instructions to execute after nbits has been set |
| ; xmm1 - must contain 0xFF in every byte on entry (used to detect 0xFF output bytes) |
| ; Upon completion, free_bits will be 64 minus the number of bits from code that |
| ; did not fit, and put_buffer will hold code (its low-order bits are those |
| ; remaining bits). temp, code, nbits, and xmm0 will be clobbered. |
| ; |
| ; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE() |
| ; macro in jchuff.c. |
| |
| %macro EMIT_QWORD 1-2 |
| add nbitsb, free_bitsb ; nbits += free_bits; /* = # of bits that still fit */ |
| neg free_bitsb ; free_bits = -free_bits; /* = # of bits that do not fit */ |
| mov tempd, code ; temp = code; |
| shl put_buffer, nbitsb ; put_buffer <<= nbits; |
| mov nbitsb, free_bitsb ; nbits = free_bits; |
| neg free_bitsb ; free_bits = -free_bits; |
| shr tempd, nbitsb ; temp >>= nbits; /* keep only the leading bits of code */ |
| or tempq, put_buffer ; temp |= put_buffer; |
| movq xmm0, tempq ; xmm0.u64 = { temp, 0 }; |
| bswap tempq ; temp = htonll(temp); /* 64-bit byte swap: first output byte ends up in temp[0] */ |
| mov put_buffer, codeq ; put_buffer = code; |
| pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0); |
| %2 |
| pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i); |
| mov qword [buffer], tempq ; memcpy(buffer, &temp, 8); |
| ; (speculative; will be overwritten if |
| ; code contains any 0xFF bytes) |
| add free_bitsb, 64 ; free_bits += 64; |
| add bufferp, 8 ; buffer += 8; |
| test code, code ; if (code == 0) /* No 0xFF bytes */ |
| jz %1 ; return; |
| ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8 |
| ; bytes in the qword. |
| cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF |
| mov byte [buffer-7], 0 ; buffer[-7] = 0; |
| sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0)); |
| mov byte [buffer], temph ; buffer[0] = temp[1]; |
| cmp temph, 0xFF ; Set CF if temp[1] < 0xFF |
| mov byte [buffer+1], 0 ; buffer[1] = 0; |
| sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); |
| shr tempq, 16 ; temp >>= 16; |
| mov byte [buffer], tempb ; buffer[0] = temp[0]; |
| cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF |
| mov byte [buffer+1], 0 ; buffer[1] = 0; |
| sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); |
| mov byte [buffer], temph ; buffer[0] = temp[1]; |
| cmp temph, 0xFF ; Set CF if temp[1] < 0xFF |
| mov byte [buffer+1], 0 ; buffer[1] = 0; |
| sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); |
| shr tempq, 16 ; temp >>= 16; |
| mov byte [buffer], tempb ; buffer[0] = temp[0]; |
| cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF |
| mov byte [buffer+1], 0 ; buffer[1] = 0; |
| sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); |
| mov byte [buffer], temph ; buffer[0] = temp[1]; |
| cmp temph, 0xFF ; Set CF if temp[1] < 0xFF |
| mov byte [buffer+1], 0 ; buffer[1] = 0; |
| sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); |
| shr tempd, 16 ; temp >>= 16; (only 32 significant bits remain, so a 32-bit shift suffices) |
| mov byte [buffer], tempb ; buffer[0] = temp[0]; |
| cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF |
| mov byte [buffer+1], 0 ; buffer[1] = 0; |
| sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); |
| mov byte [buffer], temph ; buffer[0] = temp[1]; |
| cmp temph, 0xFF ; Set CF if temp[1] < 0xFF |
| mov byte [buffer+1], 0 ; buffer[1] = 0; |
| sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0)); |
| jmp %1 ; return; |
| %endmacro |
| |
| ; |
| ; Encode a single block's worth of coefficients. |
| ; |
| ; GLOBAL(JOCTET *) |
| ; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer, |
| ; JCOEFPTR block, int last_dc_val, |
| ; c_derived_tbl *dctbl, c_derived_tbl *actbl) |
| ; |
| ; NOTES: |
| ; When shuffling data, we try to avoid pinsrw as much as possible, since it is |
| ; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on |
| ; modern CPUs, so chains of pinsrw instructions (even with different outputs) |
| ; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and |
| ; requires 2 µops (with memory operand) on Intel. In either case, only one |
| ; pinsrw instruction can be decoded per cycle (and nothing else if they are |
| ; back-to-back), so out-of-order execution cannot be used to work around long |
| ; pinsrw chains (though for Sandy Bridge and later, this may be less of a |
| ; problem if the code runs from the µop cache.) |
| ; |
| ; We use tzcnt instead of bsf without checking for support. The instruction is |
| ; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to |
| ; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is |
| ; an input dependency (although the behavior is not formally defined, Intel |
| ; CPUs usually leave the destination unmodified if the source is zero.) This |
| ; can prevent out-of-order execution, so we clear the destination before |
| ; invoking tzcnt. |
| ; |
| ; Initial register allocation ("-->" marks a register that is re-aliased later in the function) |
| ; rax - buffer (output pointer; still in rax when the function returns) |
| ; rbx - temp |
| ; rcx - nbits (cl doubles as the variable shift count) |
| ; rdx - block --> free_bits |
| ; rsi - nbits_base (address of jpeg_nbits_table) |
| ; rdi - t (pointer into the reordered coefficient array on the stack) |
| ; rbp - code |
| ; r8 - dctbl --> code_temp |
| ; r9 - actbl |
| ; r10 - state |
| ; r11 - index (bitmap of nonzero AC coefficients) |
| ; r12 - put_buffer |
| |
| %define buffer rax |
| %ifdef WIN64 |
| %define bufferp rax |
| %else |
| %define bufferp raxp |
| %endif |
| %define tempq rbx |
| %define tempd ebx |
| %define tempb bl |
| %define temph bh |
| %define nbitsq rcx |
| %define nbits ecx |
| %define nbitsb cl |
| %define block rdx |
| %define nbits_base rsi |
| %define t rdi |
| %define td edi |
| %define codeq rbp |
| %define code ebp |
| %define dctbl r8 |
| %define actbl r9 |
| %define state r10 |
| %define index r11 |
| %define indexd r11d |
| %define put_buffer r12 |
| %define put_bufferd r12d |
| |
| ; Step 1: Re-arrange input data according to jpeg_natural_order |
| ; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10 |
| ; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05 |
| ; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34 |
| ; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28 |
| ; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36 |
| ; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51 |
| ; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46 |
| ; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63 |
| |
| align 32 |
| GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) |
| |
| EXTN(jsimd_huff_encode_one_block_sse2): |
| |
| %ifdef WIN64 |
| |
| ; rcx = working_state *state |
| ; rdx = JOCTET *buffer |
| ; r8 = JCOEFPTR block |
| ; r9 = int last_dc_val |
| ; [rsp+40] = c_derived_tbl *dctbl (stack argument; offset as seen at function entry) |
| ; [rsp+48] = c_derived_tbl *actbl (stack argument; offset as seen at function entry) |
| |
| ;X: X = code stream |
| mov buffer, rdx |
| mov block, r8 |
| movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 |
| push rbx |
| push rbp |
| movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 |
| push rsi |
| push rdi |
| push r12 |
| movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 |
| mov state, rcx |
| movsx code, word [block] ;Z: code = block[0]; |
| pxor xmm4, xmm4 ;A: w4[i] = 0; |
| sub code, r9d ;Z: code -= last_dc_val; |
| mov dctbl, POINTER [rsp+6*8+4*8] |
| mov actbl, POINTER [rsp+6*8+5*8] |
| punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 |
| lea nbits_base, [rel jpeg_nbits_table] |
| add rsp, -DCTSIZE2 * SIZEOF_WORD |
| mov t, rsp |
| |
| %else |
| |
| ; rdi = working_state *state |
| ; rsi = JOCTET *buffer |
| ; rdx = JCOEFPTR block |
| ; rcx = int last_dc_val |
| ; r8 = c_derived_tbl *dctbl |
| ; r9 = c_derived_tbl *actbl |
| |
| ;X: X = code stream |
| movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 |
| push rbx |
| push rbp |
| movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 |
| push r12 |
| mov state, rdi |
| mov buffer, rsi |
| movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 |
| movsx codeq, word [block] ;Z: code = block[0]; |
| lea nbits_base, [rel jpeg_nbits_table] |
| pxor xmm4, xmm4 ;A: w4[i] = 0; |
| sub codeq, rcx ;Z: code -= last_dc_val; |
| punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 |
| lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_ (DCTSIZE2 words below rsp; rsp itself is not adjusted) |
| |
| %endif |
| |
| pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11 |
| pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11 |
| punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15 |
| punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13 |
| pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17 |
| ;A: (Row 0, offset 1) |
| pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0); |
| paddw xmm0, xmm4 ;A: w0[i] += w4[i]; |
| movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i]; |
| |
| movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- -- |
| pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- -- |
| pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12 |
| movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55 |
| movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12 |
| punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51 |
| pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12 |
| pxor xmm4, xmm4 ;A: w4[i] = 0; |
| psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- -- |
| pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0); |
| pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12 |
| ; (Row 1, offset 1) |
| pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0); |
| paddw xmm1, xmm4 ;B: w1[i] += w4[i]; |
| movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i]; |
| pxor xmm4, xmm4 ;B: w4[i] = 0; |
| pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0); |
| |
| packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i] |
| ; w/ signed saturation |
| |
| pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- -- |
| pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- -- |
| pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 -- |
| pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35 |
| ; (Row 3, offset 1) |
| pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0); |
| paddw xmm3, xmm4 ;D: w3[i] += w4[i]; |
| movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i]; |
| pxor xmm4, xmm4 ;D: w4[i] = 0; |
| pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0); |
| |
| pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51 |
| cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000, |
| ;Z: i.e. if code is positive |
| pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51 |
| pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51 |
| adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0); |
| pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51 |
| pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51 |
| movsxd codeq, code ;Z: sign extend code |
| pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27 |
| ; (Row 2, offset 1) |
| pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0); |
| paddw xmm2, xmm4 ;C: w2[i] += w4[i]; |
| movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i]; |
| pxor xmm4, xmm4 ;C: w4[i] = 0; |
| pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0); |
| |
| packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i] |
| ; w/ signed saturation |
| |
| movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code); |
| movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55 |
| pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i); |
| pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i); |
| movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63 |
| punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63 |
| shl tempd, 16 ;Z: temp <<= 16; |
| psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 -- |
| pxor xmm2, xmm2 ;H: w2[i] = 0; |
| or put_bufferd, tempd ;Z: put_buffer |= temp; |
| pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 -- |
| movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- -- |
| unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59 |
| pxor xmm0, xmm0 ;H: w0[i] = 0; |
| pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 -- |
| ; (Row 7, offset 1) |
| pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0); |
| paddw xmm3, xmm2 ;H: w3[i] += w2[i]; |
| movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i]; |
| movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- -- |
| pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0); |
| punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47 |
| mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4] |
| ;Z: temp = dctbl->ehufco[nbits]; |
| movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47 |
| psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 -- |
| shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59 |
| and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1; |
| pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 -- |
| pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58 |
| shl tempq, nbitsb ;Z: temp <<= nbits; |
| pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 -- |
| pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58 |
| pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 -- |
| or code, tempd ;Z: code |= temp; |
| movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58 |
| pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 -- |
| pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58 |
| pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53 |
| ; (Row 6, offset 1) |
| pxor xmm2, xmm2 ;G: w2[i] = 0; |
| pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0); |
| pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58 |
| paddw xmm4, xmm0 ;G: w4[i] += w0[i]; |
| movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i]; |
| pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58 |
| ; (Row 5, offset 1) |
| pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0); |
| pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59 |
| |
| packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i] |
| ; w/ signed saturation |
| |
| pxor xmm0, xmm0 ;F: w0[i] = 0; |
| pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59 |
| pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0); |
| pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i); |
| pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59 |
| paddw xmm1, xmm2 ;F: w1[i] += w2[i]; |
| movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i]; |
| pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29 |
| ; (Row 4, offset 1) |
| %undef block |
| %define free_bitsq rdx |
| %define free_bitsd edx |
| %define free_bitsb dl |
| pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0); |
| shl tempq, 48 ;Z: temp <<= 48; |
| pxor xmm2, xmm2 ;E: w2[i] = 0; |
| pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0); |
| paddw xmm5, xmm0 ;E: w5[i] += w0[i]; |
| or tempq, put_buffer ;Z: temp |= put_buffer; |
| movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i]; |
| lea t, [dword t - 2] ;Z: t = &t[-1]; (so that t += run + 1 in .BLOOP lands on the next nonzero coefficient) |
| pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0); |
| |
| packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i] |
| ; w/ signed saturation |
| |
| add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq] |
| ;Z: nbits += dctbl->ehufsi[nbits]; |
| %undef dctbl |
| %define code_temp r8d |
| pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i); |
| mov free_bitsd, [state+working_state.cur.free_bits] |
| ;Z: free_bits = state->cur.free_bits; |
| pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF; (all-ones constant that EMIT_QWORD compares against) |
| shl index, 32 ;Z: index <<= 32; |
| mov put_buffer, [state+working_state.cur.put_buffer.simd] |
| ;Z: put_buffer = state->cur.put_buffer.simd; |
| or index, tempq ;Z: index |= temp; |
| not index ;Z: index = ~index; /* bit i is now set iff t[i] is nonzero */ |
| sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0) |
| jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE; |
| align 16 |
| .EMIT_CODE: ;Z: .EMIT_CODE: |
| EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND |
| |
| ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| |
| align 16 |
| .BRLOOP: ; do { /* zero run of 16 or more: emit symbol 0xf0 per 16 zeros */ |
| lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16; |
| movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0] |
| ; nbits = actbl->ehufsi[0xf0]; /* 0xf0 = JPEG ZRL symbol (run of 16 zeros) */ |
| mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4] |
| ; code = actbl->ehufco[0xf0]; |
| sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) |
| jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE; |
| shl put_buffer, nbitsb ; put_buffer <<= nbits; |
| mov nbits, code_temp ; nbits = code_temp; |
| or put_buffer, codeq ; put_buffer |= code; |
| cmp nbits, 16 ; if (nbits <= 16) |
| jle .ERLOOP ; break; |
| jmp .BRLOOP ; } while (1); |
| |
| ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| |
| align 16 |
| times 5 nop |
| .ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE: |
| shl put_buffer, nbitsb ; put_buffer <<= nbits; |
| or put_buffer, codeq ; put_buffer |= code; |
| .BLOOP_COND: ; .BLOOP_COND: |
| test index, index ; if (index != 0) |
| jz .ELOOP ; { |
| .BLOOP: ; do { |
| xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */ |
| tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index |
| inc nbits ; ++nbits; /* nbits = zero run length + 1 */ |
| lea t, [t + nbitsq * 2] ; t = &t[nbits]; |
| shr index, nbitsb ; index >>= nbits; |
| .EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END: |
| cmp nbits, 16 ; if (nbits > 16) |
| jg .BRLOOP ; goto .BRLOOP; |
| .ERLOOP: ; .ERLOOP: |
| movsx codeq, word [t] ; code = *t; |
| lea tempd, [nbitsq * 2] ; temp = nbits * 2; |
| movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code); |
| lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits; /* = (run + 1) * 16 + size */ |
| mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4] |
| ; code_temp = actbl->ehufco[temp-16]; /* symbol = run * 16 + size */ |
| shl code_temp, nbitsb ; code_temp <<= nbits; |
| and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1; |
| add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)] |
| ; nbits += actbl->ehufsi[temp-16]; |
| or code, code_temp ; code |= code_temp; |
| sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) |
| jle .EMIT_CODE ; goto .EMIT_CODE; |
| shl put_buffer, nbitsb ; put_buffer <<= nbits; |
| or put_buffer, codeq ; put_buffer |= code; |
| test index, index |
| jnz .BLOOP ; } while (index != 0); |
| .ELOOP: ; } /* index != 0 */ |
| sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]); |
| %ifdef WIN64 |
| cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62) |
| %else |
| cmp td, -2 * SIZEOF_WORD ; if (t != -2) |
| %endif |
| je .EFN ; { |
| movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0] |
| ; nbits = actbl->ehufsi[0]; /* symbol 0 = end of block; emitted when the block ends in zeros */ |
| mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0]; |
| sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) |
| jg .EFN_SKIP_EMIT_CODE ; { |
| EMIT_QWORD .EFN ; insert code, flush buffer |
| align 16 |
| .EFN_SKIP_EMIT_CODE: ; } else { |
| shl put_buffer, nbitsb ; put_buffer <<= nbits; |
| or put_buffer, codeq ; put_buffer |= code; |
| .EFN: ; } } |
| mov [state + working_state.cur.put_buffer.simd], put_buffer |
| ; state->cur.put_buffer.simd = put_buffer; |
| mov byte [state + working_state.cur.free_bits], free_bitsb |
| ; state->cur.free_bits = free_bits; (only the low byte is written) |
| %ifdef WIN64 |
| sub rsp, -DCTSIZE2 * SIZEOF_WORD |
| pop r12 |
| pop rdi |
| pop rsi |
| pop rbp |
| pop rbx |
| %else |
| pop r12 |
| pop rbp |
| pop rbx |
| %endif |
| ret |
| |
| ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| |
| align 16 |
| .EMIT_BRLOOP_CODE: |
| EMIT_QWORD .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp } |
| ; insert code, flush buffer, |
| ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END |
| |
| ; For some reason, the OS X linker does not honor the request to align the |
| ; segment unless we do this. |
| align 32 |