third_party/flac/src/libFLAC/ia32/lpc_asm.nasm - cobalt - Git at Google

 ;  vim:filetype=nasm ts=8

 ;  libFLAC - Free Lossless Audio Codec library
 ;  Copyright (C) 2001-2009  Josh Coalson
 ;  Copyright (C) 2011-2014  Xiph.Org Foundation
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
 ;  are met:
 ;
 ;  - Redistributions of source code must retain the above copyright
 ;  notice, this list of conditions and the following disclaimer.
 ;
 ;  - Redistributions in binary form must reproduce the above copyright
 ;  notice, this list of conditions and the following disclaimer in the
 ;  documentation and/or other materials provided with the distribution.
 ;
 ;  - Neither the name of the Xiph.org Foundation nor the names of its
 ;  contributors may be used to endorse or promote products derived from
 ;  this software without specific prior written permission.
 ;
 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 ;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 ;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 ;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 %include "nasm.h"

 	data_section

 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
 cglobal FLAC__lpc_restore_signal_asm_ia32
 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
 cglobal FLAC__lpc_restore_signal_wide_asm_ia32

 	code_section

 ; **********************************************************************
 ;
 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
 ; {
 ;	FLAC__real d;
 ;	unsigned sample, coeff;
 ;	const unsigned limit = data_len - lag;
 ;
 ;	FLAC__ASSERT(lag > 0);
 ;	FLAC__ASSERT(lag <= data_len);
 ;
 ;	for(coeff = 0; coeff < lag; coeff++)
 ;		autoc[coeff] = 0.0;
 ;	for(sample = 0; sample <= limit; sample++) {
 ;		d = data[sample];
 ;		for(coeff = 0; coeff < lag; coeff++)
 ;			autoc[coeff] += d * data[sample+coeff];
 ;	}
 ;	for(; sample < data_len; sample++) {
 ;		d = data[sample];
 ;		for(coeff = 0; coeff < data_len - sample; coeff++)
 ;			autoc[coeff] += d * data[sample+coeff];
 ;	}
 ; }
 ;
 	ALIGN 16
 cident FLAC__lpc_compute_autocorrelation_asm_ia32
 	;[esp + 28] == autoc[]
 	;[esp + 24] == lag
 	;[esp + 20] == data_len
 	;[esp + 16] == data[]

 	;ASSERT(lag > 0)
 	;ASSERT(lag <= 33)
 	;ASSERT(lag <= data_len)

 .begin:
 	push	esi
 	push	edi
 	push	ebx

 	;	for(coeff = 0; coeff < lag; coeff++)
 	;		autoc[coeff] = 0.0;
 	mov	edi, [esp + 28]			; edi == autoc
 	mov	ecx, [esp + 24]			; ecx = # of dwords (=lag) of 0 to write
 	xor	eax, eax
 	rep	stosd

 	;	const unsigned limit = data_len - lag;
 	mov	eax, [esp + 24]			; eax == lag
 	mov	ecx, [esp + 20]
 	sub	ecx, eax			; ecx == limit

 	mov	edi, [esp + 28]			; edi == autoc
 	mov	esi, [esp + 16]			; esi == data
 	inc	ecx				; we are looping <= limit so we add one to the counter

 	;	for(sample = 0; sample <= limit; sample++) {
 	;		d = data[sample];
 	;		for(coeff = 0; coeff < lag; coeff++)
 	;			autoc[coeff] += d * data[sample+coeff];
 	;	}
 	fld	dword [esi]			; ST = d <- data[sample]
 	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
 	lea	edx, [eax + eax*2]
 	neg	edx
 	lea	edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
 	call	.mov_eip_to_ebx
 .get_eip1:
 	add	edx, ebx
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	cmp	eax, 33
 	jne	.loop1_start
 	sub	edx, byte 9			; compensate for the longer opcodes on the first iteration
 .loop1_start:
 	jmp	edx

 .mov_eip_to_ebx:
 	mov	ebx, [esp]
 	ret

 	fld	st0				; ST = d d
 	fmul	dword [esi + (32*4)]		; ST = d*data[sample+32] d		WATCHOUT: not a byte displacement here!
 	fadd	dword [edi + (32*4)]		; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
 	fstp	dword [edi + (32*4)]		; autoc[32]+=d*data[sample+32]  ST = d	WATCHOUT: not a byte displacement here!
 	fld	st0				; ST = d d
 	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
 	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
 	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
 	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
 	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
 	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
 	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
 	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
 	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
 	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
 	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
 	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
 	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
 	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
 	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
 	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
 	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
 	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
 	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
 	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
 	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
 	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
 	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
 	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
 	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
 	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
 	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
 	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
 	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
 	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
 	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
 	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
 	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
 	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
 	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
 	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
 	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
 	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
 	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
 	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
 	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
 	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
 	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
 	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
 	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
 	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
 	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
 	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
 	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
 	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
 	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
 	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
 	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+4] d
 	fadd	dword [edi + ( 5*4)]		; ST = autoc[4]+d*data[sample+4] d
 	fstp	dword [edi + ( 5*4)]		; autoc[4]+=d*data[sample+4]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
 	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
 	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
 	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
 	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
 	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
 	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
 	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
 	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
 	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
 	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
 .jumper1_0:

 	fstp	st0				; pop d, ST = empty
 	add	esi, byte 4			; sample++
 	dec	ecx
 	jz	.loop1_end
 	fld	dword [esi]			; ST = d <- data[sample]
 	jmp	edx
 .loop1_end:

 	;	for(; sample < data_len; sample++) {
 	;		d = data[sample];
 	;		for(coeff = 0; coeff < data_len - sample; coeff++)
 	;			autoc[coeff] += d * data[sample+coeff];
 	;	}
 	mov	ecx, [esp + 24]			; ecx <- lag
 	dec	ecx				; ecx <- lag - 1
 	jz	near .end			; skip loop if 0 (i.e. lag == 1)

 	fld	dword [esi]			; ST = d <- data[sample]
 	mov	eax, ecx			; eax <- lag - 1 == data_len - sample the first time through
 	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
 	lea	edx, [eax + eax*2]
 	neg	edx
 	lea	edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
 	call	.mov_eip_to_ebx
 .get_eip2:
 	add	edx, ebx
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	jmp	edx

 	fld	st0				; ST = d d
 	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
 	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
 	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
 	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
 	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
 	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
 	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
 	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
 	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
 	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
 	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
 	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
 	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
 	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
 	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
 	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
 	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
 	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
 	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
 	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
 	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
 	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
 	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
 	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
 	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
 	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
 	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
 	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
 	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
 	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
 	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
 	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
 	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
 	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
 	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
 	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
 	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
 	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
 	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
 	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
 	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
 	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
 	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
 	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
 	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
 	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
 	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
 	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
 	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
 	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
 	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
 	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
 	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+4] d
 	fadd	dword [edi + ( 5*4)]		; ST = autoc[4]+d*data[sample+4] d
 	fstp	dword [edi + ( 5*4)]		; autoc[4]+=d*data[sample+4]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
 	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
 	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
 	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
 	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
 	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
 	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
 	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
 	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
 	fld	st0				; ST = d d
 	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
 	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
 	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
 .jumper2_0:

 	fstp	st0				; pop d, ST = empty
 	add	esi, byte 4			; sample++
 	dec	ecx
 	jz	.loop2_end
 	add	edx, byte 11			; adjust our inner loop counter by adjusting the jump target
 	fld	dword [esi]			; ST = d <- data[sample]
 	jmp	edx
 .loop2_end:

 .end:
 	pop	ebx
 	pop	edi
 	pop	esi
 	ret

 	ALIGN 16
 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
 	;[esp + 16] == autoc[]
 	;[esp + 12] == lag
 	;[esp + 8] == data_len
 	;[esp + 4] == data[]

 	;ASSERT(lag > 0)
 	;ASSERT(lag <= 4)
 	;ASSERT(lag <= data_len)

 	;	for(coeff = 0; coeff < lag; coeff++)
 	;		autoc[coeff] = 0.0;
 	xorps	xmm5, xmm5

 	mov	edx, [esp + 8]			; edx == data_len
 	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]

 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
 	add	eax, 4
 	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
 	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 .warmup:					; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
 	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
 	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
 	dec	edx
 	jz	.loop_end
 	ALIGN 16
 .loop_start:
 	; start by reading the next sample
 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
 	add	eax, 4
 	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
 	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
 	movss	xmm2, xmm0
 	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
 	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
 	dec	edx
 	jnz	.loop_start
 .loop_end:
 	; store autoc
 	mov	edx, [esp + 16]			; edx == autoc
 	movups	[edx], xmm5

 .end:
 	ret

 	ALIGN 16
 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
 	;[esp + 16] == autoc[]
 	;[esp + 12] == lag
 	;[esp + 8] == data_len
 	;[esp + 4] == data[]

 	;ASSERT(lag > 0)
 	;ASSERT(lag <= 8)
 	;ASSERT(lag <= data_len)

 	;	for(coeff = 0; coeff < lag; coeff++)
 	;		autoc[coeff] = 0.0;
 	xorps	xmm5, xmm5
 	xorps	xmm6, xmm6

 	mov	edx, [esp + 8]			; edx == data_len
 	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]

 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
 	add	eax, 4
 	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
 	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 	movaps	xmm1, xmm0			; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
 .warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
 	mulps	xmm0, xmm2
 	mulps	xmm1, xmm3			; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
 	addps	xmm5, xmm0
 	addps	xmm6, xmm1			; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
 	dec	edx
 	jz	.loop_end
 	ALIGN 16
 .loop_start:
 	; start by reading the next sample
 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
 	; here we reorder the instructions; see the (#) indexes for a logical order
 	shufps	xmm2, xmm2, 93h			; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
 	add	eax, 4				; (0)
 	shufps	xmm3, xmm3, 93h			; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
 	shufps	xmm0, xmm0, 0			; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
 	movss	xmm3, xmm2			; (5)
 	movaps	xmm1, xmm0			; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
 	movss	xmm2, xmm0			; (6)
 	mulps	xmm1, xmm3			; (8)
 	mulps	xmm0, xmm2			; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
 	addps	xmm6, xmm1			; (10)
 	addps	xmm5, xmm0			; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
 	dec	edx
 	jnz	.loop_start
 .loop_end:
 	; store autoc
 	mov	edx, [esp + 16]			; edx == autoc
 	movups	[edx], xmm5
 	movups	[edx + 16], xmm6

 .end:
 	ret

 	ALIGN 16
 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
 	;[esp + 16] == autoc[]
 	;[esp + 12] == lag
 	;[esp + 8] == data_len
 	;[esp + 4] == data[]

 	;ASSERT(lag > 0)
 	;ASSERT(lag <= 12)
 	;ASSERT(lag <= data_len)

 	;	for(coeff = 0; coeff < lag; coeff++)
 	;		autoc[coeff] = 0.0;
 	xorps	xmm5, xmm5
 	xorps	xmm6, xmm6
 	xorps	xmm7, xmm7

 	mov	edx, [esp + 8]			; edx == data_len
 	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]

 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
 	add	eax, 4
 	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
 	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
 	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
 .warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
 	movaps	xmm1, xmm0
 	mulps	xmm1, xmm2
 	addps	xmm5, xmm1
 	movaps	xmm1, xmm0
 	mulps	xmm1, xmm3
 	addps	xmm6, xmm1
 	mulps	xmm0, xmm4
 	addps	xmm7, xmm0			; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
 	dec	edx
 	jz	.loop_end
 	ALIGN 16
 .loop_start:
 	; start by reading the next sample
 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
 	add	eax, 4
 	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]

 	; shift xmm4:xmm3:xmm2 left by one float
 	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
 	shufps	xmm3, xmm3, 93h			; 93h=2-1-0-3 => xmm3 gets rotated left by one float
 	shufps	xmm4, xmm4, 93h			; 93h=2-1-0-3 => xmm4 gets rotated left by one float
 	movss	xmm4, xmm3
 	movss	xmm3, xmm2
 	movss	xmm2, xmm0

 	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
 	movaps	xmm1, xmm0
 	mulps	xmm1, xmm2
 	addps	xmm5, xmm1
 	movaps	xmm1, xmm0
 	mulps	xmm1, xmm3
 	addps	xmm6, xmm1
 	mulps	xmm0, xmm4
 	addps	xmm7, xmm0

 	dec	edx
 	jnz	.loop_start
 .loop_end:
 	; store autoc
 	mov	edx, [esp + 16]			; edx == autoc
 	movups	[edx], xmm5
 	movups	[edx + 16], xmm6
 	movups	[edx + 32], xmm7

 .end:
 	ret

 	ALIGN 16
 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
 	;[ebp + 20] == autoc[]
 	;[ebp + 16] == lag
 	;[ebp + 12] == data_len
 	;[ebp +  8] == data[]
 	;[esp] == __m128
 	;[esp + 16] == __m128

 	push	ebp
 	mov	ebp, esp
 	and	esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps'
 	sub	esp, 32

 	;ASSERT(lag > 0)
 	;ASSERT(lag <= 12)
 	;ASSERT(lag <= data_len)
 	;ASSERT(data_len > 0)

 	;	for(coeff = 0; coeff < lag; coeff++)
 	;		autoc[coeff] = 0.0;
 	xorps	xmm5, xmm5
 	xorps	xmm6, xmm6
 	movaps	[esp], xmm5
 	movaps	[esp + 16], xmm6

 	mov	edx, [ebp + 12]			; edx == data_len
 	mov	eax, [ebp +  8]			; eax == &data[sample] <- &data[0]

 	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
 	add	eax, 4
 	movaps	xmm1, xmm0			; xmm1 = 0,0,0,data[0]
 	shufps	xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 	xorps	xmm2, xmm2			; xmm2 = 0,0,0,0
 	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
 	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
 	movaps	xmm7, xmm0
 	mulps	xmm7, xmm1
 	addps	xmm5, xmm7
 	dec	edx
 	jz	.loop_end
 	ALIGN 16
 .loop_start:
 	; start by reading the next sample
 	movss	xmm0, [eax]				; xmm0 = 0,0,0,data[sample]
 	add	eax, 4
 	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]

 	; shift xmm4:xmm3:xmm2:xmm1 left by one float
 	shufps	xmm1, xmm1, 93h
 	shufps	xmm2, xmm2, 93h
 	shufps	xmm3, xmm3, 93h
 	shufps	xmm4, xmm4, 93h
 	movss	xmm4, xmm3
 	movss	xmm3, xmm2
 	movss	xmm2, xmm1
 	movss	xmm1, xmm0

 	; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1
 	movaps	xmm7, xmm0
 	mulps	xmm7, xmm1
 	addps	xmm5, xmm7
 	movaps	xmm7, xmm0
 	mulps	xmm7, xmm2
 	addps	xmm6, xmm7
 	movaps	xmm7, xmm0
 	mulps	xmm7, xmm3
 	mulps	xmm0, xmm4
 	addps	xmm7, [esp]
 	addps	xmm0, [esp + 16]
 	movaps	[esp], xmm7
 	movaps	[esp + 16], xmm0

 	dec	edx
 	jnz	.loop_start
 .loop_end:
 	; store autoc
 	mov	edx, [ebp + 20]				; edx == autoc
 	movups	[edx], xmm5
 	movups	[edx + 16], xmm6
 	movaps	xmm5, [esp]
 	movaps	xmm6, [esp + 16]
 	movups	[edx + 32], xmm5
 	movups	[edx + 48], xmm6
 .end:
 	mov	esp, ebp
 	pop	ebp
 	ret

 ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 ;
 ;	for(i = 0; i < data_len; i++) {
 ;		sum = 0;
 ;		for(j = 0; j < order; j++)
 ;			sum += qlp_coeff[j] * data[i-j-1];
 ;		residual[i] = data[i] - (sum >> lp_quantization);
 ;	}
 ;
 	ALIGN	16
 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
 	;[esp + 40]	residual[]
 	;[esp + 36]	lp_quantization
 	;[esp + 32]	order
 	;[esp + 28]	qlp_coeff[]
 	;[esp + 24]	data_len
 	;[esp + 20]	data[]

 	;ASSERT(order > 0)

 	push	ebp
 	push	ebx
 	push	esi
 	push	edi

 	mov	esi, [esp + 20]			; esi = data[]
 	mov	edi, [esp + 40]			; edi = residual[]
 	mov	eax, [esp + 32]			; eax = order
 	mov	ebx, [esp + 24]			; ebx = data_len

 	test	ebx, ebx
 	jz	near .end			; do nothing if data_len == 0
 .begin:
 	cmp	eax, byte 1
 	jg	short .i_1more

 	mov	ecx, [esp + 28]
 	mov	edx, [ecx]			; edx = qlp_coeff[0]
 	mov	eax, [esi - 4]			; eax = data[-1]
 	mov	ecx, [esp + 36]			; cl = lp_quantization
 	ALIGN	16
 .i_1_loop_i:
 	imul	eax, edx
 	sar	eax, cl
 	neg	eax
 	add	eax, [esi]
 	mov	[edi], eax
 	mov	eax, [esi]
 	add	edi, byte 4
 	add	esi, byte 4
 	dec	ebx
 	jnz	.i_1_loop_i

 	jmp	.end

 .i_1more:
 	cmp	eax, byte 32			; for order <= 32 there is a faster routine
 	jbe	short .i_32

 	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
 	ALIGN 16
 .i_32more_loop_i:
 	xor	ebp, ebp
 	mov	ecx, [esp + 32]
 	mov	edx, ecx
 	shl	edx, 2
 	add	edx, [esp + 28]
 	neg	ecx
 	ALIGN	16
 .i_32more_loop_j:
 	sub	edx, byte 4
 	mov	eax, [edx]
 	imul	eax, [esi + 4 * ecx]
 	add	ebp, eax
 	inc	ecx
 	jnz	short .i_32more_loop_j

 	mov	ecx, [esp + 36]
 	sar	ebp, cl
 	neg	ebp
 	add	ebp, [esi]
 	mov	[edi], ebp
 	add	esi, byte 4
 	add	edi, byte 4

 	dec	ebx
 	jnz	.i_32more_loop_i

 	jmp	.end

 .mov_eip_to_eax:
 	mov	eax, [esp]
 	ret

 .i_32:
 	sub	edi, esi
 	neg	eax
 	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
 	call	.mov_eip_to_eax
 .get_eip0:
 	add	edx, eax
 	inc	edx
 	mov	eax, [esp + 28]			; eax = qlp_coeff[]
 	xor	ebp, ebp
 	jmp	edx

 	mov	ecx, [eax + 124]
 	imul	ecx, [esi - 128]
 	add	ebp, ecx
 	mov	ecx, [eax + 120]
 	imul	ecx, [esi - 124]
 	add	ebp, ecx
 	mov	ecx, [eax + 116]
 	imul	ecx, [esi - 120]
 	add	ebp, ecx
 	mov	ecx, [eax + 112]
 	imul	ecx, [esi - 116]
 	add	ebp, ecx
 	mov	ecx, [eax + 108]
 	imul	ecx, [esi - 112]
 	add	ebp, ecx
 	mov	ecx, [eax + 104]
 	imul	ecx, [esi - 108]
 	add	ebp, ecx
 	mov	ecx, [eax + 100]
 	imul	ecx, [esi - 104]
 	add	ebp, ecx
 	mov	ecx, [eax + 96]
 	imul	ecx, [esi - 100]
 	add	ebp, ecx
 	mov	ecx, [eax + 92]
 	imul	ecx, [esi - 96]
 	add	ebp, ecx
 	mov	ecx, [eax + 88]
 	imul	ecx, [esi - 92]
 	add	ebp, ecx
 	mov	ecx, [eax + 84]
 	imul	ecx, [esi - 88]
 	add	ebp, ecx
 	mov	ecx, [eax + 80]
 	imul	ecx, [esi - 84]
 	add	ebp, ecx
 	mov	ecx, [eax + 76]
 	imul	ecx, [esi - 80]
 	add	ebp, ecx
 	mov	ecx, [eax + 72]
 	imul	ecx, [esi - 76]
 	add	ebp, ecx
 	mov	ecx, [eax + 68]
 	imul	ecx, [esi - 72]
 	add	ebp, ecx
 	mov	ecx, [eax + 64]
 	imul	ecx, [esi - 68]
 	add	ebp, ecx
 	mov	ecx, [eax + 60]
 	imul	ecx, [esi - 64]
 	add	ebp, ecx
 	mov	ecx, [eax + 56]
 	imul	ecx, [esi - 60]
 	add	ebp, ecx
 	mov	ecx, [eax + 52]
 	imul	ecx, [esi - 56]
 	add	ebp, ecx
 	mov	ecx, [eax + 48]
 	imul	ecx, [esi - 52]
 	add	ebp, ecx
 	mov	ecx, [eax + 44]
 	imul	ecx, [esi - 48]
 	add	ebp, ecx
 	mov	ecx, [eax + 40]
 	imul	ecx, [esi - 44]
 	add	ebp, ecx
 	mov	ecx, [eax + 36]
 	imul	ecx, [esi - 40]
 	add	ebp, ecx
 	mov	ecx, [eax + 32]
 	imul	ecx, [esi - 36]
 	add	ebp, ecx
 	mov	ecx, [eax + 28]
 	imul	ecx, [esi - 32]
 	add	ebp, ecx
 	mov	ecx, [eax + 24]
 	imul	ecx, [esi - 28]
 	add	ebp, ecx
 	mov	ecx, [eax + 20]
 	imul	ecx, [esi - 24]
 	add	ebp, ecx
 	mov	ecx, [eax + 16]
 	imul	ecx, [esi - 20]
 	add	ebp, ecx
 	mov	ecx, [eax + 12]
 	imul	ecx, [esi - 16]
 	add	ebp, ecx
 	mov	ecx, [eax + 8]
 	imul	ecx, [esi - 12]
 	add	ebp, ecx
 	mov	ecx, [eax + 4]
 	imul	ecx, [esi - 8]
 	add	ebp, ecx
 	mov	ecx, [eax]			; there is one byte missing
 	imul	ecx, [esi - 4]
 	add	ebp, ecx
 .jumper_0:

 	mov	ecx, [esp + 36]
 	sar	ebp, cl
 	neg	ebp
 	add	ebp, [esi]
 	mov	[edi + esi], ebp
 	add	esi, byte 4

 	dec	ebx
 	jz	short .end
 	xor	ebp, ebp
 	jmp	edx

 .end:
 	pop	edi
 	pop	esi
 	pop	ebx
 	pop	ebp
 	ret

 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
 ; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
 ; cannot be used for side-channel coded 16bps channels since the effective bps
 ; is 17.
 	ALIGN	16
 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
 	;[esp + 40]	residual[]
 	;[esp + 36]	lp_quantization
 	;[esp + 32]	order
 	;[esp + 28]	qlp_coeff[]
 	;[esp + 24]	data_len
 	;[esp + 20]	data[]

 	;ASSERT(order > 0)

 	push	ebp
 	push	ebx
 	push	esi
 	push	edi

 	mov	esi, [esp + 20]			; esi = data[]
 	mov	edi, [esp + 40]			; edi = residual[]
 	mov	eax, [esp + 32]			; eax = order
 	mov	ebx, [esp + 24]			; ebx = data_len

 	test	ebx, ebx
 	jz	near .end			; do nothing if data_len == 0
 	dec	ebx
 	test	ebx, ebx
 	jz	near .last_one

 	mov	edx, [esp + 28]			; edx = qlp_coeff[]
 	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
 	mov	ebp, esp

 	and	esp, 0xfffffff8

 	xor	ecx, ecx
 .copy_qlp_loop:
 	push	word [edx + 4 * ecx]
 	inc	ecx
 	cmp	ecx, eax
 	jnz	short .copy_qlp_loop

 	and	ecx, 0x3
 	test	ecx, ecx
 	je	short .za_end
 	sub	ecx, byte 4
 .za_loop:
 	push	word 0
 	inc	eax
 	inc	ecx
 	jnz	short .za_loop
 .za_end:

 	movq	mm5, [esp + 2 * eax - 8]
 	movd	mm4, [esi - 16]
 	punpckldq	mm4, [esi - 12]
 	movd	mm0, [esi - 8]
 	punpckldq	mm0, [esi - 4]
 	packssdw	mm4, mm0

 	cmp	eax, byte 4
 	jnbe	short .mmx_4more

 	ALIGN	16
 .mmx_4_loop_i:
 	movd	mm1, [esi]
 	movq	mm3, mm4
 	punpckldq	mm1, [esi + 4]
 	psrlq	mm4, 16
 	movq	mm0, mm1
 	psllq	mm0, 48
 	por	mm4, mm0
 	movq	mm2, mm4
 	psrlq	mm4, 16
 	pxor	mm0, mm0
 	punpckhdq	mm0, mm1
 	pmaddwd	mm3, mm5
 	pmaddwd	mm2, mm5
 	psllq	mm0, 16
 	por	mm4, mm0
 	movq	mm0, mm3
 	punpckldq	mm3, mm2
 	punpckhdq	mm0, mm2
 	paddd	mm3, mm0
 	psrad	mm3, mm6
 	psubd	mm1, mm3
 	movd	[edi], mm1
 	punpckhdq	mm1, mm1
 	movd	[edi + 4], mm1

 	add	edi, byte 8
 	add	esi, byte 8

 	sub	ebx, 2
 	jg	.mmx_4_loop_i
 	jmp	.mmx_end

 .mmx_4more:
 	shl	eax, 2
 	neg	eax
 	add	eax, byte 16

 	ALIGN	16
 .mmx_4more_loop_i:
 	movd	mm1, [esi]
 	punpckldq	mm1, [esi + 4]
 	movq	mm3, mm4
 	psrlq	mm4, 16
 	movq	mm0, mm1
 	psllq	mm0, 48
 	por	mm4, mm0
 	movq	mm2, mm4
 	psrlq	mm4, 16
 	pxor	mm0, mm0
 	punpckhdq	mm0, mm1
 	pmaddwd	mm3, mm5
 	pmaddwd	mm2, mm5
 	psllq	mm0, 16
 	por	mm4, mm0

 	mov	ecx, esi
 	add	ecx, eax
 	mov	edx, esp

 	ALIGN	16
 .mmx_4more_loop_j:
 	movd	mm0, [ecx - 16]
 	movd	mm7, [ecx - 8]
 	punpckldq	mm0, [ecx - 12]
 	punpckldq	mm7, [ecx - 4]
 	packssdw	mm0, mm7
 	pmaddwd	mm0, [edx]
 	punpckhdq	mm7, mm7
 	paddd	mm3, mm0
 	movd	mm0, [ecx - 12]
 	punpckldq	mm0, [ecx - 8]
 	punpckldq	mm7, [ecx]
 	packssdw	mm0, mm7
 	pmaddwd	mm0, [edx]
 	paddd	mm2, mm0

 	add	edx, byte 8
 	add	ecx, byte 16
 	cmp	ecx, esi
 	jnz	.mmx_4more_loop_j

 	movq	mm0, mm3
 	punpckldq	mm3, mm2
 	punpckhdq	mm0, mm2
 	paddd	mm3, mm0
 	psrad	mm3, mm6
 	psubd	mm1, mm3
 	movd	[edi], mm1
 	punpckhdq	mm1, mm1
 	movd	[edi + 4], mm1

 	add	edi, byte 8
 	add	esi, byte 8

 	sub	ebx, 2
 	jg	near .mmx_4more_loop_i

 .mmx_end:
 	emms
 	mov	esp, ebp
 .last_one:
 	mov	eax, [esp + 32]
 	inc	ebx
 	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin

 .end:
 	pop	edi
 	pop	esi
 	pop	ebx
 	pop	ebp
 	ret

 ; **********************************************************************
 ;
 ; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
 ; {
 ; 	unsigned i, j;
 ; 	FLAC__int32 sum;
 ;
 ; 	FLAC__ASSERT(order > 0);
 ;
 ; 	for(i = 0; i < data_len; i++) {
 ; 		sum = 0;
 ; 		for(j = 0; j < order; j++)
 ; 			sum += qlp_coeff[j] * data[i-j-1];
 ; 		data[i] = residual[i] + (sum >> lp_quantization);
 ; 	}
 ; }
 	ALIGN	16
 cident FLAC__lpc_restore_signal_asm_ia32
 	;[esp + 40]	data[]
 	;[esp + 36]	lp_quantization
 	;[esp + 32]	order
 	;[esp + 28]	qlp_coeff[]
 	;[esp + 24]	data_len
 	;[esp + 20]	residual[]

 	;ASSERT(order > 0)

 	push	ebp
 	push	ebx
 	push	esi
 	push	edi

 	mov	esi, [esp + 20]			; esi = residual[]
 	mov	edi, [esp + 40]			; edi = data[]
 	mov	eax, [esp + 32]			; eax = order
 	mov	ebx, [esp + 24]			; ebx = data_len

 	test	ebx, ebx
 	jz	near .end			; do nothing if data_len == 0

 .begin:
 	cmp	eax, byte 1
 	jg	short .x87_1more

 	mov	ecx, [esp + 28]
 	mov	edx, [ecx]
 	mov	eax, [edi - 4]
 	mov	ecx, [esp + 36]
 	ALIGN	16
 .x87_1_loop_i:
 	imul	eax, edx
 	sar	eax, cl
 	add	eax, [esi]
 	mov	[edi], eax
 	add	esi, byte 4
 	add	edi, byte 4
 	dec	ebx
 	jnz	.x87_1_loop_i

 	jmp	.end

 .x87_1more:
 	cmp	eax, byte 32			; for order <= 32 there is a faster routine
 	jbe	short .x87_32

 	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
 	ALIGN 16
 .x87_32more_loop_i:
 	xor	ebp, ebp
 	mov	ecx, [esp + 32]
 	mov	edx, ecx
 	shl	edx, 2
 	add	edx, [esp + 28]
 	neg	ecx
 	ALIGN	16
 .x87_32more_loop_j:
 	sub	edx, byte 4
 	mov	eax, [edx]
 	imul	eax, [edi + 4 * ecx]
 	add	ebp, eax
 	inc	ecx
 	jnz	short .x87_32more_loop_j

 	mov	ecx, [esp + 36]
 	sar	ebp, cl
 	add	ebp, [esi]
 	mov	[edi], ebp
 	add	edi, byte 4
 	add	esi, byte 4

 	dec	ebx
 	jnz	.x87_32more_loop_i

 	jmp	.end

 .mov_eip_to_eax:
 	mov	eax, [esp]
 	ret

 .x87_32:
 	sub	esi, edi
 	neg	eax
 	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
 	call	.mov_eip_to_eax
 .get_eip0:
 	add	edx, eax
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	mov	eax, [esp + 28]			; eax = qlp_coeff[]
 	xor	ebp, ebp
 	jmp	edx

 	mov	ecx, [eax + 124]		; ecx =  qlp_coeff[31]
 	imul	ecx, [edi - 128]		; ecx =  qlp_coeff[31] * data[i-32]
 	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
 	mov	ecx, [eax + 120]		; ecx =  qlp_coeff[30]
 	imul	ecx, [edi - 124]		; ecx =  qlp_coeff[30] * data[i-31]
 	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
 	mov	ecx, [eax + 116]		; ecx =  qlp_coeff[29]
 	imul	ecx, [edi - 120]		; ecx =  qlp_coeff[29] * data[i-30]
 	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
 	mov	ecx, [eax + 112]		; ecx =  qlp_coeff[28]
 	imul	ecx, [edi - 116]		; ecx =  qlp_coeff[28] * data[i-29]
 	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
 	mov	ecx, [eax + 108]		; ecx =  qlp_coeff[27]
 	imul	ecx, [edi - 112]		; ecx =  qlp_coeff[27] * data[i-28]
 	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
 	mov	ecx, [eax + 104]		; ecx =  qlp_coeff[26]
 	imul	ecx, [edi - 108]		; ecx =  qlp_coeff[26] * data[i-27]
 	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
 	mov	ecx, [eax + 100]		; ecx =  qlp_coeff[25]
 	imul	ecx, [edi - 104]		; ecx =  qlp_coeff[25] * data[i-26]
 	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
 	mov	ecx, [eax + 96]			; ecx =  qlp_coeff[24]
 	imul	ecx, [edi - 100]		; ecx =  qlp_coeff[24] * data[i-25]
 	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
 	mov	ecx, [eax + 92]			; ecx =  qlp_coeff[23]
 	imul	ecx, [edi - 96]			; ecx =  qlp_coeff[23] * data[i-24]
 	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
 	mov	ecx, [eax + 88]			; ecx =  qlp_coeff[22]
 	imul	ecx, [edi - 92]			; ecx =  qlp_coeff[22] * data[i-23]
 	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
 	mov	ecx, [eax + 84]			; ecx =  qlp_coeff[21]
 	imul	ecx, [edi - 88]			; ecx =  qlp_coeff[21] * data[i-22]
 	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
 	mov	ecx, [eax + 80]			; ecx =  qlp_coeff[20]
 	imul	ecx, [edi - 84]			; ecx =  qlp_coeff[20] * data[i-21]
 	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
 	mov	ecx, [eax + 76]			; ecx =  qlp_coeff[19]
 	imul	ecx, [edi - 80]			; ecx =  qlp_coeff[19] * data[i-20]
 	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
 	mov	ecx, [eax + 72]			; ecx =  qlp_coeff[18]
 	imul	ecx, [edi - 76]			; ecx =  qlp_coeff[18] * data[i-19]
 	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
 	mov	ecx, [eax + 68]			; ecx =  qlp_coeff[17]
 	imul	ecx, [edi - 72]			; ecx =  qlp_coeff[17] * data[i-18]
 	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
 	mov	ecx, [eax + 64]			; ecx =  qlp_coeff[16]
 	imul	ecx, [edi - 68]			; ecx =  qlp_coeff[16] * data[i-17]
 	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
 	mov	ecx, [eax + 60]			; ecx =  qlp_coeff[15]
 	imul	ecx, [edi - 64]			; ecx =  qlp_coeff[15] * data[i-16]
 	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
 	mov	ecx, [eax + 56]			; ecx =  qlp_coeff[14]
 	imul	ecx, [edi - 60]			; ecx =  qlp_coeff[14] * data[i-15]
 	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
 	mov	ecx, [eax + 52]			; ecx =  qlp_coeff[13]
 	imul	ecx, [edi - 56]			; ecx =  qlp_coeff[13] * data[i-14]
 	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
 	mov	ecx, [eax + 48]			; ecx =  qlp_coeff[12]
 	imul	ecx, [edi - 52]			; ecx =  qlp_coeff[12] * data[i-13]
 	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
 	mov	ecx, [eax + 44]			; ecx =  qlp_coeff[11]
 	imul	ecx, [edi - 48]			; ecx =  qlp_coeff[11] * data[i-12]
 	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
 	mov	ecx, [eax + 40]			; ecx =  qlp_coeff[10]
 	imul	ecx, [edi - 44]			; ecx =  qlp_coeff[10] * data[i-11]
 	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
 	mov	ecx, [eax + 36]			; ecx =  qlp_coeff[ 9]
 	imul	ecx, [edi - 40]			; ecx =  qlp_coeff[ 9] * data[i-10]
 	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
 	mov	ecx, [eax + 32]			; ecx =  qlp_coeff[ 8]
 	imul	ecx, [edi - 36]			; ecx =  qlp_coeff[ 8] * data[i- 9]
 	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
 	mov	ecx, [eax + 28]			; ecx =  qlp_coeff[ 7]
 	imul	ecx, [edi - 32]			; ecx =  qlp_coeff[ 7] * data[i- 8]
 	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
 	mov	ecx, [eax + 24]			; ecx =  qlp_coeff[ 6]
 	imul	ecx, [edi - 28]			; ecx =  qlp_coeff[ 6] * data[i- 7]
 	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
 	mov	ecx, [eax + 20]			; ecx =  qlp_coeff[ 5]
 	imul	ecx, [edi - 24]			; ecx =  qlp_coeff[ 5] * data[i- 6]
 	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
 	mov	ecx, [eax + 16]			; ecx =  qlp_coeff[ 4]
 	imul	ecx, [edi - 20]			; ecx =  qlp_coeff[ 4] * data[i- 5]
 	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
 	mov	ecx, [eax + 12]			; ecx =  qlp_coeff[ 3]
 	imul	ecx, [edi - 16]			; ecx =  qlp_coeff[ 3] * data[i- 4]
 	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
 	mov	ecx, [eax + 8]			; ecx =  qlp_coeff[ 2]
 	imul	ecx, [edi - 12]			; ecx =  qlp_coeff[ 2] * data[i- 3]
 	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
 	mov	ecx, [eax + 4]			; ecx =  qlp_coeff[ 1]
 	imul	ecx, [edi - 8]			; ecx =  qlp_coeff[ 1] * data[i- 2]
 	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
 	mov	ecx, [eax]			; ecx =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
 	imul	ecx, [edi - 4]			; ecx =  qlp_coeff[ 0] * data[i- 1]
 	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
 .jumper_0:

 	mov	ecx, [esp + 36]
 	sar	ebp, cl				; ebp = (sum >> lp_quantization)
 	add	ebp, [esi + edi]		; ebp = residual[i] + (sum >> lp_quantization)
 	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_quantization)
 	add	edi, byte 4

 	dec	ebx
 	jz	short .end
 	xor	ebp, ebp
 	jmp	edx

 .end:
 	pop	edi
 	pop	esi
 	pop	ebx
 	pop	ebp
 	ret

 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
 ; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
 ; cannot be used for side-channel coded 16bps channels since the effective bps
 ; is 17.
 ; WATCHOUT: this routine requires that each data array have a buffer of up to
 ; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
 ; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
 	ALIGN	16
 cident FLAC__lpc_restore_signal_asm_ia32_mmx
 	;[esp + 40]	data[]
 	;[esp + 36]	lp_quantization
 	;[esp + 32]	order
 	;[esp + 28]	qlp_coeff[]
 	;[esp + 24]	data_len
 	;[esp + 20]	residual[]

 	;ASSERT(order > 0)

 	push	ebp
 	push	ebx
 	push	esi
 	push	edi

 	mov	esi, [esp + 20]
 	mov	edi, [esp + 40]
 	mov	eax, [esp + 32]
 	mov	ebx, [esp + 24]

 	test	ebx, ebx
 	jz	near .end			; do nothing if data_len == 0
 	cmp	eax, byte 4
 	jb	near FLAC__lpc_restore_signal_asm_ia32.begin

 	mov	edx, [esp + 28]
 	movd	mm6, [esp + 36]
 	mov	ebp, esp

 	and	esp, 0xfffffff8

 	xor	ecx, ecx
 .copy_qlp_loop:
 	push	word [edx + 4 * ecx]
 	inc	ecx
 	cmp	ecx, eax
 	jnz	short .copy_qlp_loop

 	and	ecx, 0x3
 	test	ecx, ecx
 	je	short .za_end
 	sub	ecx, byte 4
 .za_loop:
 	push	word 0
 	inc	eax
 	inc	ecx
 	jnz	short .za_loop
 .za_end:

 	movq	mm5, [esp + 2 * eax - 8]
 	movd	mm4, [edi - 16]
 	punpckldq	mm4, [edi - 12]
 	movd	mm0, [edi - 8]
 	punpckldq	mm0, [edi - 4]
 	packssdw	mm4, mm0

 	cmp	eax, byte 4
 	jnbe	short .mmx_4more

 	ALIGN	16
 .mmx_4_loop_i:
 	movq	mm7, mm4
 	pmaddwd	mm7, mm5
 	movq	mm0, mm7
 	punpckhdq	mm7, mm7
 	paddd	mm7, mm0
 	psrad	mm7, mm6
 	movd	mm1, [esi]
 	paddd	mm7, mm1
 	movd	[edi], mm7
 	psllq	mm7, 48
 	psrlq	mm4, 16
 	por	mm4, mm7

 	add	esi, byte 4
 	add	edi, byte 4

 	dec	ebx
 	jnz	.mmx_4_loop_i
 	jmp	.mmx_end
 .mmx_4more:
 	shl	eax, 2
 	neg	eax
 	add	eax, byte 16
 	ALIGN	16
 .mmx_4more_loop_i:
 	mov	ecx, edi
 	add	ecx, eax
 	mov	edx, esp

 	movq	mm7, mm4
 	pmaddwd	mm7, mm5

 	ALIGN	16
 .mmx_4more_loop_j:
 	movd	mm0, [ecx - 16]
 	punpckldq	mm0, [ecx - 12]
 	movd	mm1, [ecx - 8]
 	punpckldq	mm1, [ecx - 4]
 	packssdw	mm0, mm1
 	pmaddwd	mm0, [edx]
 	paddd	mm7, mm0

 	add	edx, byte 8
 	add	ecx, byte 16
 	cmp	ecx, edi
 	jnz	.mmx_4more_loop_j

 	movq	mm0, mm7
 	punpckhdq	mm7, mm7
 	paddd	mm7, mm0
 	psrad	mm7, mm6
 	movd	mm1, [esi]
 	paddd	mm7, mm1
 	movd	[edi], mm7
 	psllq	mm7, 48
 	psrlq	mm4, 16
 	por	mm4, mm7

 	add	esi, byte 4
 	add	edi, byte 4

 	dec	ebx
 	jnz	short .mmx_4more_loop_i
 .mmx_end:
 	emms
 	mov	esp, ebp

 .end:
 	pop	edi
 	pop	esi
 	pop	ebx
 	pop	ebp
 	ret


 ; **********************************************************************
 ;
 ;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 ; {
 ; 	unsigned i, j;
 ; 	FLAC__int64 sum;
 ;
 ; 	FLAC__ASSERT(order > 0);
 ;
 ;	for(i = 0; i < data_len; i++) {
 ;		sum = 0;
 ;		for(j = 0; j < order; j++)
 ;			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
 ;		residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
 ;	}
 ; }
 	ALIGN	16
 cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
 	;[esp + 40]	residual[]
 	;[esp + 36]	lp_quantization
 	;[esp + 32]	order
 	;[esp + 28]	qlp_coeff[]
 	;[esp + 24]	data_len
 	;[esp + 20]	data[]

 	;ASSERT(order > 0)
 	;ASSERT(order <= 32)
 	;ASSERT(lp_quantization <= 31)

 	push	ebp
 	push	ebx
 	push	esi
 	push	edi

 	mov	ebx, [esp + 24]			; ebx = data_len
 	test	ebx, ebx
 	jz	near .end				; do nothing if data_len == 0

 .begin:
 	mov	eax, [esp + 32]			; eax = order
 	cmp	eax, 1
 	jg	short .i_32

 	mov	esi, [esp + 40]			; esi = residual[]
 	mov	edi, [esp + 20]			; edi = data[]
 	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
 	mov	ebp, [ecx]				; ebp = qlp_coeff[0]
 	mov	eax, [edi - 4]			; eax = data[-1]
 	mov	ecx, [esp + 36]			; cl = lp_quantization
 	ALIGN	16
 .i_1_loop_i:
 	imul	ebp					; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
 	shrd	eax, edx, cl		; 0 <= lp_quantization <= 15
 	neg	eax
 	add	eax, [edi]
 	mov	[esi], eax
 	mov	eax, [edi]
 	add	esi, 4
 	add	edi, 4
 	dec	ebx
 	jnz	.i_1_loop_i
 	jmp	.end

 .mov_eip_to_eax:
 	mov	eax, [esp]
 	ret

 .i_32:	; eax = order
 	neg	eax
 	add	eax, eax
 	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
 	call	.mov_eip_to_eax
 .get_eip0:
 	add	ebp, eax
 	inc	ebp				; compensate for the shorter opcode on the last iteration

 	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
 	mov	edi, [esp + 20]			; edi = data[]
 	sub	[esp + 40], edi			; residual[] -= data[]

 	xor	ecx, ecx
 	xor	esi, esi
 	jmp	ebp

 ;eax = --
 ;edx = --
 ;ecx = 0
 ;esi = 0
 ;
 ;ebx = qlp_coeff[]
 ;edi = data[]
 ;ebp = @address

 	mov	eax, [ebx + 124]			; eax =  qlp_coeff[31]
 	imul	dword [edi - 128]		; edx:eax =  qlp_coeff[31] * data[i-32]
 	add	ecx, eax
 	adc	esi, edx					; sum += qlp_coeff[31] * data[i-32]

 	mov	eax, [ebx + 120]			; eax =  qlp_coeff[30]
 	imul	dword [edi - 124]		; edx:eax =  qlp_coeff[30] * data[i-31]
 	add	ecx, eax
 	adc	esi, edx					; sum += qlp_coeff[30] * data[i-31]

 	mov	eax, [ebx + 116]
 	imul	dword [edi - 120]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 112]
 	imul	dword [edi - 116]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 108]
 	imul	dword [edi - 112]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 104]
 	imul	dword [edi - 108]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 100]
 	imul	dword [edi - 104]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 96]
 	imul	dword [edi - 100]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 92]
 	imul	dword [edi - 96]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 88]
 	imul	dword [edi - 92]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 84]
 	imul	dword [edi - 88]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 80]
 	imul	dword [edi - 84]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 76]
 	imul	dword [edi - 80]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 72]
 	imul	dword [edi - 76]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 68]
 	imul	dword [edi - 72]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 64]
 	imul	dword [edi - 68]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 60]
 	imul	dword [edi - 64]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 56]
 	imul	dword [edi - 60]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 52]
 	imul	dword [edi - 56]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 48]
 	imul	dword [edi - 52]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 44]
 	imul	dword [edi - 48]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 40]
 	imul	dword [edi - 44]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 36]
 	imul	dword [edi - 40]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 32]
 	imul	dword [edi - 36]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 28]
 	imul	dword [edi - 32]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 24]
 	imul	dword [edi - 28]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 20]
 	imul	dword [edi - 24]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 16]
 	imul	dword [edi - 20]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 12]
 	imul	dword [edi - 16]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 8]
 	imul	dword [edi - 12]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 4]
 	imul	dword [edi - 8]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx]					; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
 	imul	dword [edi - 4]			; edx:eax =  qlp_coeff[ 0] * data[i- 1]
 	add	ecx, eax
 	adc	esi, edx					; sum += qlp_coeff[ 0] * data[i- 1]

 .jumper_0:
 	mov	edx, ecx
 ;esi:edx = sum
 	mov	ecx, [esp + 36]			; cl = lp_quantization
 	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
 ;eax = --
 ;ecx = --
 ;edx = sum >> lp_q
 ;esi = --
 	neg	edx						; edx = -(sum >> lp_quantization)
 	mov	eax, [esp + 40]			; residual[] - data[]
 	add	edx, [edi]				; edx = data[i] - (sum >> lp_quantization)
 	mov	[edi + eax], edx
 	add	edi, 4

 	dec	dword [esp + 24]
 	jz	short .end
 	xor	ecx, ecx
 	xor	esi, esi
 	jmp	ebp

 .end:
 	pop	edi
 	pop	esi
 	pop	ebx
 	pop	ebp
 	ret

 ; **********************************************************************
 ;
 ; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
 ; {
 ; 	unsigned i, j;
 ; 	FLAC__int64 sum;
 ;
 ; 	FLAC__ASSERT(order > 0);
 ;
 ; 	for(i = 0; i < data_len; i++) {
 ; 		sum = 0;
 ; 		for(j = 0; j < order; j++)
 ; 			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
 ; 		data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
 ; 	}
 ; }
 	ALIGN	16
 cident FLAC__lpc_restore_signal_wide_asm_ia32
 	;[esp + 40]	data[]
 	;[esp + 36]	lp_quantization
 	;[esp + 32]	order
 	;[esp + 28]	qlp_coeff[]
 	;[esp + 24]	data_len
 	;[esp + 20]	residual[]

 	;ASSERT(order > 0)
 	;ASSERT(order <= 32)
 	;ASSERT(lp_quantization <= 31)

 	push	ebp
 	push	ebx
 	push	esi
 	push	edi

 	mov	ebx, [esp + 24]			; ebx = data_len
 	test	ebx, ebx
 	jz	near .end				; do nothing if data_len == 0

 .begin:
 	mov	eax, [esp + 32]			; eax = order
 	cmp	eax, 1
 	jg	short .x87_32

 	mov	esi, [esp + 20]			; esi = residual[]
 	mov	edi, [esp + 40]			; edi = data[]
 	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
 	mov	ebp, [ecx]				; ebp = qlp_coeff[0]
 	mov	eax, [edi - 4]			; eax = data[-1]
 	mov	ecx, [esp + 36]			; cl = lp_quantization
 	ALIGN	16
 .x87_1_loop_i:
 	imul	ebp					; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
 	shrd	eax, edx, cl		; 0 <= lp_quantization <= 15
 ;
 	add	eax, [esi]
 	mov	[edi], eax
 ;
 	add	esi, 4
 	add	edi, 4
 	dec	ebx
 	jnz	.x87_1_loop_i
 	jmp	.end

 .mov_eip_to_eax:
 	mov	eax, [esp]
 	ret

 .x87_32:	; eax = order
 	neg	eax
 	add	eax, eax
 	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
 	call	.mov_eip_to_eax
 .get_eip0:
 	add	ebp, eax
 	inc	ebp				; compensate for the shorter opcode on the last iteration

 	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
 	mov	edi, [esp + 40]			; esi = data[]
 	sub	[esp + 20], edi			; residual[] -= data[]

 	xor	ecx, ecx
 	xor	esi, esi
 	jmp	ebp

 ;eax = --
 ;edx = --
 ;ecx = 0
 ;esi = 0
 ;
 ;ebx = qlp_coeff[]
 ;edi = data[]
 ;ebp = @address

 	mov	eax, [ebx + 124]			; eax =  qlp_coeff[31]
 	imul	dword [edi - 128]		; edx:eax =  qlp_coeff[31] * data[i-32]
 	add	ecx, eax
 	adc	esi, edx					; sum += qlp_coeff[31] * data[i-32]

 	mov	eax, [ebx + 120]			; eax =  qlp_coeff[30]
 	imul	dword [edi - 124]		; edx:eax =  qlp_coeff[30] * data[i-31]
 	add	ecx, eax
 	adc	esi, edx					; sum += qlp_coeff[30] * data[i-31]

 	mov	eax, [ebx + 116]
 	imul	dword [edi - 120]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 112]
 	imul	dword [edi - 116]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 108]
 	imul	dword [edi - 112]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 104]
 	imul	dword [edi - 108]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 100]
 	imul	dword [edi - 104]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 96]
 	imul	dword [edi - 100]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 92]
 	imul	dword [edi - 96]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 88]
 	imul	dword [edi - 92]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 84]
 	imul	dword [edi - 88]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 80]
 	imul	dword [edi - 84]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 76]
 	imul	dword [edi - 80]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 72]
 	imul	dword [edi - 76]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 68]
 	imul	dword [edi - 72]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 64]
 	imul	dword [edi - 68]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 60]
 	imul	dword [edi - 64]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 56]
 	imul	dword [edi - 60]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 52]
 	imul	dword [edi - 56]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 48]
 	imul	dword [edi - 52]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 44]
 	imul	dword [edi - 48]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 40]
 	imul	dword [edi - 44]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 36]
 	imul	dword [edi - 40]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 32]
 	imul	dword [edi - 36]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 28]
 	imul	dword [edi - 32]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 24]
 	imul	dword [edi - 28]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 20]
 	imul	dword [edi - 24]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 16]
 	imul	dword [edi - 20]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 12]
 	imul	dword [edi - 16]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 8]
 	imul	dword [edi - 12]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx + 4]
 	imul	dword [edi - 8]
 	add	ecx, eax
 	adc	esi, edx

 	mov	eax, [ebx]					; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
 	imul	dword [edi - 4]			; edx:eax =  qlp_coeff[ 0] * data[i- 1]
 	add	ecx, eax
 	adc	esi, edx					; sum += qlp_coeff[ 0] * data[i- 1]

 .jumper_0:
 	mov	edx, ecx
 ;esi:edx = sum
 	mov	ecx, [esp + 36]			; cl = lp_quantization
 	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
 ;eax = --
 ;ecx = --
 ;edx = sum >> lp_q
 ;esi = --
 ;
 	mov	eax, [esp + 20]			; residual[] - data[]
 	add	edx, [edi + eax]		; edx = residual[i] + (sum >> lp_quantization)
 	mov	[edi], edx				; data[i] = residual[i] + (sum >> lp_quantization)
 	add	edi, 4

 	dec	dword [esp + 24]
 	jz	short .end
 	xor	ecx, ecx
 	xor	esi, esi
 	jmp	ebp

 .end:
 	pop	edi
 	pop	esi
 	pop	ebx
 	pop	ebp
 	ret

 ; end