; blob: 56dc2060a42c84c71f2297e2cde98666b1b0c728 [file] [log] [blame]
; Every symbol reference in this file is RIP-relative unless stated otherwise.
default rel
; The operand-size keywords written in front of memory operands throughout
; this file are defined as empty macros, so they expand to nothing.
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section .data data align=8
; Constant table. It starts 16-byte aligned and every entry is 16 bytes long.
ALIGN 16
; one .. eight: 128-bit values whose low quadword is N (upper quadword 0).
; The encrypt/decrypt routines add them to counter blocks with vpaddd
; (`one` is also used with vpsubd).
one:
DQ 1,0
two:
DQ 2,0
three:
DQ 3,0
four:
DQ 4,0
five:
DQ 5,0
six:
DQ 6,0
seven:
DQ 7,0
eight:
DQ 8,0
; OR_MASK: only the most significant bit of the highest dword is set. It is
; ORed into the initial counter block by the enc_msg/dec routines.
OR_MASK:
DD 0x00000000,0x00000000,0x00000000,0x80000000
; poly: second operand of the `vpclmulqdq ...,0x10` folding steps in GFMUL and
; in the polyval/decrypt routines (that immediate selects its high quadword,
; 0xc200000000000000).
poly:
DQ 0x1,0xc200000000000000
; mask: vpshufb control used by the key schedules. Each dword selects source
; bytes 13,14,15,12, i.e. the top dword rotated by one byte, replicated into
; all four lanes.
mask:
DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
; con1: starting value of the register used as the vaesenclast "round key"
; operand in the key schedules; the code doubles it with vpslld every round.
con1:
DD 1,1,1,1
; con2: 0x1b in every lane; the AES-128 schedules reload the same register
; from here after eight rounds.
con2:
DD 0x1b,0x1b,0x1b,0x1b
; con3: vpshufb control. Bytes 0-7 have the top bit set (result byte is zero);
; bytes 8-11 and 12-15 both select source bytes 4-7.
con3:
DB -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
; and_mask: clears the lowest dword and keeps the other three (used by
; aes128gcmsiv_kdf to make room for the block counter).
and_mask:
DD 0,0xffffffff,0xffffffff,0xffffffff
section .text code align=64
ALIGN 16
;-----------------------------------------------------------------------
; GFMUL -- helper used by the table-init, Horner and decrypt routines
;          (no `global` directive for it is visible in this part of the file).
; In:    xmm0, xmm1 = 128-bit operands
; Out:   xmm0 = 256-bit carry-less product of the operands, folded back to
;               128 bits by two multiplications with `poly`
; Clobb: xmm2, xmm3, xmm4, xmm5
;        xmm1, all general-purpose registers and the stack are not touched.
;-----------------------------------------------------------------------
GFMUL:
        ; Four 64x64 carry-less partial products; they are independent of
        ; each other, so their relative order does not matter.
        vpclmulqdq  xmm2, xmm0, xmm1, 0x00      ; lo(xmm0) * lo(xmm1)
        vpclmulqdq  xmm3, xmm0, xmm1, 0x10      ; lo(xmm0) * hi(xmm1)
        vpclmulqdq  xmm4, xmm0, xmm1, 0x01      ; hi(xmm0) * lo(xmm1)
        vpclmulqdq  xmm5, xmm0, xmm1, 0x11      ; hi(xmm0) * hi(xmm1)

        ; Combine the cross terms and spread them over both halves so that
        ; xmm5:xmm2 holds the full 256-bit product.
        vpxor       xmm3, xmm3, xmm4
        vpslldq     xmm4, xmm3, 8
        vpsrldq     xmm3, xmm3, 8
        vpxor       xmm2, xmm2, xmm4            ; low 128 bits
        vpxor       xmm5, xmm5, xmm3            ; high 128 bits

        ; First fold of the low half.
        vpshufd     xmm4, xmm2, 78              ; swap the two quadwords
        vpclmulqdq  xmm3, xmm2, XMMWORD[poly], 0x10
        vpxor       xmm2, xmm3, xmm4

        ; Second fold, same pattern.
        vpshufd     xmm4, xmm2, 78
        vpclmulqdq  xmm3, xmm2, XMMWORD[poly], 0x10
        vpxor       xmm2, xmm3, xmm4

        ; Add in the high half to obtain the result.
        vpxor       xmm0, xmm2, xmm5
        DB          0F3h, 0C3h                  ;repret
global aesgcmsiv_htable_init
ALIGN 16
;-----------------------------------------------------------------------
; aesgcmsiv_htable_init
; ABI:   Win64 entry (rcx, rdx), remapped to rdi, rsi on entry; the caller's
;        rdi/rsi are parked in the shadow space and restored before return.
; In:    rsi = 16-byte value H (read with an aligned load)
;        rdi = output table, 8 entries of 16 bytes (aligned stores)
; Out:   table[0] = H; table[k] = GFMUL(table[k-1], H) for k = 1..7
; Clobb: rax, xmm0-xmm5
;-----------------------------------------------------------------------
aesgcmsiv_htable_init:
        mov         QWORD[8+rsp], rdi           ;WIN64 prologue
        mov         QWORD[16+rsp], rsi
        mov         rax, rsp
$L$SEH_begin_aesgcmsiv_htable_init:
        mov         rdi, rcx
        mov         rsi, rdx

        vmovdqa     xmm0, XMMWORD[rsi]          ; running value
        vmovdqa     xmm1, xmm0                  ; fixed multiplier for GFMUL
        vmovdqa     XMMWORD[rdi], xmm0

        ; Seven multiply-and-store steps filling offsets 16, 32, ..., 112.
%assign HTABLE_INIT_OFF 16
%rep 7
        call        GFMUL
        vmovdqa     XMMWORD[HTABLE_INIT_OFF+rdi], xmm0
%assign HTABLE_INIT_OFF HTABLE_INIT_OFF+16
%endrep
%undef HTABLE_INIT_OFF

        mov         rdi, QWORD[8+rsp]           ;WIN64 epilogue
        mov         rsi, QWORD[16+rsp]
        DB          0F3h, 0C3h                  ;repret
$L$SEH_end_aesgcmsiv_htable_init:
global aesgcmsiv_htable6_init
ALIGN 16
;-----------------------------------------------------------------------
; aesgcmsiv_htable6_init
; ABI:   Win64 entry (rcx, rdx), remapped to rdi, rsi on entry; the caller's
;        rdi/rsi are parked in the shadow space and restored before return.
; In:    rsi = 16-byte value H (read with an aligned load)
;        rdi = output table, 6 entries of 16 bytes (aligned stores)
; Out:   table[0] = H; table[k] = GFMUL(table[k-1], H) for k = 1..5
; Clobb: rax, xmm0-xmm5
;-----------------------------------------------------------------------
aesgcmsiv_htable6_init:
        mov         QWORD[8+rsp], rdi           ;WIN64 prologue
        mov         QWORD[16+rsp], rsi
        mov         rax, rsp
$L$SEH_begin_aesgcmsiv_htable6_init:
        mov         rdi, rcx
        mov         rsi, rdx

        vmovdqa     xmm0, XMMWORD[rsi]          ; running value
        vmovdqa     xmm1, xmm0                  ; fixed multiplier for GFMUL
        vmovdqa     XMMWORD[rdi], xmm0

        ; Five multiply-and-store steps filling offsets 16, 32, ..., 80.
%assign HTABLE6_INIT_OFF 16
%rep 5
        call        GFMUL
        vmovdqa     XMMWORD[HTABLE6_INIT_OFF+rdi], xmm0
%assign HTABLE6_INIT_OFF HTABLE6_INIT_OFF+16
%endrep
%undef HTABLE6_INIT_OFF

        mov         rdi, QWORD[8+rsp]           ;WIN64 epilogue
        mov         rsi, QWORD[16+rsp]
        DB          0F3h, 0C3h                  ;repret
$L$SEH_end_aesgcmsiv_htable6_init:
global aesgcmsiv_htable_polyval
ALIGN 16
;-----------------------------------------------------------------------
; aesgcmsiv_htable_polyval
; Folds a byte buffer into a 16-byte accumulator using a table of eight
; 16-byte multipliers (the layout written by aesgcmsiv_htable_init).
; ABI:   Win64 entry (rcx, rdx, r8, r9), remapped on entry to:
;          rdi = multiplier table (128 bytes are read in the main loop)
;          rsi = input buffer (unaligned loads)
;          rdx = input length in bytes
;          rcx = accumulator, read with an aligned load, rewritten on exit
; Returns immediately, leaving the accumulator untouched, when rdx == 0.
; The prefix loop counts down in steps of 16 and stops only at exactly 0, so
; the length is assumed to be a multiple of 16 -- TODO confirm with callers.
; Register roles: xmm3 / xmm4 / xmm5 collect the low / high / cross partial
; products; xmm1 and xmm9 carry the low and high halves of the unreduced sum
; between iterations.
; NOTE(review): vzeroall clears every vector register and xmm6, xmm7, xmm9
; are written afterwards; none of xmm6-xmm15 is saved or restored here even
; though the Microsoft x64 convention treats them as callee-saved.
;-----------------------------------------------------------------------
aesgcmsiv_htable_polyval:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_aesgcmsiv_htable_polyval:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
; Nothing to do for an empty input.
test rdx,rdx
jnz NEAR $L$htable_polyval_start
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$htable_polyval_start:
vzeroall
; r11 = length mod 128: the bytes handled before the 128-byte main loop.
mov r11,rdx
and r11,127
jz NEAR $L$htable_polyval_no_prefix
; ---- prefix: r11/16 blocks ----
vpxor xmm9,xmm9,xmm9
vmovdqa xmm1,XMMWORD[rcx]
; rdx keeps only the whole 128-byte groups.
sub rdx,r11
; r11 becomes the table offset for the current block; it walks downwards so
; that the last prefix block uses table offset 0.
sub r11,16
; The first block is combined with the incoming accumulator.
vmovdqu xmm0,XMMWORD[rsi]
vpxor xmm0,xmm0,xmm1
vpclmulqdq xmm5,xmm0,XMMWORD[r11*1+rdi],0x01
vpclmulqdq xmm3,xmm0,XMMWORD[r11*1+rdi],0x00
vpclmulqdq xmm4,xmm0,XMMWORD[r11*1+rdi],0x11
vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
vpxor xmm5,xmm5,xmm6
lea rsi,[16+rsi]
test r11,r11
jnz NEAR $L$htable_polyval_prefix_loop
jmp NEAR $L$htable_polyval_prefix_complete
ALIGN 64
$L$htable_polyval_prefix_loop:
sub r11,16
vmovdqu xmm0,XMMWORD[rsi]
vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x00
vpxor xmm3,xmm3,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x11
vpxor xmm4,xmm4,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x01
vpxor xmm5,xmm5,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
vpxor xmm5,xmm5,xmm6
; The flags from this test survive the lea and feed the jnz below.
test r11,r11
lea rsi,[16+rsi]
jnz NEAR $L$htable_polyval_prefix_loop
$L$htable_polyval_prefix_complete:
; Split the cross terms over the two halves: xmm9 = high, xmm1 = low.
vpsrldq xmm6,xmm5,8
vpslldq xmm5,xmm5,8
vpxor xmm9,xmm4,xmm6
vpxor xmm1,xmm3,xmm5
jmp NEAR $L$htable_polyval_main_loop
$L$htable_polyval_no_prefix:
; No prefix: low half is zero and the incoming accumulator is used as the
; high half, so folding leaves it unchanged.
vpxor xmm1,xmm1,xmm1
vmovdqa xmm9,XMMWORD[rcx]
ALIGN 64
; ---- main loop: 128 bytes (8 blocks) per iteration ----
; Blocks are taken from the end of the group towards its start; the block at
; offset 112 uses table offset 0 and the block at offset 0 uses table offset
; 112. The folding of the previous sum (xmm1) is interleaved with the
; multiplications.
$L$htable_polyval_main_loop:
sub rdx,0x80
jb NEAR $L$htable_polyval_out
vmovdqu xmm0,XMMWORD[112+rsi]
vpclmulqdq xmm5,xmm0,XMMWORD[rdi],0x01
vpclmulqdq xmm3,xmm0,XMMWORD[rdi],0x00
vpclmulqdq xmm4,xmm0,XMMWORD[rdi],0x11
vpclmulqdq xmm6,xmm0,XMMWORD[rdi],0x10
vpxor xmm5,xmm5,xmm6
vmovdqu xmm0,XMMWORD[96+rsi]
vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x01
vpxor xmm5,xmm5,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x00
vpxor xmm3,xmm3,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x11
vpxor xmm4,xmm4,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x10
vpxor xmm5,xmm5,xmm6
vmovdqu xmm0,XMMWORD[80+rsi]
; First fold of the previous low half (completed by the xor into xmm1 below).
vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10
vpalignr xmm1,xmm1,xmm1,8
vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x01
vpxor xmm5,xmm5,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x00
vpxor xmm3,xmm3,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x11
vpxor xmm4,xmm4,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x10
vpxor xmm5,xmm5,xmm6
vpxor xmm1,xmm1,xmm7
vmovdqu xmm0,XMMWORD[64+rsi]
vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x01
vpxor xmm5,xmm5,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x00
vpxor xmm3,xmm3,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x11
vpxor xmm4,xmm4,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x10
vpxor xmm5,xmm5,xmm6
vmovdqu xmm0,XMMWORD[48+rsi]
; Second fold of the previous low half.
vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10
vpalignr xmm1,xmm1,xmm1,8
vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x01
vpxor xmm5,xmm5,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x00
vpxor xmm3,xmm3,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x11
vpxor xmm4,xmm4,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x10
vpxor xmm5,xmm5,xmm6
vpxor xmm1,xmm1,xmm7
vmovdqu xmm0,XMMWORD[32+rsi]
vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x01
vpxor xmm5,xmm5,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x00
vpxor xmm3,xmm3,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x11
vpxor xmm4,xmm4,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x10
vpxor xmm5,xmm5,xmm6
; Add the previous high half: xmm1 is now the fully folded running value.
vpxor xmm1,xmm1,xmm9
vmovdqu xmm0,XMMWORD[16+rsi]
vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x01
vpxor xmm5,xmm5,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x00
vpxor xmm3,xmm3,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x11
vpxor xmm4,xmm4,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x10
vpxor xmm5,xmm5,xmm6
; The first block of the group absorbs the running value before being
; multiplied by the last table entry.
vmovdqu xmm0,XMMWORD[rsi]
vpxor xmm0,xmm0,xmm1
vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x01
vpxor xmm5,xmm5,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x00
vpxor xmm3,xmm3,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x11
vpxor xmm4,xmm4,xmm6
vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x10
vpxor xmm5,xmm5,xmm6
; New unreduced sum: xmm9 = high half, xmm1 = low half.
vpsrldq xmm6,xmm5,8
vpslldq xmm5,xmm5,8
vpxor xmm9,xmm4,xmm6
vpxor xmm1,xmm3,xmm5
lea rsi,[128+rsi]
jmp NEAR $L$htable_polyval_main_loop
$L$htable_polyval_out:
; Final two folds of the low half, add the high half, store the result.
vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10
vpalignr xmm1,xmm1,xmm1,8
vpxor xmm1,xmm1,xmm6
vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10
vpalignr xmm1,xmm1,xmm1,8
vpxor xmm1,xmm1,xmm6
vpxor xmm1,xmm1,xmm9
vmovdqu XMMWORD[rcx],xmm1
vzeroupper
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_aesgcmsiv_htable_polyval:
global aesgcmsiv_polyval_horner
ALIGN 16
;-----------------------------------------------------------------------
; aesgcmsiv_polyval_horner
; Updates an accumulator one 16-byte block at a time:
;     T = GFMUL(T xor block[i], H)      for i = 0 .. count-1
; ABI:   Win64 entry (rcx, rdx, r8, r9), remapped on entry to:
;          rdi = T, 16 bytes, aligned load/store, updated in place
;          rsi = H, 16 bytes, aligned load
;          rdx = input blocks (used as a memory operand of vpxor)
;          rcx = number of 16-byte blocks
; Returns immediately, leaving T untouched, when rcx == 0.
; Clobb: rax, rcx, r10, xmm0-xmm5, flags
;-----------------------------------------------------------------------
aesgcmsiv_polyval_horner:
        mov         QWORD[8+rsp], rdi           ;WIN64 prologue
        mov         QWORD[16+rsp], rsi
        mov         rax, rsp
$L$SEH_begin_aesgcmsiv_polyval_horner:
        mov         rdi, rcx
        mov         rsi, rdx
        mov         rdx, r8
        mov         rcx, r9

        test        rcx, rcx
        jnz         NEAR $L$polyval_horner_start
        mov         rdi, QWORD[8+rsp]           ;WIN64 epilogue
        mov         rsi, QWORD[16+rsp]
        DB          0F3h, 0C3h                  ;repret

$L$polyval_horner_start:
        vmovdqa     xmm0, XMMWORD[rdi]          ; xmm0 = T
        vmovdqa     xmm1, XMMWORD[rsi]          ; xmm1 = H (GFMUL preserves it)
        shl         rcx, 4                      ; rcx = end offset in bytes
        xor         r10, r10                    ; r10 = current byte offset

$L$polyval_horner_loop:
        vpxor       xmm0, xmm0, XMMWORD[r10*1+rdx]
        call        GFMUL
        add         r10, 16
        cmp         rcx, r10
        jne         NEAR $L$polyval_horner_loop

        vmovdqa     XMMWORD[rdi], xmm0
        mov         rdi, QWORD[8+rsp]           ;WIN64 epilogue
        mov         rsi, QWORD[16+rsp]
        DB          0F3h, 0C3h                  ;repret
$L$SEH_end_aesgcmsiv_polyval_horner:
global aes128gcmsiv_aes_ks
ALIGN 16
;-----------------------------------------------------------------------
; aes128gcmsiv_aes_ks -- AES-128 key expansion
; ABI:   Win64 entry (rcx, rdx), remapped on entry to:
;          rdi = 16-byte key (unaligned load)
;          rsi = output: 11 round keys = 176 bytes, written with aligned
;                stores, so the buffer must be 16-byte aligned
; Clobb: rax, xmm0-xmm4, flags
; The shuffle mask is kept in xmm4 rather than xmm15: xmm6-xmm15 are
; callee-saved under the Microsoft x64 convention and this routine has no
; save area for them, whereas xmm4 is volatile and otherwise unused here.
;-----------------------------------------------------------------------
aes128gcmsiv_aes_ks:
        mov         QWORD[8+rsp], rdi           ;WIN64 prologue
        mov         QWORD[16+rsp], rsi
        mov         rax, rsp
$L$SEH_begin_aes128gcmsiv_aes_ks:
        mov         rdi, rcx
        mov         rsi, rdx

        vmovdqu     xmm1, XMMWORD[rdi]
        vmovdqa     XMMWORD[rsi], xmm1          ; round key 0 is the key itself
        vmovdqa     xmm0, XMMWORD[con1]         ; per-round constant, starts at 1
        vmovdqa     xmm4, XMMWORD[mask]         ; byte-rotate + broadcast control
        mov         rax, 8                      ; rounds 1..8 use con1 << (round-1)

$L$ks128_loop:
        add         rsi, 16
        sub         rax, 1                      ; ZF is consumed by the jne below;
                                                ; the vector ops leave flags alone
        ; xmm2 = vaesenclast of the rotated top dword (broadcast) with the
        ; round constant as key operand
        vpshufb     xmm2, xmm1, xmm4
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1               ; double the constant
        ; xmm1 ^= (xmm1 << 32) ^ (xmm1 << 64) ^ (xmm1 << 96)
        vpslldq     xmm3, xmm1, 4
        vpxor       xmm1, xmm1, xmm3
        vpslldq     xmm3, xmm3, 4
        vpxor       xmm1, xmm1, xmm3
        vpslldq     xmm3, xmm3, 4
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vmovdqa     XMMWORD[rsi], xmm1
        jne         NEAR $L$ks128_loop

        ; Round 9: the constant restarts from con2 (0x1b); rsi still points
        ; at round key 8.
        vmovdqa     xmm0, XMMWORD[con2]
        vpshufb     xmm2, xmm1, xmm4
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1               ; 0x1b -> 0x36 for round 10
        vpslldq     xmm3, xmm1, 4
        vpxor       xmm1, xmm1, xmm3
        vpslldq     xmm3, xmm3, 4
        vpxor       xmm1, xmm1, xmm3
        vpslldq     xmm3, xmm3, 4
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vmovdqa     XMMWORD[16+rsi], xmm1

        ; Round 10 (last): no further doubling of the constant is needed.
        vpshufb     xmm2, xmm1, xmm4
        vaesenclast xmm2, xmm2, xmm0
        vpslldq     xmm3, xmm1, 4
        vpxor       xmm1, xmm1, xmm3
        vpslldq     xmm3, xmm3, 4
        vpxor       xmm1, xmm1, xmm3
        vpslldq     xmm3, xmm3, 4
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vmovdqa     XMMWORD[32+rsi], xmm1

        mov         rdi, QWORD[8+rsp]           ;WIN64 epilogue
        mov         rsi, QWORD[16+rsp]
        DB          0F3h, 0C3h                  ;repret
$L$SEH_end_aes128gcmsiv_aes_ks:
global aes256gcmsiv_aes_ks
ALIGN 16
;-----------------------------------------------------------------------
; aes256gcmsiv_aes_ks -- AES-256 key expansion
; ABI:   Win64 entry (rcx, rdx), remapped on entry to:
;          rdi = 32-byte key (unaligned loads)
;          rsi = output: 15 round keys = 240 bytes, written with aligned
;                stores, so the buffer must be 16-byte aligned
; Clobb: rax, xmm0-xmm5, flags
; Only volatile vector registers are used: the all-zero operand lives in xmm5
; (previously xmm14) and the shuffle mask is taken straight from memory
; (previously cached in xmm15). xmm6-xmm15 are callee-saved under the
; Microsoft x64 convention and this routine has no save area for them.
;-----------------------------------------------------------------------
aes256gcmsiv_aes_ks:
        mov         QWORD[8+rsp], rdi           ;WIN64 prologue
        mov         QWORD[16+rsp], rsi
        mov         rax, rsp
$L$SEH_begin_aes256gcmsiv_aes_ks:
        mov         rdi, rcx
        mov         rsi, rdx

        vmovdqu     xmm1, XMMWORD[rdi]          ; even round keys evolve in xmm1
        vmovdqu     xmm3, XMMWORD[16+rdi]       ; odd round keys evolve in xmm3
        vmovdqa     XMMWORD[rsi], xmm1          ; round key 0
        vmovdqa     XMMWORD[16+rsi], xmm3       ; round key 1
        vmovdqa     xmm0, XMMWORD[con1]         ; per-pair constant, starts at 1
        vpxor       xmm5, xmm5, xmm5            ; zero "round key" for the odd step
        mov         rax, 6                      ; six pairs: round keys 2..13

$L$ks256_loop:
        add         rsi, 32
        sub         rax, 1                      ; ZF is consumed by the jne below;
                                                ; the vector ops leave flags alone
        ; Even key: vaesenclast of the rotated top dword of xmm3 with the
        ; constant as key operand.
        vpshufb     xmm2, xmm3, XMMWORD[mask]
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1               ; double the constant
        ; xmm1 = running xor of its own dwords (prefix xor), then xor xmm2
        vpsllq      xmm4, xmm1, 32
        vpxor       xmm1, xmm1, xmm4
        vpshufb     xmm4, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm4
        vpxor       xmm1, xmm1, xmm2
        vmovdqa     XMMWORD[rsi], xmm1
        ; Odd key: vaesenclast of the broadcast top dword of xmm1 with a zero
        ; key operand (no rotation, no constant).
        vpshufd     xmm2, xmm1, 0xff
        vaesenclast xmm2, xmm2, xmm5
        vpsllq      xmm4, xmm3, 32
        vpxor       xmm3, xmm3, xmm4
        vpshufb     xmm4, xmm3, XMMWORD[con3]
        vpxor       xmm3, xmm3, xmm4
        vpxor       xmm3, xmm3, xmm2
        vmovdqa     XMMWORD[16+rsi], xmm3
        jne         NEAR $L$ks256_loop

        ; Round key 14: one more even step; rsi still points at round key 12.
        vpshufb     xmm2, xmm3, XMMWORD[mask]
        vaesenclast xmm2, xmm2, xmm0
        vpsllq      xmm4, xmm1, 32
        vpxor       xmm1, xmm1, xmm4
        vpshufb     xmm4, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm4
        vpxor       xmm1, xmm1, xmm2
        vmovdqa     XMMWORD[32+rsi], xmm1

        mov         rdi, QWORD[8+rsp]           ;WIN64 epilogue
        mov         rsi, QWORD[16+rsp]
        DB          0F3h, 0C3h                  ;repret
global aes128gcmsiv_aes_ks_enc_x1
ALIGN 16
;-----------------------------------------------------------------------
; aes128gcmsiv_aes_ks_enc_x1 -- expand an AES-128 key and encrypt one block
;                               with it in the same pass
; ABI:   Win64 entry (rcx, rdx, r8, r9), remapped on entry to:
;          rdi = input block            (16 bytes, aligned load)
;          rsi = output block           (16 bytes, aligned store)
;          rdx = key schedule output    (11 round keys = 176 bytes, aligned stores)
;          rcx = 16-byte key            (aligned load)
; Clobb: rax, xmm0-xmm5
; Register roles: xmm1 = current round key, xmm4 = cipher state,
;                 xmm0 = round constant, xmm5 = shuffle mask.
; The mask is kept in xmm5 rather than xmm15: xmm6-xmm15 are callee-saved
; under the Microsoft x64 convention and this routine has no save area for
; them, whereas xmm5 is volatile and otherwise unused here.
;-----------------------------------------------------------------------
aes128gcmsiv_aes_ks_enc_x1:
        mov         QWORD[8+rsp], rdi           ;WIN64 prologue
        mov         QWORD[16+rsp], rsi
        mov         rax, rsp
$L$SEH_begin_aes128gcmsiv_aes_ks_enc_x1:
        mov         rdi, rcx
        mov         rsi, rdx
        mov         rdx, r8
        mov         rcx, r9

        vmovdqa     xmm1, XMMWORD[rcx]
        vmovdqa     xmm4, XMMWORD[rdi]
        vmovdqa     XMMWORD[rdx], xmm1          ; round key 0 is the key itself
        vpxor       xmm4, xmm4, xmm1            ; initial whitening
        vmovdqa     xmm0, XMMWORD[con1]         ; round constant, starts at 1
        vmovdqa     xmm5, XMMWORD[mask]

        ; Each round below derives the next round key in xmm1:
        ;   xmm2 = vaesenclast(rotated top dword broadcast, constant)
        ;   xmm1 = running xor of its own dwords, then xor xmm2
        ; and immediately applies it to the state in xmm4.

        ; round 1
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenc     xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[16+rdx], xmm1

        ; round 2
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenc     xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[32+rdx], xmm1

        ; round 3
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenc     xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[48+rdx], xmm1

        ; round 4
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenc     xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[64+rdx], xmm1

        ; round 5
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenc     xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[80+rdx], xmm1

        ; round 6
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenc     xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[96+rdx], xmm1

        ; round 7
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenc     xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[112+rdx], xmm1

        ; round 8
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenc     xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[128+rdx], xmm1

        ; round 9: the constant restarts from con2 (0x1b)
        vmovdqa     xmm0, XMMWORD[con2]
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpslld      xmm0, xmm0, 1               ; 0x1b -> 0x36 for round 10
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenc     xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[144+rdx], xmm1

        ; round 10 (last): the state gets vaesenclast instead of vaesenc
        vpshufb     xmm2, xmm1, xmm5
        vaesenclast xmm2, xmm2, xmm0
        vpsllq      xmm3, xmm1, 32
        vpxor       xmm1, xmm1, xmm3
        vpshufb     xmm3, xmm1, XMMWORD[con3]
        vpxor       xmm1, xmm1, xmm3
        vpxor       xmm1, xmm1, xmm2
        vaesenclast xmm4, xmm4, xmm1
        vmovdqa     XMMWORD[160+rdx], xmm1

        vmovdqa     XMMWORD[rsi], xmm4          ; encrypted block
        mov         rdi, QWORD[8+rsp]           ;WIN64 epilogue
        mov         rsi, QWORD[16+rsp]
        DB          0F3h, 0C3h                  ;repret
$L$SEH_end_aes128gcmsiv_aes_ks_enc_x1:
global aes128gcmsiv_kdf
ALIGN 16
;-----------------------------------------------------------------------
; aes128gcmsiv_kdf -- encrypt four counter blocks derived from one input
;                     block with an expanded AES-128 key
; ABI:   Win64 entry (rcx, rdx, r8), remapped on entry to:
;          rdi = 16-byte input (aligned load); its dwords 0..2 are moved up
;                to dwords 1..3 and dword 0 becomes the block counter
;          rsi = output, 4 x 16 bytes (aligned stores)
;          rdx = expanded key, 11 round keys (aligned loads)
; Out:   out[i] = AES-128 encryption of the block with counter i, i = 0..3
; Clobb: rax, xmm0-xmm5
; Only volatile vector registers are used: the four blocks live in
; xmm0/xmm3/xmm4/xmm5 (previously xmm9-xmm12) and `and_mask` / `one` are
; taken straight from memory (previously cached in xmm12 / xmm13).
; xmm6-xmm15 are callee-saved under the Microsoft x64 convention and this
; routine has no save area for them.
;-----------------------------------------------------------------------
aes128gcmsiv_kdf:
        mov         QWORD[8+rsp], rdi           ;WIN64 prologue
        mov         QWORD[16+rsp], rsi
        mov         rax, rsp
$L$SEH_begin_aes128gcmsiv_kdf:
        mov         rdi, rcx
        mov         rsi, rdx
        mov         rdx, r8

        vmovdqa     xmm1, XMMWORD[rdx]          ; round key 0
        vmovdqa     xmm0, XMMWORD[rdi]
        vpshufd     xmm0, xmm0, 0x90            ; dwords -> {d0, d0, d1, d2}
        vpand       xmm0, xmm0, XMMWORD[and_mask] ; block 0: counter dword = 0
        vpaddd      xmm3, xmm0, XMMWORD[one]    ; block 1
        vpaddd      xmm4, xmm3, XMMWORD[one]    ; block 2
        vpaddd      xmm5, xmm4, XMMWORD[one]    ; block 3

        ; round 0: whitening
        vpxor       xmm0, xmm0, xmm1
        vpxor       xmm3, xmm3, xmm1
        vpxor       xmm4, xmm4, xmm1
        vpxor       xmm5, xmm5, xmm1

        ; rounds 1..9; the round key alternates between xmm1 and xmm2
        vmovdqa     xmm1, XMMWORD[16+rdx]
        vaesenc     xmm0, xmm0, xmm1
        vaesenc     xmm3, xmm3, xmm1
        vaesenc     xmm4, xmm4, xmm1
        vaesenc     xmm5, xmm5, xmm1

        vmovdqa     xmm2, XMMWORD[32+rdx]
        vaesenc     xmm0, xmm0, xmm2
        vaesenc     xmm3, xmm3, xmm2
        vaesenc     xmm4, xmm4, xmm2
        vaesenc     xmm5, xmm5, xmm2

        vmovdqa     xmm1, XMMWORD[48+rdx]
        vaesenc     xmm0, xmm0, xmm1
        vaesenc     xmm3, xmm3, xmm1
        vaesenc     xmm4, xmm4, xmm1
        vaesenc     xmm5, xmm5, xmm1

        vmovdqa     xmm2, XMMWORD[64+rdx]
        vaesenc     xmm0, xmm0, xmm2
        vaesenc     xmm3, xmm3, xmm2
        vaesenc     xmm4, xmm4, xmm2
        vaesenc     xmm5, xmm5, xmm2

        vmovdqa     xmm1, XMMWORD[80+rdx]
        vaesenc     xmm0, xmm0, xmm1
        vaesenc     xmm3, xmm3, xmm1
        vaesenc     xmm4, xmm4, xmm1
        vaesenc     xmm5, xmm5, xmm1

        vmovdqa     xmm2, XMMWORD[96+rdx]
        vaesenc     xmm0, xmm0, xmm2
        vaesenc     xmm3, xmm3, xmm2
        vaesenc     xmm4, xmm4, xmm2
        vaesenc     xmm5, xmm5, xmm2

        vmovdqa     xmm1, XMMWORD[112+rdx]
        vaesenc     xmm0, xmm0, xmm1
        vaesenc     xmm3, xmm3, xmm1
        vaesenc     xmm4, xmm4, xmm1
        vaesenc     xmm5, xmm5, xmm1

        vmovdqa     xmm2, XMMWORD[128+rdx]
        vaesenc     xmm0, xmm0, xmm2
        vaesenc     xmm3, xmm3, xmm2
        vaesenc     xmm4, xmm4, xmm2
        vaesenc     xmm5, xmm5, xmm2

        vmovdqa     xmm1, XMMWORD[144+rdx]
        vaesenc     xmm0, xmm0, xmm1
        vaesenc     xmm3, xmm3, xmm1
        vaesenc     xmm4, xmm4, xmm1
        vaesenc     xmm5, xmm5, xmm1

        ; round 10 (last)
        vmovdqa     xmm2, XMMWORD[160+rdx]
        vaesenclast xmm0, xmm0, xmm2
        vaesenclast xmm3, xmm3, xmm2
        vaesenclast xmm4, xmm4, xmm2
        vaesenclast xmm5, xmm5, xmm2

        vmovdqa     XMMWORD[rsi], xmm0
        vmovdqa     XMMWORD[16+rsi], xmm3
        vmovdqa     XMMWORD[32+rsi], xmm4
        vmovdqa     XMMWORD[48+rsi], xmm5

        mov         rdi, QWORD[8+rsp]           ;WIN64 epilogue
        mov         rsi, QWORD[16+rsp]
        DB          0F3h, 0C3h                  ;repret
$L$SEH_end_aes128gcmsiv_kdf:
global aes128gcmsiv_enc_msg_x4
ALIGN 16
;-----------------------------------------------------------------------
; aes128gcmsiv_enc_msg_x4
; XORs the input with an AES-128 counter-mode keystream, four blocks at a
; time, then one block at a time for the remaining 0..3 blocks.
; ABI:   Win64 entry (rcx, rdx, r8, r9, [rsp+40]), remapped on entry to:
;          rdi = input            (read as memory operands of vpxor)
;          rsi = output           (unaligned stores)
;          rdx = 16-byte initial counter block (aligned load); OR_MASK is
;                ORed into it before use
;          rcx = expanded key, 11 round keys
;          r8  = length in bytes; only whole 16-byte blocks (r8 >> 4) are
;                processed
; Returns immediately when r8 == 0.
; Register roles: xmm0..xmm3 = counters for the next four blocks,
;                 xmm4 = the constant `four`, xmm5..xmm8 = block state,
;                 xmm12 = current round key, r10 = blocks mod 4,
;                 r8 = number of 4-block groups.
; The counter is advanced with vpaddd on the `one`..`four` constants, i.e. a
; 32-bit add that affects only the lowest dword.
; NOTE(review): xmm6, xmm7, xmm8, xmm12 and xmm15 are written without being
; saved, although the Microsoft x64 convention treats xmm6-xmm15 as
; callee-saved. r12 and r13 are pushed and popped but not otherwise
; referenced in this routine.
;-----------------------------------------------------------------------
aes128gcmsiv_enc_msg_x4:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_aes128gcmsiv_enc_msg_x4:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
; Fifth argument (length) comes from the caller's stack.
mov r8,QWORD[40+rsp]
test r8,r8
jnz NEAR $L$128_enc_msg_x4_start
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$128_enc_msg_x4_start:
push r12
push r13
; r8 = number of blocks; r10 = blocks mod 4 (keep the low two bits).
shr r8,4
mov r10,r8
shl r10,62
shr r10,62
; Build the first four counter blocks.
vmovdqa xmm15,XMMWORD[rdx]
vpor xmm15,xmm15,XMMWORD[OR_MASK]
vmovdqu xmm4,XMMWORD[four]
vmovdqa xmm0,xmm15
vpaddd xmm1,xmm15,XMMWORD[one]
vpaddd xmm2,xmm15,XMMWORD[two]
vpaddd xmm3,xmm15,XMMWORD[three]
; r8 = number of 4-block groups; skip the wide loop if there are none.
shr r8,2
je NEAR $L$128_enc_msg_x4_check_remainder
; Pre-bias the pointers because the loop advances them at its top.
sub rsi,64
sub rdi,64
$L$128_enc_msg_x4_loop1:
add rsi,64
add rdi,64
vmovdqa xmm5,xmm0
vmovdqa xmm6,xmm1
vmovdqa xmm7,xmm2
vmovdqa xmm8,xmm3
; Round 0: whitening with the first round key.
vpxor xmm5,xmm5,XMMWORD[rcx]
vpxor xmm6,xmm6,XMMWORD[rcx]
vpxor xmm7,xmm7,XMMWORD[rcx]
vpxor xmm8,xmm8,XMMWORD[rcx]
; Rounds 1..4, each followed by the +4 update of one counter register.
vmovdqu xmm12,XMMWORD[16+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vpaddd xmm0,xmm0,xmm4
vmovdqu xmm12,XMMWORD[32+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vpaddd xmm1,xmm1,xmm4
vmovdqu xmm12,XMMWORD[48+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vpaddd xmm2,xmm2,xmm4
vmovdqu xmm12,XMMWORD[64+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vpaddd xmm3,xmm3,xmm4
; Rounds 5..9.
vmovdqu xmm12,XMMWORD[80+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[96+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[112+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[128+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[144+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
; Round 10 (last).
vmovdqu xmm12,XMMWORD[160+rcx]
vaesenclast xmm5,xmm5,xmm12
vaesenclast xmm6,xmm6,xmm12
vaesenclast xmm7,xmm7,xmm12
vaesenclast xmm8,xmm8,xmm12
; Keystream xor input.
vpxor xmm5,xmm5,XMMWORD[rdi]
vpxor xmm6,xmm6,XMMWORD[16+rdi]
vpxor xmm7,xmm7,XMMWORD[32+rdi]
vpxor xmm8,xmm8,XMMWORD[48+rdi]
; The flags set here reach the jne below; the stores do not modify them.
sub r8,1
vmovdqu XMMWORD[rsi],xmm5
vmovdqu XMMWORD[16+rsi],xmm6
vmovdqu XMMWORD[32+rsi],xmm7
vmovdqu XMMWORD[48+rsi],xmm8
jne NEAR $L$128_enc_msg_x4_loop1
; Step past the last group that was processed.
add rsi,64
add rdi,64
$L$128_enc_msg_x4_check_remainder:
cmp r10,0
je NEAR $L$128_enc_msg_x4_out
; Remaining 1..3 blocks, one per iteration, continuing from counter xmm0.
$L$128_enc_msg_x4_loop2:
vmovdqa xmm5,xmm0
vpaddd xmm0,xmm0,XMMWORD[one]
vpxor xmm5,xmm5,XMMWORD[rcx]
vaesenc xmm5,xmm5,XMMWORD[16+rcx]
vaesenc xmm5,xmm5,XMMWORD[32+rcx]
vaesenc xmm5,xmm5,XMMWORD[48+rcx]
vaesenc xmm5,xmm5,XMMWORD[64+rcx]
vaesenc xmm5,xmm5,XMMWORD[80+rcx]
vaesenc xmm5,xmm5,XMMWORD[96+rcx]
vaesenc xmm5,xmm5,XMMWORD[112+rcx]
vaesenc xmm5,xmm5,XMMWORD[128+rcx]
vaesenc xmm5,xmm5,XMMWORD[144+rcx]
vaesenclast xmm5,xmm5,XMMWORD[160+rcx]
vpxor xmm5,xmm5,XMMWORD[rdi]
vmovdqu XMMWORD[rsi],xmm5
add rdi,16
add rsi,16
sub r10,1
jne NEAR $L$128_enc_msg_x4_loop2
$L$128_enc_msg_x4_out:
pop r13
pop r12
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_aes128gcmsiv_enc_msg_x4:
global aes128gcmsiv_enc_msg_x8
ALIGN 16
;-----------------------------------------------------------------------
; aes128gcmsiv_enc_msg_x8
; XORs the input with an AES-128 counter-mode keystream, eight blocks at a
; time, then one block at a time for the remaining 0..7 blocks.
; ABI:   Win64 entry (rcx, rdx, r8, r9, [rsp+40]), remapped on entry to:
;          rdi = input            (read as memory operands of vpxor)
;          rsi = output           (unaligned stores)
;          rdx = 16-byte initial counter block (unaligned load); OR_MASK is
;                ORed into it before use
;          rcx = expanded key, 11 round keys
;          r8  = length in bytes; only whole 16-byte blocks (r8 >> 4) are
;                processed
; Returns immediately when r8 == 0.
; Register roles: xmm0, xmm9..xmm14 = counters +0..+6 for the next group;
;                 the +7 counter lives in the stack slot at [rsp];
;                 xmm1..xmm8 = block state, xmm15 = current round key,
;                 r10 = blocks mod 8, r8 = number of 8-block groups.
; Stack: rbp keeps the incoming rsp; 128 bytes are reserved and rsp is
; rounded down to a multiple of 64.
; NOTE(review): every register in xmm6-xmm15 is written without being saved,
; although the Microsoft x64 convention treats them as callee-saved. r12 and
; r13 are pushed and popped but not otherwise referenced in this routine.
;-----------------------------------------------------------------------
aes128gcmsiv_enc_msg_x8:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_aes128gcmsiv_enc_msg_x8:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
; Fifth argument (length) comes from the caller's stack.
mov r8,QWORD[40+rsp]
test r8,r8
jnz NEAR $L$128_enc_msg_x8_start
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$128_enc_msg_x8_start:
push r12
push r13
push rbp
mov rbp,rsp
; Scratch area for the eighth counter.
sub rsp,128
and rsp,-64
; r8 = number of blocks; r10 = blocks mod 8 (keep the low three bits).
shr r8,4
mov r10,r8
shl r10,61
shr r10,61
; Build the first eight counter blocks; +7 goes to the stack slot.
vmovdqu xmm1,XMMWORD[rdx]
vpor xmm1,xmm1,XMMWORD[OR_MASK]
vpaddd xmm0,xmm1,XMMWORD[seven]
vmovdqu XMMWORD[rsp],xmm0
vpaddd xmm9,xmm1,XMMWORD[one]
vpaddd xmm10,xmm1,XMMWORD[two]
vpaddd xmm11,xmm1,XMMWORD[three]
vpaddd xmm12,xmm1,XMMWORD[four]
vpaddd xmm13,xmm1,XMMWORD[five]
vpaddd xmm14,xmm1,XMMWORD[six]
vmovdqa xmm0,xmm1
; r8 = number of 8-block groups; skip the wide loop if there are none.
shr r8,3
je NEAR $L$128_enc_msg_x8_check_remainder
; Pre-bias the pointers because the loop advances them at its top.
sub rsi,128
sub rdi,128
$L$128_enc_msg_x8_loop1:
add rsi,128
add rdi,128
vmovdqa xmm1,xmm0
vmovdqa xmm2,xmm9
vmovdqa xmm3,xmm10
vmovdqa xmm4,xmm11
vmovdqa xmm5,xmm12
vmovdqa xmm6,xmm13
vmovdqa xmm7,xmm14
vmovdqu xmm8,XMMWORD[rsp]
; Round 0: whitening with the first round key.
vpxor xmm1,xmm1,XMMWORD[rcx]
vpxor xmm2,xmm2,XMMWORD[rcx]
vpxor xmm3,xmm3,XMMWORD[rcx]
vpxor xmm4,xmm4,XMMWORD[rcx]
vpxor xmm5,xmm5,XMMWORD[rcx]
vpxor xmm6,xmm6,XMMWORD[rcx]
vpxor xmm7,xmm7,XMMWORD[rcx]
vpxor xmm8,xmm8,XMMWORD[rcx]
; Rounds 1..9; counter updates for the next group are slotted in between.
vmovdqu xmm15,XMMWORD[16+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
; Stack slot (+7 counter) advances by 8 ...
vmovdqu xmm14,XMMWORD[rsp]
vpaddd xmm14,xmm14,XMMWORD[eight]
vmovdqu XMMWORD[rsp],xmm14
vmovdqu xmm15,XMMWORD[32+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
; ... and the next +6 counter is that value minus one.
vpsubd xmm14,xmm14,XMMWORD[one]
vmovdqu xmm15,XMMWORD[48+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm0,xmm0,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[64+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm9,xmm9,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[80+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm10,xmm10,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[96+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm11,xmm11,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[112+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm12,xmm12,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[128+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm13,xmm13,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[144+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
; Round 10 (last).
vmovdqu xmm15,XMMWORD[160+rcx]
vaesenclast xmm1,xmm1,xmm15
vaesenclast xmm2,xmm2,xmm15
vaesenclast xmm3,xmm3,xmm15
vaesenclast xmm4,xmm4,xmm15
vaesenclast xmm5,xmm5,xmm15
vaesenclast xmm6,xmm6,xmm15
vaesenclast xmm7,xmm7,xmm15
vaesenclast xmm8,xmm8,xmm15
; Keystream xor input.
vpxor xmm1,xmm1,XMMWORD[rdi]
vpxor xmm2,xmm2,XMMWORD[16+rdi]
vpxor xmm3,xmm3,XMMWORD[32+rdi]
vpxor xmm4,xmm4,XMMWORD[48+rdi]
vpxor xmm5,xmm5,XMMWORD[64+rdi]
vpxor xmm6,xmm6,XMMWORD[80+rdi]
vpxor xmm7,xmm7,XMMWORD[96+rdi]
vpxor xmm8,xmm8,XMMWORD[112+rdi]
; The flags set here reach the jne below; the stores do not modify them.
dec r8
vmovdqu XMMWORD[rsi],xmm1
vmovdqu XMMWORD[16+rsi],xmm2
vmovdqu XMMWORD[32+rsi],xmm3
vmovdqu XMMWORD[48+rsi],xmm4
vmovdqu XMMWORD[64+rsi],xmm5
vmovdqu XMMWORD[80+rsi],xmm6
vmovdqu XMMWORD[96+rsi],xmm7
vmovdqu XMMWORD[112+rsi],xmm8
jne NEAR $L$128_enc_msg_x8_loop1
; Step past the last group that was processed.
add rsi,128
add rdi,128
$L$128_enc_msg_x8_check_remainder:
cmp r10,0
je NEAR $L$128_enc_msg_x8_out
; Remaining 1..7 blocks, one per iteration, continuing from counter xmm0.
$L$128_enc_msg_x8_loop2:
vmovdqa xmm1,xmm0
vpaddd xmm0,xmm0,XMMWORD[one]
vpxor xmm1,xmm1,XMMWORD[rcx]
vaesenc xmm1,xmm1,XMMWORD[16+rcx]
vaesenc xmm1,xmm1,XMMWORD[32+rcx]
vaesenc xmm1,xmm1,XMMWORD[48+rcx]
vaesenc xmm1,xmm1,XMMWORD[64+rcx]
vaesenc xmm1,xmm1,XMMWORD[80+rcx]
vaesenc xmm1,xmm1,XMMWORD[96+rcx]
vaesenc xmm1,xmm1,XMMWORD[112+rcx]
vaesenc xmm1,xmm1,XMMWORD[128+rcx]
vaesenc xmm1,xmm1,XMMWORD[144+rcx]
vaesenclast xmm1,xmm1,XMMWORD[160+rcx]
vpxor xmm1,xmm1,XMMWORD[rdi]
vmovdqu XMMWORD[rsi],xmm1
add rdi,16
add rsi,16
dec r10
jne NEAR $L$128_enc_msg_x8_loop2
$L$128_enc_msg_x8_out:
; Drop the scratch area and restore the saved registers.
mov rsp,rbp
pop rbp
pop r13
pop r12
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_aes128gcmsiv_enc_msg_x8:
global aes128gcmsiv_dec
ALIGN 16
aes128gcmsiv_dec:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_aes128gcmsiv_dec:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
mov r8,QWORD[40+rsp]
mov r9,QWORD[48+rsp]
test r9,~15
jnz NEAR $L$128_dec_start
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$128_dec_start:
vzeroupper
vmovdqa xmm0,XMMWORD[rdx]
mov rax,rdx
lea rax,[32+rax]
lea rcx,[32+rcx]
vmovdqu xmm15,XMMWORD[r9*1+rdi]
vpor xmm15,xmm15,XMMWORD[OR_MASK]
and r9,~15
cmp r9,96
jb NEAR $L$128_dec_loop2
sub r9,96
vmovdqa xmm7,xmm15
vpaddd xmm8,xmm7,XMMWORD[one]
vpaddd xmm9,xmm7,XMMWORD[two]
vpaddd xmm10,xmm9,XMMWORD[one]
vpaddd xmm11,xmm9,XMMWORD[two]
vpaddd xmm12,xmm11,XMMWORD[one]
vpaddd xmm15,xmm11,XMMWORD[two]
vpxor xmm7,xmm7,XMMWORD[r8]
vpxor xmm8,xmm8,XMMWORD[r8]
vpxor xmm9,xmm9,XMMWORD[r8]
vpxor xmm10,xmm10,XMMWORD[r8]
vpxor xmm11,xmm11,XMMWORD[r8]
vpxor xmm12,xmm12,XMMWORD[r8]
vmovdqu xmm4,XMMWORD[16+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[32+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[48+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[64+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[80+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[96+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[112+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[128+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[144+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[160+r8]
vaesenclast xmm7,xmm7,xmm4
vaesenclast xmm8,xmm8,xmm4
vaesenclast xmm9,xmm9,xmm4
vaesenclast xmm10,xmm10,xmm4
vaesenclast xmm11,xmm11,xmm4
vaesenclast xmm12,xmm12,xmm4
vpxor xmm7,xmm7,XMMWORD[rdi]
vpxor xmm8,xmm8,XMMWORD[16+rdi]
vpxor xmm9,xmm9,XMMWORD[32+rdi]
vpxor xmm10,xmm10,XMMWORD[48+rdi]
vpxor xmm11,xmm11,XMMWORD[64+rdi]
vpxor xmm12,xmm12,XMMWORD[80+rdi]
vmovdqu XMMWORD[rsi],xmm7
vmovdqu XMMWORD[16+rsi],xmm8
vmovdqu XMMWORD[32+rsi],xmm9
vmovdqu XMMWORD[48+rsi],xmm10
vmovdqu XMMWORD[64+rsi],xmm11
vmovdqu XMMWORD[80+rsi],xmm12
add rdi,96
add rsi,96
jmp NEAR $L$128_dec_loop1
ALIGN 64
$L$128_dec_loop1:
cmp r9,96
jb NEAR $L$128_dec_finish_96
sub r9,96
vmovdqa xmm6,xmm12
vmovdqa XMMWORD[(16-32)+rax],xmm11
vmovdqa XMMWORD[(32-32)+rax],xmm10
vmovdqa XMMWORD[(48-32)+rax],xmm9
vmovdqa XMMWORD[(64-32)+rax],xmm8
vmovdqa XMMWORD[(80-32)+rax],xmm7
vmovdqa xmm7,xmm15
vpaddd xmm8,xmm7,XMMWORD[one]
vpaddd xmm9,xmm7,XMMWORD[two]
vpaddd xmm10,xmm9,XMMWORD[one]
vpaddd xmm11,xmm9,XMMWORD[two]
vpaddd xmm12,xmm11,XMMWORD[one]
vpaddd xmm15,xmm11,XMMWORD[two]
vmovdqa xmm4,XMMWORD[r8]
vpxor xmm7,xmm7,xmm4
vpxor xmm8,xmm8,xmm4
vpxor xmm9,xmm9,xmm4
vpxor xmm10,xmm10,xmm4
vpxor xmm11,xmm11,xmm4
vpxor xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[((0-32))+rcx]
vpclmulqdq xmm2,xmm6,xmm4,0x11
vpclmulqdq xmm3,xmm6,xmm4,0x00
vpclmulqdq xmm1,xmm6,xmm4,0x01
vpclmulqdq xmm4,xmm6,xmm4,0x10
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[16+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm6,XMMWORD[((-16))+rax]
vmovdqu xmm13,XMMWORD[((-16))+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[32+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm6,XMMWORD[rax]
vmovdqu xmm13,XMMWORD[rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[48+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm6,XMMWORD[16+rax]
vmovdqu xmm13,XMMWORD[16+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[64+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm6,XMMWORD[32+rax]
vmovdqu xmm13,XMMWORD[32+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[80+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[96+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[112+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqa xmm6,XMMWORD[((80-32))+rax]
vpxor xmm6,xmm6,xmm0
vmovdqu xmm5,XMMWORD[((80-32))+rcx]
vpclmulqdq xmm4,xmm6,xmm5,0x01
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x10
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[128+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vpsrldq xmm4,xmm1,8
vpxor xmm5,xmm2,xmm4
vpslldq xmm4,xmm1,8
vpxor xmm0,xmm3,xmm4
vmovdqa xmm3,XMMWORD[poly]
vmovdqu xmm4,XMMWORD[144+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm6,XMMWORD[160+r8]
vpalignr xmm2,xmm0,xmm0,8
vpclmulqdq xmm0,xmm0,xmm3,0x10
vpxor xmm0,xmm2,xmm0
vpxor xmm4,xmm6,XMMWORD[rdi]
vaesenclast xmm7,xmm7,xmm4
vpxor xmm4,xmm6,XMMWORD[16+rdi]
vaesenclast xmm8,xmm8,xmm4
vpxor xmm4,xmm6,XMMWORD[32+rdi]
vaesenclast xmm9,xmm9,xmm4
vpxor xmm4,xmm6,XMMWORD[48+rdi]
vaesenclast xmm10,xmm10,xmm4
vpxor xmm4,xmm6,XMMWORD[64+rdi]
vaesenclast xmm11,xmm11,xmm4
vpxor xmm4,xmm6,XMMWORD[80+rdi]
vaesenclast xmm12,xmm12,xmm4
vpalignr xmm2,xmm0,xmm0,8
vpclmulqdq xmm0,xmm0,xmm3,0x10
vpxor xmm0,xmm2,xmm0
vmovdqu XMMWORD[rsi],xmm7
vmovdqu XMMWORD[16+rsi],xmm8
vmovdqu XMMWORD[32+rsi],xmm9
vmovdqu XMMWORD[48+rsi],xmm10
vmovdqu XMMWORD[64+rsi],xmm11
vmovdqu XMMWORD[80+rsi],xmm12
vpxor xmm0,xmm0,xmm5
lea rdi,[96+rdi]
lea rsi,[96+rsi]
jmp NEAR $L$128_dec_loop1
$L$128_dec_finish_96:
vmovdqa xmm6,xmm12
vmovdqa XMMWORD[(16-32)+rax],xmm11
vmovdqa XMMWORD[(32-32)+rax],xmm10
vmovdqa XMMWORD[(48-32)+rax],xmm9
vmovdqa XMMWORD[(64-32)+rax],xmm8
vmovdqa XMMWORD[(80-32)+rax],xmm7
vmovdqu xmm4,XMMWORD[((0-32))+rcx]
vpclmulqdq xmm1,xmm6,xmm4,0x10
vpclmulqdq xmm2,xmm6,xmm4,0x11
vpclmulqdq xmm3,xmm6,xmm4,0x00
vpclmulqdq xmm4,xmm6,xmm4,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm6,XMMWORD[((-16))+rax]
vmovdqu xmm13,XMMWORD[((-16))+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm6,XMMWORD[rax]
vmovdqu xmm13,XMMWORD[rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm6,XMMWORD[16+rax]
vmovdqu xmm13,XMMWORD[16+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm6,XMMWORD[32+rax]
vmovdqu xmm13,XMMWORD[32+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm6,XMMWORD[((80-32))+rax]
vpxor xmm6,xmm6,xmm0
vmovdqu xmm5,XMMWORD[((80-32))+rcx]
vpclmulqdq xmm4,xmm6,xmm5,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x01
vpxor xmm1,xmm1,xmm4
vpsrldq xmm4,xmm1,8
vpxor xmm5,xmm2,xmm4
vpslldq xmm4,xmm1,8
vpxor xmm0,xmm3,xmm4
vmovdqa xmm3,XMMWORD[poly]
vpalignr xmm2,xmm0,xmm0,8
vpclmulqdq xmm0,xmm0,xmm3,0x10
vpxor xmm0,xmm2,xmm0
vpalignr xmm2,xmm0,xmm0,8
vpclmulqdq xmm0,xmm0,xmm3,0x10
vpxor xmm0,xmm2,xmm0
vpxor xmm0,xmm0,xmm5
$L$128_dec_loop2:
cmp r9,16
jb NEAR $L$128_dec_out
sub r9,16
vmovdqa xmm2,xmm15
vpaddd xmm15,xmm15,XMMWORD[one]
vpxor xmm2,xmm2,XMMWORD[r8]
vaesenc xmm2,xmm2,XMMWORD[16+r8]
vaesenc xmm2,xmm2,XMMWORD[32+r8]
vaesenc xmm2,xmm2,XMMWORD[48+r8]
vaesenc xmm2,xmm2,XMMWORD[64+r8]
vaesenc xmm2,xmm2,XMMWORD[80+r8]
vaesenc xmm2,xmm2,XMMWORD[96+r8]
vaesenc xmm2,xmm2,XMMWORD[112+r8]
vaesenc xmm2,xmm2,XMMWORD[128+r8]
vaesenc xmm2,xmm2,XMMWORD[144+r8]
vaesenclast xmm2,xmm2,XMMWORD[160+r8]
vpxor xmm2,xmm2,XMMWORD[rdi]
vmovdqu XMMWORD[rsi],xmm2
add rdi,16
add rsi,16
vpxor xmm0,xmm0,xmm2
vmovdqa xmm1,XMMWORD[((-32))+rcx]
call GFMUL
jmp NEAR $L$128_dec_loop2
$L$128_dec_out:
vmovdqu XMMWORD[rdx],xmm0
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_aes128gcmsiv_dec:
global aes128gcmsiv_ecb_enc_block
ALIGN 16
;-----------------------------------------------------------------------
; aes128gcmsiv_ecb_enc_block
; Encrypts one 16-byte block with an already expanded AES-128 key
; schedule (11 round keys read at byte offsets 0..160).
; ABI:   Win64 entry; the arguments are copied into rdi/rsi/rdx first.
; In:    rcx = input block  (loaded with vmovdqa, so must be 16-byte aligned)
;        rdx = output block (stored with vmovdqa, so must be 16-byte aligned)
;        r8  = round keys
; Clobb: rax, rdx, xmm1 (rdi/rsi are restored from their home slots)
;-----------------------------------------------------------------------
aes128gcmsiv_ecb_enc_block:
        mov     QWORD [rsp + 8], rdi            ;WIN64 prologue
        mov     QWORD [rsp + 16], rsi
        mov     rax, rsp
$L$SEH_begin_aes128gcmsiv_ecb_enc_block:
        mov     rdi, rcx                        ; rdi = input block
        mov     rsi, rdx                        ; rsi = output block
        mov     rdx, r8                         ; rdx = round keys

        vmovdqa xmm1, XMMWORD [rdi]
        vpxor   xmm1, xmm1, XMMWORD [rdx]       ; initial AddRoundKey (round key 0)

        ; Nine middle rounds, round keys at byte offsets 16, 32, ..., 144.
%assign ECB128_RK_OFFSET 16
%rep 9
        vaesenc xmm1, xmm1, XMMWORD [rdx + ECB128_RK_OFFSET]
%assign ECB128_RK_OFFSET ECB128_RK_OFFSET + 16
%endrep
%undef ECB128_RK_OFFSET
        vaesenclast xmm1, xmm1, XMMWORD [rdx + 160]     ; final round

        vmovdqa XMMWORD [rsi], xmm1
        mov     rdi, QWORD [rsp + 8]            ;WIN64 epilogue
        mov     rsi, QWORD [rsp + 16]
        DB      0F3h, 0C3h                      ;repret
$L$SEH_end_aes128gcmsiv_ecb_enc_block:
global aes256gcmsiv_aes_ks_enc_x1
ALIGN 16
;-----------------------------------------------------------------------
; aes256gcmsiv_aes_ks_enc_x1
; Expands a 32-byte AES-256 key into 15 round keys and, in the same pass,
; encrypts one 16-byte block with those round keys.
; ABI:   Win64 entry; the arguments are copied into rdi/rsi/rdx/rcx first.
; In:    rcx = input block        (vmovdqa load  -> 16-byte aligned)
;        rdx = output block       (vmovdqa store -> 16-byte aligned)
;        r8  = round-key output, 240 bytes written at offsets 0..224
;        r9  = 32-byte key        (vmovdqa loads -> 16-byte aligned)
; Regs:  xmm1 = latest even round key, xmm3 = latest odd round key,
;        xmm8 = block being encrypted, xmm0 = round constant (doubled after
;        each even step), xmm15 = byte shuffle from `mask`, xmm14 = zero,
;        xmm2/xmm4 = temporaries.
; Clobb: rax, rcx, rdx, xmm0-xmm4, xmm8, xmm14, xmm15
; NOTE(review): xmm8, xmm14 and xmm15 are overwritten without being saved;
; confirm callers tolerate this, since Win64 treats xmm6-xmm15 as nonvolatile.
;-----------------------------------------------------------------------

; Finishes one key-expansion step for the round key held in %1:
; %1 ^= (%1 << 32) ^ (%1 << 64) ^ (%1 << 96), then %1 ^= xmm2.
; Uses xmm4 as scratch; xmm2 must already hold the substituted word.
%macro AES256_KS_ENC_X1_FOLD 1
        vpslldq xmm4, %1, 4
        vpxor   %1, %1, xmm4
        vpslldq xmm4, xmm4, 4
        vpxor   %1, %1, xmm4
        vpslldq xmm4, xmm4, 4
        vpxor   %1, %1, xmm4
        vpxor   %1, %1, xmm2
%endmacro

aes256gcmsiv_aes_ks_enc_x1:
        mov     QWORD [rsp + 8], rdi            ;WIN64 prologue
        mov     QWORD [rsp + 16], rsi
        mov     rax, rsp
$L$SEH_begin_aes256gcmsiv_aes_ks_enc_x1:
        mov     rdi, rcx                        ; rdi = input block
        mov     rsi, rdx                        ; rsi = output block
        mov     rdx, r8                         ; rdx = round-key output
        mov     rcx, r9                         ; rcx = 32-byte key

        vmovdqa xmm0, XMMWORD [con1]            ; initial round constant
        vmovdqa xmm15, XMMWORD [mask]           ; byte shuffle for the even steps
        vmovdqa xmm8, XMMWORD [rdi]
        vmovdqa xmm1, XMMWORD [rcx]             ; round key 0 = first key half
        vmovdqa xmm3, XMMWORD [rcx + 16]        ; round key 1 = second key half
        vpxor   xmm8, xmm8, xmm1
        vaesenc xmm8, xmm8, xmm3
        vmovdqu XMMWORD [rdx], xmm1
        vmovdqu XMMWORD [rdx + 16], xmm3
        vpxor   xmm14, xmm14, xmm14             ; zero round key for the odd steps

        ; Round keys 2..13, produced in even/odd pairs. Each new key is fed
        ; straight into the block encryption and then stored.
%assign AES256_KS_ENC_X1_OFF 32
%rep 6
        ; even round key: substitute shuffled xmm3 and mix in the round constant
        vpshufb xmm2, xmm3, xmm15
        vaesenclast xmm2, xmm2, xmm0
        vpslld  xmm0, xmm0, 1                   ; next round constant
        AES256_KS_ENC_X1_FOLD xmm1
        vaesenc xmm8, xmm8, xmm1
        vmovdqu XMMWORD [rdx + AES256_KS_ENC_X1_OFF], xmm1
        ; odd round key: substitute the broadcast top dword of xmm1, no constant
        vpshufd xmm2, xmm1, 0xff
        vaesenclast xmm2, xmm2, xmm14
        AES256_KS_ENC_X1_FOLD xmm3
        vaesenc xmm8, xmm8, xmm3
        vmovdqu XMMWORD [rdx + (AES256_KS_ENC_X1_OFF + 16)], xmm3
%assign AES256_KS_ENC_X1_OFF AES256_KS_ENC_X1_OFF + 32
%endrep
%undef AES256_KS_ENC_X1_OFF

        ; Round key 14: last even step (the constant is not doubled again),
        ; used for the final AES round.
        vpshufb xmm2, xmm3, xmm15
        vaesenclast xmm2, xmm2, xmm0
        AES256_KS_ENC_X1_FOLD xmm1
        vaesenclast xmm8, xmm8, xmm1
        vmovdqu XMMWORD [rdx + 224], xmm1

        vmovdqa XMMWORD [rsi], xmm8
        mov     rdi, QWORD [rsp + 8]            ;WIN64 epilogue
        mov     rsi, QWORD [rsp + 16]
        DB      0F3h, 0C3h                      ;repret
$L$SEH_end_aes256gcmsiv_aes_ks_enc_x1:
global aes256gcmsiv_ecb_enc_block
ALIGN 16
;-----------------------------------------------------------------------
; aes256gcmsiv_ecb_enc_block
; Encrypts one 16-byte block with an already expanded AES-256 key
; schedule (15 round keys read at byte offsets 0..224).
; ABI:   Win64 entry; the arguments are copied into rdi/rsi/rdx first.
; In:    rcx = input block  (loaded with vmovdqa, so must be 16-byte aligned)
;        rdx = output block (stored with vmovdqa, so must be 16-byte aligned)
;        r8  = round keys
; Clobb: rax, rdx, xmm1 (rdi/rsi are restored from their home slots)
;-----------------------------------------------------------------------
aes256gcmsiv_ecb_enc_block:
        mov     QWORD [rsp + 8], rdi            ;WIN64 prologue
        mov     QWORD [rsp + 16], rsi
        mov     rax, rsp
$L$SEH_begin_aes256gcmsiv_ecb_enc_block:
        mov     rdi, rcx                        ; rdi = input block
        mov     rsi, rdx                        ; rsi = output block
        mov     rdx, r8                         ; rdx = round keys

        vmovdqa xmm1, XMMWORD [rdi]
        vpxor   xmm1, xmm1, XMMWORD [rdx]       ; initial AddRoundKey (round key 0)

        ; Thirteen middle rounds, round keys at byte offsets 16, 32, ..., 208.
%assign ECB256_RK_OFFSET 16
%rep 13
        vaesenc xmm1, xmm1, XMMWORD [rdx + ECB256_RK_OFFSET]
%assign ECB256_RK_OFFSET ECB256_RK_OFFSET + 16
%endrep
%undef ECB256_RK_OFFSET
        vaesenclast xmm1, xmm1, XMMWORD [rdx + 224]     ; final round

        vmovdqa XMMWORD [rsi], xmm1
        mov     rdi, QWORD [rsp + 8]            ;WIN64 epilogue
        mov     rsi, QWORD [rsp + 16]
        DB      0F3h, 0C3h                      ;repret
$L$SEH_end_aes256gcmsiv_ecb_enc_block:
global aes256gcmsiv_enc_msg_x4
ALIGN 16
; aes256gcmsiv_enc_msg_x4
; Encrypts successive counter blocks with an AES-256 key schedule and XORs
; the result with the input, four blocks per main-loop iteration.
; Win64 arguments (register they are moved to in parentheses):
;   rcx (rdi)     = input buffer
;   rdx (rsi)     = output buffer
;   r8  (rdx)     = 16-byte initial counter block (vmovdqa load -> aligned)
;   r9  (rcx)     = round keys, read at byte offsets 0..224
;   [40+rsp] (r8) = length in bytes; 0 returns immediately
; The length is rounded up to whole 16-byte blocks, so a partial final block
; is still loaded and stored as a full 16 bytes.
; Counters: xmm0..xmm3 hold ctr+0..ctr+3; only the low dword is incremented.
; NOTE(review): xmm6-xmm8, xmm12 and xmm15 are overwritten without being saved;
; confirm callers tolerate this, since Win64 treats xmm6-xmm15 as nonvolatile.
aes256gcmsiv_enc_msg_x4:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_aes256gcmsiv_enc_msg_x4:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
mov r8,QWORD[40+rsp]
; Nothing to do for a zero length.
test r8,r8
jnz NEAR $L$256_enc_msg_x4_start
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$256_enc_msg_x4_start:
; r8 = number of 16-byte blocks, rounded up (shl by 60 leaves only len & 15).
mov r10,r8
shr r8,4
shl r10,60
jz NEAR $L$256_enc_msg_x4_start2
add r8,1
$L$256_enc_msg_x4_start2:
; r10 = blocks & 3 (handled one at a time in loop2); r8 becomes blocks / 4 below.
mov r10,r8
shl r10,62
shr r10,62
; Initial counter with the OR_MASK bit (top bit of the last dword) forced on.
vmovdqa xmm15,XMMWORD[rdx]
vpor xmm15,xmm15,XMMWORD[OR_MASK]
vmovdqa xmm4,XMMWORD[four]
vmovdqa xmm0,xmm15
vpaddd xmm1,xmm15,XMMWORD[one]
vpaddd xmm2,xmm15,XMMWORD[two]
vpaddd xmm3,xmm15,XMMWORD[three]
shr r8,2
je NEAR $L$256_enc_msg_x4_check_remainder
; Pre-bias the pointers: the loop advances them at its top.
sub rsi,64
sub rdi,64
$L$256_enc_msg_x4_loop1:
add rsi,64
add rdi,64
; Working copies of the four counters, whitened with round key 0.
vmovdqa xmm5,xmm0
vmovdqa xmm6,xmm1
vmovdqa xmm7,xmm2
vmovdqa xmm8,xmm3
vpxor xmm5,xmm5,XMMWORD[rcx]
vpxor xmm6,xmm6,XMMWORD[rcx]
vpxor xmm7,xmm7,XMMWORD[rcx]
vpxor xmm8,xmm8,XMMWORD[rcx]
; Rounds 1-4; each is followed by advancing one counter by four for the
; next iteration.
vmovdqu xmm12,XMMWORD[16+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vpaddd xmm0,xmm0,xmm4
vmovdqu xmm12,XMMWORD[32+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vpaddd xmm1,xmm1,xmm4
vmovdqu xmm12,XMMWORD[48+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vpaddd xmm2,xmm2,xmm4
vmovdqu xmm12,XMMWORD[64+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vpaddd xmm3,xmm3,xmm4
; Rounds 5-13.
vmovdqu xmm12,XMMWORD[80+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[96+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[112+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[128+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[144+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[160+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[176+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[192+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
vmovdqu xmm12,XMMWORD[208+rcx]
vaesenc xmm5,xmm5,xmm12
vaesenc xmm6,xmm6,xmm12
vaesenc xmm7,xmm7,xmm12
vaesenc xmm8,xmm8,xmm12
; Final round (round key 14).
vmovdqu xmm12,XMMWORD[224+rcx]
vaesenclast xmm5,xmm5,xmm12
vaesenclast xmm6,xmm6,xmm12
vaesenclast xmm7,xmm7,xmm12
vaesenclast xmm8,xmm8,xmm12
; XOR the keystream with 64 bytes of input.
vpxor xmm5,xmm5,XMMWORD[rdi]
vpxor xmm6,xmm6,XMMWORD[16+rdi]
vpxor xmm7,xmm7,XMMWORD[32+rdi]
vpxor xmm8,xmm8,XMMWORD[48+rdi]
; The flags from this sub feed the jne below; the vector stores in between
; do not modify flags.
sub r8,1
vmovdqu XMMWORD[rsi],xmm5
vmovdqu XMMWORD[16+rsi],xmm6
vmovdqu XMMWORD[32+rsi],xmm7
vmovdqu XMMWORD[48+rsi],xmm8
jne NEAR $L$256_enc_msg_x4_loop1
; Step past the last 64-byte group written by the loop.
add rsi,64
add rdi,64
$L$256_enc_msg_x4_check_remainder:
cmp r10,0
je NEAR $L$256_enc_msg_x4_out
; Remaining 1-3 blocks, one per iteration; xmm0 is the running counter.
$L$256_enc_msg_x4_loop2:
vmovdqa xmm5,xmm0
vpaddd xmm0,xmm0,XMMWORD[one]
vpxor xmm5,xmm5,XMMWORD[rcx]
vaesenc xmm5,xmm5,XMMWORD[16+rcx]
vaesenc xmm5,xmm5,XMMWORD[32+rcx]
vaesenc xmm5,xmm5,XMMWORD[48+rcx]
vaesenc xmm5,xmm5,XMMWORD[64+rcx]
vaesenc xmm5,xmm5,XMMWORD[80+rcx]
vaesenc xmm5,xmm5,XMMWORD[96+rcx]
vaesenc xmm5,xmm5,XMMWORD[112+rcx]
vaesenc xmm5,xmm5,XMMWORD[128+rcx]
vaesenc xmm5,xmm5,XMMWORD[144+rcx]
vaesenc xmm5,xmm5,XMMWORD[160+rcx]
vaesenc xmm5,xmm5,XMMWORD[176+rcx]
vaesenc xmm5,xmm5,XMMWORD[192+rcx]
vaesenc xmm5,xmm5,XMMWORD[208+rcx]
vaesenclast xmm5,xmm5,XMMWORD[224+rcx]
vpxor xmm5,xmm5,XMMWORD[rdi]
vmovdqu XMMWORD[rsi],xmm5
add rdi,16
add rsi,16
sub r10,1
jne NEAR $L$256_enc_msg_x4_loop2
$L$256_enc_msg_x4_out:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_aes256gcmsiv_enc_msg_x4:
global aes256gcmsiv_enc_msg_x8
ALIGN 16
; aes256gcmsiv_enc_msg_x8
; Encrypts successive counter blocks with an AES-256 key schedule and XORs
; the result with the input, eight blocks per main-loop iteration.
; Win64 arguments (register they are moved to in parentheses):
;   rcx (rdi)     = input buffer
;   rdx (rsi)     = output buffer
;   r8  (rdx)     = 16-byte initial counter block (vmovdqa load -> aligned)
;   r9  (rcx)     = round keys, read at byte offsets 0..224
;   [40+rsp] (r8) = length in bytes; 0 returns immediately
; The length is rounded up to whole 16-byte blocks, so a partial final block
; is still loaded and stored as a full 16 bytes.
; Counters: xmm0, xmm9..xmm14 hold ctr+0..ctr+6 and the eighth counter
; (ctr+7) is spilled to a 16-byte stack slot addressed by r11.
; Only the low dword of each counter is incremented.
; NOTE(review): xmm6-xmm15 are overwritten without being saved; confirm
; callers tolerate this, since Win64 treats xmm6-xmm15 as nonvolatile.
aes256gcmsiv_enc_msg_x8:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_aes256gcmsiv_enc_msg_x8:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
mov r8,QWORD[40+rsp]
; Nothing to do for a zero length.
test r8,r8
jnz NEAR $L$256_enc_msg_x8_start
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$256_enc_msg_x8_start:
; r11 = spill slot for the eighth counter. Win64 has no red zone, so memory
; below rsp may be overwritten at any time and must not hold live data.
; Use the caller-provided home space instead: [24+rsp]..[39+rsp] are the
; r8/r9 home slots, owned by this function and otherwise unused here
; ([8+rsp]/[16+rsp] hold rdi/rsi, [40+rsp] is the fifth argument).
; rsp is 8 mod 16 at entry and never changes, so [24+rsp] is 16-byte
; aligned as the vmovdqa accesses below require.
lea r11,[24+rsp]
; r8 = number of 16-byte blocks, rounded up (shl by 60 leaves only len & 15).
mov r10,r8
shr r8,4
shl r10,60
jz NEAR $L$256_enc_msg_x8_start2
add r8,1
$L$256_enc_msg_x8_start2:
; r10 = blocks & 7 (handled one at a time in loop2); r8 becomes blocks / 8 below.
mov r10,r8
shl r10,61
shr r10,61
; Initial counter with the OR_MASK bit (top bit of the last dword) forced on.
vmovdqa xmm1,XMMWORD[rdx]
vpor xmm1,xmm1,XMMWORD[OR_MASK]
; ctr+7 lives in the spill slot; ctr+1..ctr+6 in xmm9..xmm14; ctr+0 in xmm0.
vpaddd xmm0,xmm1,XMMWORD[seven]
vmovdqa XMMWORD[r11],xmm0
vpaddd xmm9,xmm1,XMMWORD[one]
vpaddd xmm10,xmm1,XMMWORD[two]
vpaddd xmm11,xmm1,XMMWORD[three]
vpaddd xmm12,xmm1,XMMWORD[four]
vpaddd xmm13,xmm1,XMMWORD[five]
vpaddd xmm14,xmm1,XMMWORD[six]
vmovdqa xmm0,xmm1
shr r8,3
jz NEAR $L$256_enc_msg_x8_check_remainder
; Pre-bias the pointers: the loop advances them at its top.
sub rsi,128
sub rdi,128
$L$256_enc_msg_x8_loop1:
add rsi,128
add rdi,128
; Working copies of the eight counters, whitened with round key 0.
vmovdqa xmm1,xmm0
vmovdqa xmm2,xmm9
vmovdqa xmm3,xmm10
vmovdqa xmm4,xmm11
vmovdqa xmm5,xmm12
vmovdqa xmm6,xmm13
vmovdqa xmm7,xmm14
vmovdqa xmm8,XMMWORD[r11]
vpxor xmm1,xmm1,XMMWORD[rcx]
vpxor xmm2,xmm2,XMMWORD[rcx]
vpxor xmm3,xmm3,XMMWORD[rcx]
vpxor xmm4,xmm4,XMMWORD[rcx]
vpxor xmm5,xmm5,XMMWORD[rcx]
vpxor xmm6,xmm6,XMMWORD[rcx]
vpxor xmm7,xmm7,XMMWORD[rcx]
vpxor xmm8,xmm8,XMMWORD[rcx]
vmovdqu xmm15,XMMWORD[16+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
; Advance the spilled eighth counter by eight. xmm14 is borrowed for this
; (its old value is already copied to xmm7) and rebuilt as slot-1 below.
vmovdqa xmm14,XMMWORD[r11]
vpaddd xmm14,xmm14,XMMWORD[eight]
vmovdqa XMMWORD[r11],xmm14
vmovdqu xmm15,XMMWORD[32+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
; xmm14 = (new eighth counter) - 1 = seventh counter for the next iteration.
vpsubd xmm14,xmm14,XMMWORD[one]
vmovdqu xmm15,XMMWORD[48+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
; The remaining counters advance by eight, one per AES round.
vpaddd xmm0,xmm0,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[64+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm9,xmm9,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[80+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm10,xmm10,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[96+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm11,xmm11,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[112+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm12,xmm12,XMMWORD[eight]
vmovdqu xmm15,XMMWORD[128+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vpaddd xmm13,xmm13,XMMWORD[eight]
; Rounds 9-13.
vmovdqu xmm15,XMMWORD[144+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vmovdqu xmm15,XMMWORD[160+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vmovdqu xmm15,XMMWORD[176+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vmovdqu xmm15,XMMWORD[192+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
vmovdqu xmm15,XMMWORD[208+rcx]
vaesenc xmm1,xmm1,xmm15
vaesenc xmm2,xmm2,xmm15
vaesenc xmm3,xmm3,xmm15
vaesenc xmm4,xmm4,xmm15
vaesenc xmm5,xmm5,xmm15
vaesenc xmm6,xmm6,xmm15
vaesenc xmm7,xmm7,xmm15
vaesenc xmm8,xmm8,xmm15
; Final round (round key 14).
vmovdqu xmm15,XMMWORD[224+rcx]
vaesenclast xmm1,xmm1,xmm15
vaesenclast xmm2,xmm2,xmm15
vaesenclast xmm3,xmm3,xmm15
vaesenclast xmm4,xmm4,xmm15
vaesenclast xmm5,xmm5,xmm15
vaesenclast xmm6,xmm6,xmm15
vaesenclast xmm7,xmm7,xmm15
vaesenclast xmm8,xmm8,xmm15
; XOR the keystream with 128 bytes of input.
vpxor xmm1,xmm1,XMMWORD[rdi]
vpxor xmm2,xmm2,XMMWORD[16+rdi]
vpxor xmm3,xmm3,XMMWORD[32+rdi]
vpxor xmm4,xmm4,XMMWORD[48+rdi]
vpxor xmm5,xmm5,XMMWORD[64+rdi]
vpxor xmm6,xmm6,XMMWORD[80+rdi]
vpxor xmm7,xmm7,XMMWORD[96+rdi]
vpxor xmm8,xmm8,XMMWORD[112+rdi]
; The flags from this sub feed the jne below; the vector stores in between
; do not modify flags.
sub r8,1
vmovdqu XMMWORD[rsi],xmm1
vmovdqu XMMWORD[16+rsi],xmm2
vmovdqu XMMWORD[32+rsi],xmm3
vmovdqu XMMWORD[48+rsi],xmm4
vmovdqu XMMWORD[64+rsi],xmm5
vmovdqu XMMWORD[80+rsi],xmm6
vmovdqu XMMWORD[96+rsi],xmm7
vmovdqu XMMWORD[112+rsi],xmm8
jne NEAR $L$256_enc_msg_x8_loop1
; Step past the last 128-byte group written by the loop.
add rsi,128
add rdi,128
$L$256_enc_msg_x8_check_remainder:
cmp r10,0
je NEAR $L$256_enc_msg_x8_out
; Remaining 1-7 blocks, one per iteration; xmm0 is the running counter.
$L$256_enc_msg_x8_loop2:
vmovdqa xmm1,xmm0
vpaddd xmm0,xmm0,XMMWORD[one]
vpxor xmm1,xmm1,XMMWORD[rcx]
vaesenc xmm1,xmm1,XMMWORD[16+rcx]
vaesenc xmm1,xmm1,XMMWORD[32+rcx]
vaesenc xmm1,xmm1,XMMWORD[48+rcx]
vaesenc xmm1,xmm1,XMMWORD[64+rcx]
vaesenc xmm1,xmm1,XMMWORD[80+rcx]
vaesenc xmm1,xmm1,XMMWORD[96+rcx]
vaesenc xmm1,xmm1,XMMWORD[112+rcx]
vaesenc xmm1,xmm1,XMMWORD[128+rcx]
vaesenc xmm1,xmm1,XMMWORD[144+rcx]
vaesenc xmm1,xmm1,XMMWORD[160+rcx]
vaesenc xmm1,xmm1,XMMWORD[176+rcx]
vaesenc xmm1,xmm1,XMMWORD[192+rcx]
vaesenc xmm1,xmm1,XMMWORD[208+rcx]
vaesenclast xmm1,xmm1,XMMWORD[224+rcx]
vpxor xmm1,xmm1,XMMWORD[rdi]
vmovdqu XMMWORD[rsi],xmm1
add rdi,16
add rsi,16
sub r10,1
jnz NEAR $L$256_enc_msg_x8_loop2
$L$256_enc_msg_x8_out:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_aes256gcmsiv_enc_msg_x8:
global aes256gcmsiv_dec
ALIGN 16
; aes256gcmsiv_dec
; Decrypts whole 16-byte blocks with an AES-256 key schedule in counter
; fashion and folds every decrypted block into a 128-bit accumulator using
; carry-less multiplication by a table of precomputed values, reduced with
; the `poly` constant (presumably the POLYVAL hash of AES-GCM-SIV - confirm).
; Win64 arguments (register they are moved to in parentheses):
;   rcx (rdi)     = input (ciphertext); 16 further bytes at input+length are
;                   read as the initial counter block
;   rdx (rsi)     = output buffer
;   r8  (rdx)     = state: [rdx] is the accumulator (loaded at start, stored
;                   at exit); bytes 16..95 are scratch for five saved blocks.
;                   Accessed with vmovdqa, so must be 16-byte aligned.
;   r9  (rcx)     = multiplier table, six 16-byte entries at offsets 0..80
;   [40+rsp] (r8) = round keys, read at byte offsets 0..224
;   [48+rsp] (r9) = length in bytes; lengths below 16 return immediately and
;                   any trailing partial block is ignored
; Six blocks are decrypted per main-loop iteration while the previous six
; decrypted blocks are hashed; leftovers go through a one-block loop that
; calls GFMUL.
; NOTE(review): xmm6-xmm13 and xmm15 are overwritten without being saved;
; confirm callers tolerate this, since Win64 treats xmm6-xmm15 as nonvolatile.
aes256gcmsiv_dec:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_aes256gcmsiv_dec:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
mov r8,QWORD[40+rsp]
mov r9,QWORD[48+rsp]
; Return early unless at least one full block is present.
test r9,~15
jnz NEAR $L$256_dec_start
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$256_dec_start:
vzeroupper
; xmm0 = running accumulator.
vmovdqa xmm0,XMMWORD[rdx]
; rax = state+32 and rcx = table+32, so the biased offsets -32..48 below
; address state bytes 0..95 and table entries 0..5.
mov rax,rdx
lea rax,[32+rax]
lea rcx,[32+rcx]
; Initial counter = the 16 bytes following the input, with the OR_MASK bit set.
vmovdqu xmm15,XMMWORD[r9*1+rdi]
vpor xmm15,xmm15,XMMWORD[OR_MASK]
; Work on whole blocks only.
and r9,~15
cmp r9,96
jb NEAR $L$256_dec_loop2
; First group of six blocks: decrypt only; hashing lags one group behind.
sub r9,96
; xmm7..xmm12 = ctr+0..ctr+5, xmm15 = ctr+6 for the next group.
vmovdqa xmm7,xmm15
vpaddd xmm8,xmm7,XMMWORD[one]
vpaddd xmm9,xmm7,XMMWORD[two]
vpaddd xmm10,xmm9,XMMWORD[one]
vpaddd xmm11,xmm9,XMMWORD[two]
vpaddd xmm12,xmm11,XMMWORD[one]
vpaddd xmm15,xmm11,XMMWORD[two]
vpxor xmm7,xmm7,XMMWORD[r8]
vpxor xmm8,xmm8,XMMWORD[r8]
vpxor xmm9,xmm9,XMMWORD[r8]
vpxor xmm10,xmm10,XMMWORD[r8]
vpxor xmm11,xmm11,XMMWORD[r8]
vpxor xmm12,xmm12,XMMWORD[r8]
vmovdqu xmm4,XMMWORD[16+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[32+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[48+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[64+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[80+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[96+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[112+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[128+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[144+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[160+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[176+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[192+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[208+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[224+r8]
vaesenclast xmm7,xmm7,xmm4
vaesenclast xmm8,xmm8,xmm4
vaesenclast xmm9,xmm9,xmm4
vaesenclast xmm10,xmm10,xmm4
vaesenclast xmm11,xmm11,xmm4
vaesenclast xmm12,xmm12,xmm4
; Keystream XOR input = decrypted blocks; they stay in xmm7..xmm12 so the
; next stage can hash them.
vpxor xmm7,xmm7,XMMWORD[rdi]
vpxor xmm8,xmm8,XMMWORD[16+rdi]
vpxor xmm9,xmm9,XMMWORD[32+rdi]
vpxor xmm10,xmm10,XMMWORD[48+rdi]
vpxor xmm11,xmm11,XMMWORD[64+rdi]
vpxor xmm12,xmm12,XMMWORD[80+rdi]
vmovdqu XMMWORD[rsi],xmm7
vmovdqu XMMWORD[16+rsi],xmm8
vmovdqu XMMWORD[32+rsi],xmm9
vmovdqu XMMWORD[48+rsi],xmm10
vmovdqu XMMWORD[64+rsi],xmm11
vmovdqu XMMWORD[80+rsi],xmm12
add rdi,96
add rsi,96
jmp NEAR $L$256_dec_loop1
ALIGN 64
; Main loop: hash the previous six decrypted blocks while decrypting the
; next six. The AES rounds and carry-less multiplies are interleaved.
$L$256_dec_loop1:
cmp r9,96
jb NEAR $L$256_dec_finish_96
sub r9,96
; Keep the previous group's last block in xmm6 and park the other five in
; the scratch area, freeing xmm7..xmm12 for the new counters.
vmovdqa xmm6,xmm12
vmovdqa XMMWORD[(16-32)+rax],xmm11
vmovdqa XMMWORD[(32-32)+rax],xmm10
vmovdqa XMMWORD[(48-32)+rax],xmm9
vmovdqa XMMWORD[(64-32)+rax],xmm8
vmovdqa XMMWORD[(80-32)+rax],xmm7
vmovdqa xmm7,xmm15
vpaddd xmm8,xmm7,XMMWORD[one]
vpaddd xmm9,xmm7,XMMWORD[two]
vpaddd xmm10,xmm9,XMMWORD[one]
vpaddd xmm11,xmm9,XMMWORD[two]
vpaddd xmm12,xmm11,XMMWORD[one]
vpaddd xmm15,xmm11,XMMWORD[two]
vmovdqa xmm4,XMMWORD[r8]
vpxor xmm7,xmm7,xmm4
vpxor xmm8,xmm8,xmm4
vpxor xmm9,xmm9,xmm4
vpxor xmm10,xmm10,xmm4
vpxor xmm11,xmm11,xmm4
vpxor xmm12,xmm12,xmm4
; Last block of the previous group times table entry 0. Product parts:
; xmm2 = high*high, xmm3 = low*low, xmm1 = sum of the two cross terms.
vmovdqu xmm4,XMMWORD[((0-32))+rcx]
vpclmulqdq xmm2,xmm6,xmm4,0x11
vpclmulqdq xmm3,xmm6,xmm4,0x00
vpclmulqdq xmm1,xmm6,xmm4,0x01
vpclmulqdq xmm4,xmm6,xmm4,0x10
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[16+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
; Saved block at state+16 times table entry 1, accumulated into xmm1/2/3.
vmovdqu xmm6,XMMWORD[((-16))+rax]
vmovdqu xmm13,XMMWORD[((-16))+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[32+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
; Saved block at state+32 times table entry 2.
vmovdqu xmm6,XMMWORD[rax]
vmovdqu xmm13,XMMWORD[rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[48+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
; Saved block at state+48 times table entry 3.
vmovdqu xmm6,XMMWORD[16+rax]
vmovdqu xmm13,XMMWORD[16+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[64+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
; Saved block at state+64 times table entry 4.
vmovdqu xmm6,XMMWORD[32+rax]
vmovdqu xmm13,XMMWORD[32+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[80+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[96+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[112+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
; First block of the previous group (state+80) is XORed with the running
; accumulator before being multiplied by table entry 5.
vmovdqa xmm6,XMMWORD[((80-32))+rax]
vpxor xmm6,xmm6,xmm0
vmovdqu xmm5,XMMWORD[((80-32))+rcx]
vpclmulqdq xmm4,xmm6,xmm5,0x01
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x10
vpxor xmm1,xmm1,xmm4
vmovdqu xmm4,XMMWORD[128+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
; Fold the cross terms into the 256-bit product:
; xmm5 = high half, xmm0 = low half.
vpsrldq xmm4,xmm1,8
vpxor xmm5,xmm2,xmm4
vpslldq xmm4,xmm1,8
vpxor xmm0,xmm3,xmm4
; xmm3 = reduction constant for the two folding steps below.
vmovdqa xmm3,XMMWORD[poly]
vmovdqu xmm4,XMMWORD[144+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[160+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[176+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[192+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
vmovdqu xmm4,XMMWORD[208+r8]
vaesenc xmm7,xmm7,xmm4
vaesenc xmm8,xmm8,xmm4
vaesenc xmm9,xmm9,xmm4
vaesenc xmm10,xmm10,xmm4
vaesenc xmm11,xmm11,xmm4
vaesenc xmm12,xmm12,xmm4
; xmm6 = last round key; it is pre-XORed with each input block so that
; vaesenclast yields the decrypted block directly.
vmovdqu xmm6,XMMWORD[224+r8]
; First reduction step: swap the halves of xmm0 and XOR with xmm0 * poly.
vpalignr xmm2,xmm0,xmm0,8
vpclmulqdq xmm0,xmm0,xmm3,0x10
vpxor xmm0,xmm2,xmm0
vpxor xmm4,xmm6,XMMWORD[rdi]
vaesenclast xmm7,xmm7,xmm4
vpxor xmm4,xmm6,XMMWORD[16+rdi]
vaesenclast xmm8,xmm8,xmm4
vpxor xmm4,xmm6,XMMWORD[32+rdi]
vaesenclast xmm9,xmm9,xmm4
vpxor xmm4,xmm6,XMMWORD[48+rdi]
vaesenclast xmm10,xmm10,xmm4
vpxor xmm4,xmm6,XMMWORD[64+rdi]
vaesenclast xmm11,xmm11,xmm4
vpxor xmm4,xmm6,XMMWORD[80+rdi]
vaesenclast xmm12,xmm12,xmm4
; Second reduction step.
vpalignr xmm2,xmm0,xmm0,8
vpclmulqdq xmm0,xmm0,xmm3,0x10
vpxor xmm0,xmm2,xmm0
vmovdqu XMMWORD[rsi],xmm7
vmovdqu XMMWORD[16+rsi],xmm8
vmovdqu XMMWORD[32+rsi],xmm9
vmovdqu XMMWORD[48+rsi],xmm10
vmovdqu XMMWORD[64+rsi],xmm11
vmovdqu XMMWORD[80+rsi],xmm12
; Add the high half to complete the new accumulator value.
vpxor xmm0,xmm0,xmm5
lea rdi,[96+rdi]
lea rsi,[96+rsi]
jmp NEAR $L$256_dec_loop1
; Fewer than 96 bytes remain: hash the six decrypted blocks still held in
; xmm7..xmm12, with no AES work interleaved.
$L$256_dec_finish_96:
vmovdqa xmm6,xmm12
vmovdqa XMMWORD[(16-32)+rax],xmm11
vmovdqa XMMWORD[(32-32)+rax],xmm10
vmovdqa XMMWORD[(48-32)+rax],xmm9
vmovdqa XMMWORD[(64-32)+rax],xmm8
vmovdqa XMMWORD[(80-32)+rax],xmm7
; Same six multiply-accumulate steps as in the main loop
; (xmm1 = cross terms, xmm2 = high, xmm3 = low).
vmovdqu xmm4,XMMWORD[((0-32))+rcx]
vpclmulqdq xmm1,xmm6,xmm4,0x10
vpclmulqdq xmm2,xmm6,xmm4,0x11
vpclmulqdq xmm3,xmm6,xmm4,0x00
vpclmulqdq xmm4,xmm6,xmm4,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm6,XMMWORD[((-16))+rax]
vmovdqu xmm13,XMMWORD[((-16))+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm6,XMMWORD[rax]
vmovdqu xmm13,XMMWORD[rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm6,XMMWORD[16+rax]
vmovdqu xmm13,XMMWORD[16+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
vmovdqu xmm6,XMMWORD[32+rax]
vmovdqu xmm13,XMMWORD[32+rcx]
vpclmulqdq xmm4,xmm6,xmm13,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm13,0x01
vpxor xmm1,xmm1,xmm4
; First block of the group XOR accumulator, times table entry 5.
vmovdqu xmm6,XMMWORD[((80-32))+rax]
vpxor xmm6,xmm6,xmm0
vmovdqu xmm5,XMMWORD[((80-32))+rcx]
vpclmulqdq xmm4,xmm6,xmm5,0x11
vpxor xmm2,xmm2,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x00
vpxor xmm3,xmm3,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x10
vpxor xmm1,xmm1,xmm4
vpclmulqdq xmm4,xmm6,xmm5,0x01
vpxor xmm1,xmm1,xmm4
; Fold the cross terms, then reduce in two steps with `poly`.
vpsrldq xmm4,xmm1,8
vpxor xmm5,xmm2,xmm4
vpslldq xmm4,xmm1,8
vpxor xmm0,xmm3,xmm4
vmovdqa xmm3,XMMWORD[poly]
vpalignr xmm2,xmm0,xmm0,8
vpclmulqdq xmm0,xmm0,xmm3,0x10
vpxor xmm0,xmm2,xmm0
vpalignr xmm2,xmm0,xmm0,8
vpclmulqdq xmm0,xmm0,xmm3,0x10
vpxor xmm0,xmm2,xmm0
vpxor xmm0,xmm0,xmm5
; Tail: one block at a time. Decrypt with counter xmm15, XOR the decrypted
; block into the accumulator, then multiply by table entry 0 via GFMUL.
$L$256_dec_loop2:
cmp r9,16
jb NEAR $L$256_dec_out
sub r9,16
vmovdqa xmm2,xmm15
vpaddd xmm15,xmm15,XMMWORD[one]
vpxor xmm2,xmm2,XMMWORD[r8]
vaesenc xmm2,xmm2,XMMWORD[16+r8]
vaesenc xmm2,xmm2,XMMWORD[32+r8]
vaesenc xmm2,xmm2,XMMWORD[48+r8]
vaesenc xmm2,xmm2,XMMWORD[64+r8]
vaesenc xmm2,xmm2,XMMWORD[80+r8]
vaesenc xmm2,xmm2,XMMWORD[96+r8]
vaesenc xmm2,xmm2,XMMWORD[112+r8]
vaesenc xmm2,xmm2,XMMWORD[128+r8]
vaesenc xmm2,xmm2,XMMWORD[144+r8]
vaesenc xmm2,xmm2,XMMWORD[160+r8]
vaesenc xmm2,xmm2,XMMWORD[176+r8]
vaesenc xmm2,xmm2,XMMWORD[192+r8]
vaesenc xmm2,xmm2,XMMWORD[208+r8]
vaesenclast xmm2,xmm2,XMMWORD[224+r8]
vpxor xmm2,xmm2,XMMWORD[rdi]
vmovdqu XMMWORD[rsi],xmm2
add rdi,16
add rsi,16
vpxor xmm0,xmm0,xmm2
; GFMUL computes xmm0 = xmm0 * xmm1 (reduced) and overwrites xmm2-xmm5.
vmovdqa xmm1,XMMWORD[((-32))+rcx]
call GFMUL
jmp NEAR $L$256_dec_loop2
$L$256_dec_out:
; Write the updated accumulator back to the state buffer.
vmovdqu XMMWORD[rdx],xmm0
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_aes256gcmsiv_dec:
global aes256gcmsiv_kdf
ALIGN 16
aes256gcmsiv_kdf:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_aes256gcmsiv_kdf:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
vmovdqa xmm1,XMMWORD[rdx]
vmovdqa xmm4,XMMWORD[rdi]
vmovdqa xmm11,XMMWORD[and_mask]
vmovdqa xmm8,XMMWORD[one]
vpshufd xmm4,xmm4,0x90
vpand xmm4,xmm4,xmm11
vpaddd xmm6,xmm4,xmm8
vpaddd xmm7,xmm6,xmm8
vpaddd xmm11,xmm7,xmm8
vpaddd xmm12,xmm11,xmm8
vpaddd xmm13,xmm12,xmm8
vpxor xmm4,xmm4,xmm1
vpxor xmm6,xmm6,xmm1
vpxor xmm7,xmm7,xmm1
vpxor xmm11,xmm11,xmm1
vpxor xmm12,xmm12,xmm1
vpxor xmm13,xmm13,xmm1
vmovdqa xmm1,XMMWORD[16+rdx]
vaesenc xmm4,xmm4,xmm1
vaesenc xmm6,xmm6,xmm1
vaesenc xmm7,xmm7,xmm1
vaesenc xmm11,xmm11,xmm1
vaesenc xmm12,xmm12,xmm1
vaesenc xmm13,xmm13,xmm1
vmovdqa xmm2,XMMWORD[32+rdx]
vaesenc xmm4,xmm4,xmm2
vaesenc xmm6,xmm6,xmm2
vaesenc xmm7,xmm7,xmm2
vaesenc xmm11,xmm11,xmm2
vaesenc xmm12,xmm12,xmm2
vaesenc xmm13,xmm13,xmm2
vmovdqa xmm1,XMMWORD[48+rdx]
vaesenc xmm4,xmm4,xmm1
vaesenc xmm6,xmm6,xmm1
vaesenc xmm7,xmm7,xmm1
vaesenc xmm11,xmm11,xmm1
vaesenc xmm12,xmm12,xmm1
vaesenc xmm13,xmm13,xmm1
vmovdqa xmm2,XMMWORD[64+rdx]
vaesenc xmm4,xmm4,xmm2
vaesenc xmm6,xmm6,xmm2
vaesenc xmm7,xmm7,xmm2
vaesenc xmm11,xmm11,xmm2
vaesenc xmm12,xmm12,xmm2
vaesenc xmm13,xmm13,xmm2
vmovdqa xmm1,XMMWORD[80+rdx]
vaesenc xmm4,xmm4,xmm1
vaesenc xmm6,xmm6,xmm1
vaesenc xmm7,xmm7,xmm1
vaesenc xmm11,xmm11,xmm1
vaesenc xmm12,xmm12,xmm1
vaesenc xmm13,xmm13,xmm1
vmovdqa xmm2,XMMWORD[96+rdx]
vaesenc xmm4,xmm4,xmm2
vaesenc xmm6,xmm6,xmm2
vaesenc xmm7,xmm7,xmm2
vaesenc xmm11,xmm11,xmm2
vaesenc xmm12,xmm12,xmm2
vaesenc xmm13,xmm13,xmm2
vmovdqa xmm1,XMMWORD[112+rdx]
vaesenc xmm4,xmm4,xmm1
vaesenc xmm6,xmm6,xmm1
vaesenc xmm7,xmm7,xmm1
vaesenc xmm11,xmm11,xmm1
vaesenc xmm12,xmm12,xmm1
vaesenc xmm13,xmm13,xmm1
vmovdqa xmm2,XMMWORD[128+rdx]
vaesenc xmm4,xmm4,xmm2
vaesenc xmm6,xmm6,xmm2
vaesenc xmm7,xmm7,xmm2
vaesenc xmm11,xmm11,xmm2
vaesenc xmm12,xmm12,xmm2
vaesenc xmm13,xmm13,xmm2
vmovdqa xmm1,XMMWORD[144+rdx]
vaesenc xmm4,xmm4,xmm1
vaesenc xmm6,xmm6,xmm1
vaesenc xmm7,xmm7,xmm1
vaesenc xmm11,xmm11,xmm1
vaesenc xmm12,xmm12,xmm1
vaesenc xmm13,xmm13,xmm1
vmovdqa xmm2,XMMWORD[160+rdx]
vaesenc xmm4,xmm4,xmm2
vaesenc xmm6,xmm6,xmm2
vaesenc xmm7,xmm7,xmm2
vaesenc xmm11,xmm11,xmm2
vaesenc xmm12,xmm12,xmm2
vaesenc xmm13,xmm13,xmm2
vmovdqa xmm1,XMMWORD[176+rdx]
vaesenc xmm4,xmm4,xmm1
vaesenc xmm6,xmm6,xmm1
vaesenc xmm7,xmm7,xmm1
vaesenc xmm11,xmm11,xmm1
vaesenc xmm12,xmm12,xmm1
vaesenc xmm13,xmm13,xmm1
vmovdqa xmm2,XMMWORD[192+rdx]
vaesenc xmm4,xmm4,xmm2
vaesenc xmm6,xmm6,xmm2
vaesenc xmm7,xmm7,xmm2
vaesenc xmm11,xmm11,xmm2
vaesenc xmm12,xmm12,xmm2
v