# src/third_party/boringssl/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl - cobalt - Git at Google

 #!/usr/bin/env perl

 # Copyright (c) 2015, CloudFlare Ltd.
 #
 # Permission to use, copy, modify, and/or distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
 # copyright notice and this permission notice appear in all copies.
 #
 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

 ##############################################################################
 #                                                                            #
 # Author:  Vlad Krasnov                                                      #
 #                                                                            #
 ##############################################################################

 # Command-line handling: the script is invoked as "<flavour> <output>". When
 # the first argument contains a dot it is taken to be the output file name
 # and no flavour is used.
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

 # Select Win64 mode for nasm/masm/mingw64 flavours or a ".asm" output file.
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

 # Locate the x86_64-xlate.pl translator: next to this script first, then in
 # ../../perlasm/ relative to it.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";

 # Route everything printed to STDOUT through the translator. Fail loudly if
 # the pipe cannot be created instead of silently producing no output.
 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
     or die "can't call $xlate: $!";
 *STDOUT=*OUT;

 # Values above 1 enable the AVX2 dispatch that is emitted further down.
 $avx = 2;

 # Constant pool appended to the generated assembly:
 #   .chacha20_consts  the ASCII string "expand 32-byte k", stored twice.
 #   .rol8 / .rol16    byte-shuffle masks, each stored twice; chacha_qr and
 #                     gen_chacha_round pass them to pshufb.
 #   .avx2_init, .sse_inc, .avx2_inc
 #                     counter vectors; .sse_inc is what prep_state adds to
 #                     the counter row with paddd.
 #   .clamp            32-byte mask; the code in this file applies it with
 #                     pand before saving the result to $r_store. The upper
 #                     16 bytes are all ones.
 #   .and_masks        sixteen 16-byte rows; row i (0-based) has its low i+1
 #                     bytes set to 0xff and the remaining bytes zero.
 $code.=<<___;
 .text
 .extern OPENSSL_ia32cap_P

 chacha20_poly1305_constants:

 .align 64
 .chacha20_consts:
 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
 .rol8:
 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
 .rol16:
 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
 .avx2_init:
 .long 0,0,0,0
 .sse_inc:
 .long 1,0,0,0
 .avx2_inc:
 .long 2,0,0,0,2,0,0,0
 .clamp:
 .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
 .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
 .align 16
 .and_masks:
 .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 ___

 # General-purpose register assignments. Note the sharing: $adp and $itr1 are
 # both %rcx, and $keyp and $t3 are both %r9, so the two members of each pair
 # cannot hold live values at the same time.
 my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=qw(%rdi %rsi %rbx %rcx %r9 %rcx %r8);
 # Accumulator registers updated by poly_add and the poly_* stages.
 my ($acc0,$acc1,$acc2)=("%r10","%r11","%r12");
 # Scratch registers used by the poly_* stages.
 my ($t0,$t1,$t2,$t3)=qw(%r13 %r14 %r15 %r9);
 # %xmm0..%xmm15 in order: rows A, B, C, D of four states (suffix 0..3).
 my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",0..15);
 # $T0..$T3 are second names for the registers of state 3.
 my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
 # Ten consecutive 16-byte slots addressed relative to %rbp.
 my ($r_store,$s_store,$len_store,$state1_store,$state2_store,
     $tmp_store,$ctr0_store,$ctr1_store,$ctr2_store,$ctr3_store)=
     map($_ . "*16(%rbp)", 0..9);

 # Append one SSE quarter-round to $code.
 #
 # Arguments: the four row registers, a scratch register, and a mode string.
 # The mode may contain any of:
 #   "store" - save the scratch register to $tmp_store before the round;
 #   "left"  - afterwards rotate rows b, c, d by 4, 8, 12 bytes with palignr;
 #   "right" - afterwards rotate rows b, c, d by 12, 8, 4 bytes with palignr;
 #   "load"  - reload the scratch register from $tmp_store at the very end.
 sub chacha_qr {
 my ($row_a,$row_b,$row_c,$row_d,$scratch,$mode)=@_;
 if ($mode =~ /store/) {
     $code.="movdqa $scratch, $tmp_store\n";
 }
 # The rotations by 16 and 8 bits use pshufb with the .rol16/.rol8 masks; the
 # other two are built from a left shift, a right shift and an xor.
 $code.="paddd $row_b, $row_a
         pxor $row_a, $row_d
         pshufb .rol16(%rip), $row_d
         paddd $row_d, $row_c
         pxor $row_c, $row_b
         movdqa $row_b, $scratch
         pslld \$12, $scratch
         psrld \$20, $row_b
         pxor $scratch, $row_b
         paddd $row_b, $row_a
         pxor $row_a, $row_d
         pshufb .rol8(%rip), $row_d
         paddd $row_d, $row_c
         pxor $row_c, $row_b
         movdqa $row_b, $scratch
         pslld \$7, $scratch
         psrld \$25, $row_b
         pxor $scratch, $row_b\n";
 if ($mode =~ /left/) {
     $code.="palignr \$4, $row_b, $row_b
         palignr \$8, $row_c, $row_c
         palignr \$12, $row_d, $row_d\n";
 }
 if ($mode =~ /right/) {
     $code.="palignr \$12, $row_b, $row_b
         palignr \$8, $row_c, $row_c
         palignr \$4, $row_d, $row_d\n";
 }
 if ($mode =~ /load/) {
     $code.="movdqa $tmp_store, $scratch\n";
 }
 }

 # Append code that adds the 16 bytes at memory operand $mem into
 # $acc0:$acc1 and then adds 1 plus the carry into $acc2.
 sub poly_add {
 my ($mem)=@_;
 my @insns=(
     "add $mem, $acc0",
     "adc 8+$mem, $acc1",
     "adc \$1, $acc2",
 );
 $code.=join("\n         ", @insns)."\n";
 }

 # First multiplication stage: multiply the accumulator by the low quadword
 # stored at $r_store. On exit $t0 holds the low half of acc0*r0, $t1 the high
 # half of acc0*r0 plus the low half of acc1*r0, and $t2 holds acc2*r0 plus
 # the high half of acc1*r0 and the carry.
 sub poly_stage1 {
 my @insns=(
     "mov 0+$r_store, %rax",
     "mov %rax, $t2",
     "mul $acc0",
     "mov %rax, $t0",
     "mov %rdx, $t1",
     "mov 0+$r_store, %rax",
     "mul $acc1",
     "imulq $acc2, $t2",
     "add %rax, $t1",
     "adc %rdx, $t2",
 );
 $code.=join("\n         ", @insns)."\n";
 }

 # Second multiplication stage: multiply $acc0 and $acc1 by the quadword at
 # 8+$r_store. The low halves are added into $t1 and $t2; the high half of
 # the first product is parked in $acc0 and that of the second is left in
 # %rdx for poly_stage3. A copy of the multiplier is kept in $t3.
 sub poly_stage2 {
 my @insns=(
     "mov 8+$r_store, %rax",
     "mov %rax, $t3",
     "mul $acc0",
     "add %rax, $t1",
     "adc \$0, %rdx",
     "mov %rdx, $acc0",
     "mov 8+$r_store, %rax",
     "mul $acc1",
     "add %rax, $t2",
     "adc \$0, %rdx",
 );
 $code.=join("\n         ", @insns)."\n";
 }

 # Third multiplication stage: multiply $t3 by $acc2, then fold the two high
 # halves left behind by poly_stage2 ($acc0 and %rdx) into $t2 and $t3.
 sub poly_stage3 {
 my @insns=(
     "imulq $acc2, $t3",
     "add $acc0, $t2",
     "adc %rdx, $t3",
 );
 $code.=join("\n         ", @insns)."\n";
 }

 # Reduction stage: take the product held in $t0..$t3 and fold it back into
 # $acc0..$acc2. The low 130 bits ($t0, $t1 and the low two bits of $t2)
 # become the new accumulator; the remaining high part is added in twice,
 # once with its low two bits cleared and once shifted right by two.
 sub poly_reduce_stage {
 my @insns=(
     "mov $t0, $acc0",
     "mov $t1, $acc1",
     "mov $t2, $acc2",
     "and \$3, $acc2",
     "mov $t2, $t0",
     "and \$-4, $t0",
     "mov $t3, $t1",
     "shrd \$2, $t3, $t2",
     "shr \$2, $t3",
     "add $t0, $acc0",
     "adc $t1, $acc1",
     "adc \$0, $acc2",
     "add $t2, $acc0",
     "adc $t3, $acc1",
     "adc \$0, $acc2",
 );
 $code.=join("\n         ", @insns)."\n";
 }

 # Append a complete multiply-and-reduce step: the three multiplication
 # stages followed by the reduction stage, in that order.
 sub poly_mul {
     foreach my $stage (\&poly_stage1, \&poly_stage2, \&poly_stage3) {
         $stage->();
     }
     return poly_reduce_stage();
 }

 # Append code that loads $n (1..4) fresh states into the SSE registers.
 #
 # Rows A, B and C of state 0 come from .chacha20_consts, $state1_store and
 # $state2_store and are copied into states 1..$n-1. The counter row is read
 # from $ctr0_store and incremented with .sse_inc once per state: state $n-1
 # receives the first increment and state 0 the last. Each state's counter
 # row is written back to its $ctrK_store slot.
 #
 # $n is a number, so it is compared with the numeric operators.
 sub prep_state {
 my ($n)=@_;
 $code.="movdqa .chacha20_consts(%rip), $A0
         movdqa $state1_store, $B0
         movdqa $state2_store, $C0\n";
 $code.="movdqa $A0, $A1
         movdqa $B0, $B1
         movdqa $C0, $C1\n" if ($n >= 2);
 $code.="movdqa $A0, $A2
         movdqa $B0, $B2
         movdqa $C0, $C2\n" if ($n >= 3);
 $code.="movdqa $A0, $A3
         movdqa $B0, $B3
         movdqa $C0, $C3\n" if ($n >= 4);
 $code.="movdqa $ctr0_store, $D0
         paddd .sse_inc(%rip), $D0
         movdqa $D0, $ctr0_store\n" if ($n == 1);
 $code.="movdqa $ctr0_store, $D1
         paddd .sse_inc(%rip), $D1
         movdqa $D1, $D0
         paddd .sse_inc(%rip), $D0
         movdqa $D0, $ctr0_store
         movdqa $D1, $ctr1_store\n" if ($n == 2);
 $code.="movdqa $ctr0_store, $D2
         paddd .sse_inc(%rip), $D2
         movdqa $D2, $D1
         paddd .sse_inc(%rip), $D1
         movdqa $D1, $D0
         paddd .sse_inc(%rip), $D0
         movdqa $D0, $ctr0_store
         movdqa $D1, $ctr1_store
         movdqa $D2, $ctr2_store\n" if ($n == 3);
 $code.="movdqa $ctr0_store, $D3
         paddd .sse_inc(%rip), $D3
         movdqa $D3, $D2
         paddd .sse_inc(%rip), $D2
         movdqa $D2, $D1
         paddd .sse_inc(%rip), $D1
         movdqa $D1, $D0
         paddd .sse_inc(%rip), $D0
         movdqa $D0, $ctr0_store
         movdqa $D1, $ctr1_store
         movdqa $D2, $ctr2_store
         movdqa $D3, $ctr3_store\n" if ($n == 4);
 }

 # Append code that adds the saved input state back into $n (1..4) working
 # states: .chacha20_consts into row A, $state1_store into row B,
 # $state2_store into row C and the matching $ctrK_store into row D. The
 # states are handled from the highest index down to state 0.
 #
 # $n is a number, so it is compared with the numeric operators.
 sub finalize_state {
 my ($n)=@_;
 $code.="paddd .chacha20_consts(%rip), $A3
         paddd $state1_store, $B3
         paddd $state2_store, $C3
         paddd $ctr3_store, $D3\n" if ($n == 4);
 $code.="paddd .chacha20_consts(%rip), $A2
         paddd $state1_store, $B2
         paddd $state2_store, $C2
         paddd $ctr2_store, $D2\n" if ($n >= 3);
 $code.="paddd .chacha20_consts(%rip), $A1
         paddd $state1_store, $B1
         paddd $state2_store, $C1
         paddd $ctr1_store, $D1\n" if ($n >= 2);
 $code.="paddd .chacha20_consts(%rip), $A0
         paddd $state1_store, $B0
         paddd $state2_store, $C0
         paddd $ctr0_store, $D0\n";
 }

 # Append code that xors 64 bytes of input with one state and writes the
 # result out. The input at $disp($inp) is loaded into the state-3 registers
 # ($A3..$D3), combined with the four given operands, and stored at
 # $disp($oup). The fourth operand is only ever read (the result of that xor
 # lands in $D3), whereas the first three are overwritten.
 sub xor_stream {
 my ($ks_a, $ks_b, $ks_c, $ks_d, $disp)=@_;
 my @loaded=($A3,$B3,$C3,$D3);
 my @result=($ks_a,$ks_b,$ks_c,$D3);
 my @insns=(
     map("movdqu $_*16 + $disp($inp), $loaded[$_]", 0..3),
     "pxor $A3, $ks_a",
     "pxor $B3, $ks_b",
     "pxor $C3, $ks_c",
     "pxor $ks_d, $D3",
     map("movdqu $result[$_], $_*16 + $disp($oup)", 0..3),
 );
 $code.=join("\n         ", @insns)."\n";
 }

 # Variant of xor_stream that needs only one spare register. $scratch is first
 # saved to $tmp_store (it is not restored here); then each 16-byte chunk at
 # $disp($inp) is loaded into $scratch, xored with the matching operand and
 # stored at $disp($oup). The four operands themselves are left unmodified.
 sub xor_stream_using_temp {
 my ($ks_a, $ks_b, $ks_c, $ks_d, $disp, $scratch)=@_;
 my @insns=("movdqa $scratch, $tmp_store");
 my $chunk=0;
 foreach my $ks ($ks_a, $ks_b, $ks_c, $ks_d) {
     push @insns,
         "movdqu $chunk*16 + $disp($inp), $scratch",
         "pxor $ks, $scratch",
         "movdqu $scratch, $chunk*16 + $disp($oup)";
     $chunk++;
 }
 $code.=join("\n         ", @insns)."\n";
 }

 # Build (and return, without appending to $code) the text of one half of a
 # quarter-round applied to all four states at once.
 #
 #   $rot1  - right-shift count used to rotate the B rows (20 or 25); it is a
 #            number and is therefore compared numerically.
 #   $rot2  - memory operand holding the pshufb mask for the D rows.
 #   $shift - optional; "left" or "right" appends the palignr row rotations.
 #
 # All sixteen registers are occupied by the four states, so $C0 doubles as
 # the temporary and its real value is parked in $tmp_store. A call with
 # $rot1 == 20 spills $C0 first and returns with it still spilled; a call with
 # a $shift expects it spilled on entry and restores it before the palignr
 # block.
 #
 # NOTE(review): $s1, $s2 and $s3 are not declared with "my", so they are
 # package globals; left as-is because code outside this view may share them.
 sub gen_chacha_round {
 my ($rot1, $rot2, $shift)=@_;
 my $round="";
 $round.="movdqa $C0, $tmp_store\n" if ($rot1 == 20);
 $round.="movdqa $rot2, $C0
          paddd $B3, $A3
          paddd $B2, $A2
          paddd $B1, $A1
          paddd $B0, $A0
          pxor $A3, $D3
          pxor $A2, $D2
          pxor $A1, $D1
          pxor $A0, $D0
          pshufb $C0, $D3
          pshufb $C0, $D2
          pshufb $C0, $D1
          pshufb $C0, $D0
          movdqa $tmp_store, $C0
          paddd $D3, $C3
          paddd $D2, $C2
          paddd $D1, $C1
          paddd $D0, $C0
          pxor $C3, $B3
          pxor $C2, $B2
          pxor $C1, $B1
          pxor $C0, $B0
          movdqa $C0, $tmp_store
          movdqa $B3, $C0
          psrld \$$rot1, $C0
          pslld \$32-$rot1, $B3
          pxor $C0, $B3
          movdqa $B2, $C0
          psrld \$$rot1, $C0
          pslld \$32-$rot1, $B2
          pxor $C0, $B2
          movdqa $B1, $C0
          psrld \$$rot1, $C0
          pslld \$32-$rot1, $B1
          pxor $C0, $B1
          movdqa $B0, $C0
          psrld \$$rot1, $C0
          pslld \$32-$rot1, $B0
          pxor $C0, $B0\n";
 # Byte counts for rotating rows B, C and D respectively.
 ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
 ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
 $round.="movdqa $tmp_store, $C0
          palignr \$$s1, $B3, $B3
          palignr \$$s2, $C3, $C3
          palignr \$$s3, $D3, $D3
          palignr \$$s1, $B2, $B2
          palignr \$$s2, $C2, $C2
          palignr \$$s3, $D2, $D2
          palignr \$$s1, $B1, $B1
          palignr \$$s2, $C1, $C1
          palignr \$$s3, $D1, $D1
          palignr \$$s1, $B0, $B0
          palignr \$$s2, $C0, $C0
          palignr \$$s3, $D0, $D0\n"
 if (($shift =~ /left/) || ($shift =~ /right/));
 return $round;
 };

 # Text of one double round over four states: two quarter-round halves ending
 # in the "left" row rotation, then two more ending in the "right" one.
 $chacha_body = "";
 foreach my $round_args ([20, ".rol16(%rip)"],
                         [25, ".rol8(%rip)", "left"],
                         [20, ".rol16(%rip)"],
                         [25, ".rol8(%rip)", "right"]) {
     $chacha_body .= &gen_chacha_round(@$round_args);
 }

 # Queue of not-yet-emitted lines of the double round, drained by emit_body.
 my @loop_body = split /\n/, $chacha_body;

 # Move the next $n lines from the front of @loop_body to the end of $code,
 # terminating each with a newline.
 sub emit_body {
 my ($n)=@_;
     my $remaining=$n;
     while ($remaining-- > 0) {
         $code.=shift(@loop_body)."\n";
     }
 }

 {
 ################################################################################
 # void poly_hash_ad_internal();
 #
 # Emits a helper routine (no .globl directive is generated for it) that hashes
 # the additional data. As used by the code below, $adp points at the data and
 # $itr2 holds its length in bytes; $r_store must already hold the multiplier
 # read by poly_mul. $acc0..$acc2 are zeroed on entry and hold the running
 # value on return.
 #
 # Three paths are generated:
 #   - length == 13: two overlapping 8-byte loads (offsets 0 and 5, the second
 #     shifted right by 24 bits) pick up exactly 13 bytes, then one multiply;
 #   - otherwise, full 16-byte blocks are absorbed in a loop;
 #   - a final block shorter than 16 bytes is assembled one byte at a time,
 #     from the last byte backwards, into $t0:$t1 and absorbed with the same
 #     "adc 1" into $acc2 as a full block.
 $code.="
 .type poly_hash_ad_internal,\@function,2
 .align 64
 poly_hash_ad_internal:
 .cfi_startproc
     xor $acc0, $acc0
     xor $acc1, $acc1
     xor $acc2, $acc2
     cmp \$13,  $itr2
     jne hash_ad_loop
 poly_fast_tls_ad:
     # Special treatment for the TLS case of 13 bytes
     mov ($adp), $acc0
     mov 5($adp), $acc1
     shr \$24, $acc1
     mov \$1, $acc2\n";
     # Fast path: a single multiply-and-reduce, then return.
     &poly_mul(); $code.="
     ret
 hash_ad_loop:
         # Hash in 16 byte chunk
         cmp \$16, $itr2
         jb hash_ad_tail\n";
         # Absorb the 16 bytes at $adp, then multiply-and-reduce.
         &poly_add("0($adp)");
         &poly_mul(); $code.="
         lea 1*16($adp), $adp
         sub \$16, $itr2
     jmp hash_ad_loop
 hash_ad_tail:
     cmp \$0, $itr2
     je 1f
     # Hash last < 16 byte tail
     xor $t0, $t0
     xor $t1, $t1
     xor $t2, $t2
     add $itr2, $adp
 hash_ad_tail_loop:
         shld \$8, $t0, $t1
         shl \$8, $t0
         movzxb -1($adp), $t2
         xor $t2, $t0
         dec $adp
         dec $itr2
     jne hash_ad_tail_loop

     add $t0, $acc0
     adc $t1, $acc1
     adc \$1, $acc2\n";
     # Multiply-and-reduce for the partial final block.
     &poly_mul(); $code.="
     # Finished AD
 1:
     ret
 .cfi_endproc
 .size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
 }

 {
 ################################################################################
 # void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
 $code.="
 .globl chacha20_poly1305_open
 .type chacha20_poly1305_open,\@function,2
 .align 64
 chacha20_poly1305_open:
 .cfi_startproc
     push %rbp
 .cfi_adjust_cfa_offset 8
     push %rbx
 .cfi_adjust_cfa_offset 8
     push %r12
 .cfi_adjust_cfa_offset 8
     push %r13
 .cfi_adjust_cfa_offset 8
     push %r14
 .cfi_adjust_cfa_offset 8
     push %r15
 .cfi_adjust_cfa_offset 8
     # We write the calculated authenticator back to keyp at the end, so save
     # the pointer on the stack too.
     push $keyp
 .cfi_adjust_cfa_offset 8
     sub \$288 + 32, %rsp
 .cfi_adjust_cfa_offset 288 + 32
 .cfi_offset rbp, -16
 .cfi_offset rbx, -24
 .cfi_offset r12, -32
 .cfi_offset r13, -40
 .cfi_offset r14, -48
 .cfi_offset r15, -56
     lea 32(%rsp), %rbp
     and \$-32, %rbp
     mov %rdx, 8+$len_store
     mov %r8, 0+$len_store
     mov %rdx, $inl\n"; $code.="
     mov OPENSSL_ia32cap_P+8(%rip), %eax
     and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
     xor \$`(1<<5) + (1<<8)`, %eax
     jz  chacha20_poly1305_open_avx2\n" if ($avx>1);
 $code.="
 1:
     cmp \$128, $inl
     jbe open_sse_128
     # For long buffers, prepare the poly key first
     movdqa .chacha20_consts(%rip), $A0
     movdqu 0*16($keyp), $B0
     movdqu 1*16($keyp), $C0
     movdqu 2*16($keyp), $D0
     movdqa $D0, $T1
     # Store on stack, to free keyp
     movdqa $B0, $state1_store
     movdqa $C0, $state2_store
     movdqa $D0, $ctr0_store
     mov \$10, $acc0
 1:  \n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         dec $acc0
     jne 1b
     # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
     paddd .chacha20_consts(%rip), $A0
     paddd $state1_store, $B0
     # Clamp and store the key
     pand .clamp(%rip), $A0
     movdqa $A0, $r_store
     movdqa $B0, $s_store
     # Hash
     mov %r8, $itr2
     call poly_hash_ad_internal
 open_sse_main_loop:
         cmp \$16*16, $inl
         jb 2f
         # Load state, increment counter blocks\n";
         &prep_state(4); $code.="
         # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
         # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
         mov \$4, $itr1
         mov $inp, $itr2
 1:  \n";
             &emit_body(20);
             &poly_add("0($itr2)"); $code.="
             lea 2*8($itr2), $itr2\n";
             &emit_body(20);
             &poly_stage1();
             &emit_body(20);
             &poly_stage2();
             &emit_body(20);
             &poly_stage3();
             &emit_body(20);
             &poly_reduce_stage();
             foreach $l (@loop_body) {$code.=$l."\n";}
             @loop_body = split /\n/, $chacha_body; $code.="
             dec $itr1
         jge 1b\n";
             &poly_add("0($itr2)");
             &poly_mul(); $code.="
             lea 2*8($itr2), $itr2
             cmp \$-6, $itr1
         jg 1b\n";
         &finalize_state(4);
         &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
         &xor_stream($A2, $B2, $C2, $D2, "4*16");
         &xor_stream($A1, $B1, $C1, $D1, "8*16");
         &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
         lea 16*16($inp), $inp
         lea 16*16($oup), $oup
         sub \$16*16, $inl
     jmp open_sse_main_loop
 2:
     # Handle the various tail sizes efficiently
     test $inl, $inl
     jz open_sse_finalize
     cmp \$4*16, $inl
     ja 3f\n";
 ###############################################################################
     # At most 64 bytes are left
     &prep_state(1); $code.="
     xor $itr2, $itr2
     mov $inl, $itr1
     cmp \$16, $itr1
     jb 2f
 1:  \n";
         &poly_add("0($inp, $itr2)");
         &poly_mul(); $code.="
         sub \$16, $itr1
 2:
         add \$16, $itr2\n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         cmp \$16, $itr1
     jae 1b
         cmp \$10*16, $itr2
     jne 2b\n";
     &finalize_state(1); $code.="
     jmp open_sse_tail_64_dec_loop
 3:
     cmp \$8*16, $inl
     ja 3f\n";
 ###############################################################################
     # 65 - 128 bytes are left
     &prep_state(2); $code.="
     mov $inl, $itr1
     and \$-16, $itr1
     xor $itr2, $itr2
 1:  \n";
         &poly_add("0($inp, $itr2)");
         &poly_mul(); $code.="
 2:
         add \$16, $itr2\n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
         cmp $itr1, $itr2
     jb 1b
         cmp \$10*16, $itr2
     jne 2b\n";
     &finalize_state(2);
     &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
     sub \$4*16, $inl
     lea 4*16($inp), $inp
     lea 4*16($oup), $oup
     jmp open_sse_tail_64_dec_loop
 3:
     cmp \$12*16, $inl
     ja 3f\n";
 ###############################################################################
     # 129 - 192 bytes are left
     &prep_state(3); $code.="
     mov $inl, $itr1
     mov \$10*16, $itr2
     cmp \$10*16, $itr1
     cmovg $itr2, $itr1
     and \$-16, $itr1
     xor $itr2, $itr2
 1:  \n";
         &poly_add("0($inp, $itr2)");
         &poly_mul(); $code.="
 2:
         add \$16, $itr2\n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         cmp $itr1, $itr2
     jb 1b
         cmp \$10*16, $itr2
     jne 2b
     cmp \$11*16, $inl
     jb 1f\n";
     &poly_add("10*16($inp)");
     &poly_mul(); $code.="
     cmp \$12*16, $inl
     jb 1f\n";
     &poly_add("11*16($inp)");
     &poly_mul(); $code.="
 1:  \n";
     &finalize_state(3);
     &xor_stream($A2, $B2, $C2, $D2, "0*16");
     &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
     sub \$8*16, $inl
     lea 8*16($inp), $inp
     lea 8*16($oup), $oup
     jmp open_sse_tail_64_dec_loop
 3:
 ###############################################################################\n";
     # 193 - 255 bytes are left
     &prep_state(4); $code.="
     xor $itr2, $itr2
 1:  \n";
         &poly_add("0($inp, $itr2)");
         &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
         &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
         &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
         &poly_stage1();
         &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
         &poly_stage2();
         &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
         &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
         &poly_stage3();
         &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
         &poly_reduce_stage();
         &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
         add \$16, $itr2
         cmp \$10*16, $itr2
     jb 1b
     mov $inl, $itr1
     and \$-16, $itr1
 1:  \n";
         &poly_add("0($inp, $itr2)");
         &poly_mul(); $code.="
         add \$16, $itr2
         cmp $itr1, $itr2
     jb 1b\n";
     &finalize_state(4);
     &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
     &xor_stream($A2, $B2, $C2, $D2, "4*16");
     &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
     movdqa $tmp_store, $D0
     sub \$12*16, $inl
     lea 12*16($inp), $inp
     lea 12*16($oup), $oup
 ###############################################################################
     # Decrypt the remaining data, 16B at a time, using existing stream
 open_sse_tail_64_dec_loop:
     cmp \$16, $inl
     jb 1f
         sub \$16, $inl
         movdqu ($inp), $T0
         pxor $T0, $A0
         movdqu $A0, ($oup)
         lea 16($inp), $inp
         lea 16($oup), $oup
         movdqa $B0, $A0
         movdqa $C0, $B0
         movdqa $D0, $C0
     jmp open_sse_tail_64_dec_loop
 1:
     movdqa $A0, $A1

     # Decrypt up to 16 bytes at the end.
 open_sse_tail_16:
     test $inl, $inl
     jz open_sse_finalize

     # Read the final bytes into $T0. They need to be read in reverse order so
     # that they end up in the correct order in $T0.
     pxor $T0, $T0
     lea -1($inp, $inl), $inp
     movq $inl, $itr2
 2:
         pslldq \$1, $T0
         pinsrb \$0, ($inp), $T0
         sub \$1, $inp
         sub \$1, $itr2
         jnz 2b

 3:
     movq $T0, $t0
     pextrq \$1, $T0, $t1
     # The final bytes of keystream are in $A1.
     pxor $A1, $T0

     # Copy the plaintext bytes out.
 2:
         pextrb \$0, $T0, ($oup)
         psrldq \$1, $T0
         add \$1, $oup
         sub \$1, $inl
     jne 2b

     add $t0, $acc0
     adc $t1, $acc1
     adc \$1, $acc2\n";
     &poly_mul(); $code.="

 open_sse_finalize:\n";
     &poly_add($len_store);
     &poly_mul(); $code.="
     # Final reduce
     mov $acc0, $t0
     mov $acc1, $t1
     mov $acc2, $t2
     sub \$-5, $acc0
     sbb \$-1, $acc1
     sbb \$3, $acc2
     cmovc $t0, $acc0
     cmovc $t1, $acc1
     cmovc $t2, $acc2
     # Add in s part of the key
     add 0+$s_store, $acc0
     adc 8+$s_store, $acc1

     add \$288 + 32, %rsp
 .cfi_adjust_cfa_offset -(288 + 32)
     pop $keyp
 .cfi_adjust_cfa_offset -8
     movq $acc0, ($keyp)
     movq $acc1, 8($keyp)

     pop %r15
 .cfi_adjust_cfa_offset -8
     pop %r14
 .cfi_adjust_cfa_offset -8
     pop %r13
 .cfi_adjust_cfa_offset -8
     pop %r12
 .cfi_adjust_cfa_offset -8
     pop %rbx
 .cfi_adjust_cfa_offset -8
     pop %rbp
 .cfi_adjust_cfa_offset -8
     ret
 .cfi_adjust_cfa_offset (8 * 6) + 288 + 32
 ###############################################################################
 open_sse_128:
     movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
     movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
     movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
     movdqu 2*16($keyp), $D0
     movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
     movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
     movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
     mov \$10, $acc0
 1:  \n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
     dec $acc0
     jnz 1b
     paddd .chacha20_consts(%rip), $A0
     paddd .chacha20_consts(%rip), $A1
     paddd .chacha20_consts(%rip), $A2
     paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
     paddd $T2, $C1\npaddd $T2, $C2
     paddd $T3, $D1
     paddd .sse_inc(%rip), $T3
     paddd $T3, $D2
     # Clamp and store the key
     pand .clamp(%rip), $A0
     movdqa $A0, $r_store
     movdqa $B0, $s_store
     # Hash
     mov %r8, $itr2
     call poly_hash_ad_internal
 1:
         cmp \$16, $inl
         jb open_sse_tail_16
         sub \$16, $inl\n";
         # Load for hashing
         &poly_add("0*8($inp)"); $code.="
         # Load for decryption
         movdqu 0*16($inp), $T0
         pxor $T0, $A1
         movdqu $A1, 0*16($oup)
         lea 1*16($inp), $inp
         lea 1*16($oup), $oup\n";
         &poly_mul(); $code.="
         # Shift the stream left
         movdqa $B1, $A1
         movdqa $C1, $B1
         movdqa $D1, $C1
         movdqa $A2, $D1
         movdqa $B2, $A2
         movdqa $C2, $B2
         movdqa $D2, $C2
     jmp 1b
     jmp open_sse_tail_16
 .size chacha20_poly1305_open, .-chacha20_poly1305_open
 .cfi_endproc

 ################################################################################
 ################################################################################
 # void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
 .globl  chacha20_poly1305_seal
 .type chacha20_poly1305_seal,\@function,2
 .align 64
 chacha20_poly1305_seal:
 .cfi_startproc
     push %rbp
 .cfi_adjust_cfa_offset 8
     push %rbx
 .cfi_adjust_cfa_offset 8
     push %r12
 .cfi_adjust_cfa_offset 8
     push %r13
 .cfi_adjust_cfa_offset 8
     push %r14
 .cfi_adjust_cfa_offset 8
     push %r15
 .cfi_adjust_cfa_offset 8
     # We write the calculated authenticator back to keyp at the end, so save
     # the pointer on the stack too.
     push $keyp
 .cfi_adjust_cfa_offset 8
     sub \$288 + 32, %rsp
 .cfi_adjust_cfa_offset 288 + 32
 .cfi_offset rbp, -16
 .cfi_offset rbx, -24
 .cfi_offset r12, -32
 .cfi_offset r13, -40
 .cfi_offset r14, -48
 .cfi_offset r15, -56
     lea 32(%rsp), %rbp
     and \$-32, %rbp
     mov 56($keyp), $inl  # extra_in_len
     addq %rdx, $inl
     mov $inl, 8+$len_store
     mov %r8, 0+$len_store
     mov %rdx, $inl\n"; $code.="
     mov OPENSSL_ia32cap_P+8(%rip), %eax
     and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
     xor \$`(1<<5) + (1<<8)`, %eax
     jz  chacha20_poly1305_seal_avx2\n" if ($avx>1);
 $code.="
     cmp \$128, $inl
     jbe seal_sse_128
     # For longer buffers, prepare the poly key + some stream
     movdqa .chacha20_consts(%rip), $A0
     movdqu 0*16($keyp), $B0
     movdqu 1*16($keyp), $C0
     movdqu 2*16($keyp), $D0
     movdqa $A0, $A1
     movdqa $A0, $A2
     movdqa $A0, $A3
     movdqa $B0, $B1
     movdqa $B0, $B2
     movdqa $B0, $B3
     movdqa $C0, $C1
     movdqa $C0, $C2
     movdqa $C0, $C3
     movdqa $D0, $D3
     paddd .sse_inc(%rip), $D0
     movdqa $D0, $D2
     paddd .sse_inc(%rip), $D0
     movdqa $D0, $D1
     paddd .sse_inc(%rip), $D0
     # Store on stack
     movdqa $B0, $state1_store
     movdqa $C0, $state2_store
     movdqa $D0, $ctr0_store
     movdqa $D1, $ctr1_store
     movdqa $D2, $ctr2_store
     movdqa $D3, $ctr3_store
     mov \$10, $acc0
 1:  \n";
         foreach $l (@loop_body) {$code.=$l."\n";}
         @loop_body = split /\n/, $chacha_body; $code.="
         dec $acc0
     jnz 1b\n";
     &finalize_state(4); $code.="
     # Clamp and store the key
     pand .clamp(%rip), $A3
     movdqa $A3, $r_store
     movdqa $B3, $s_store
     # Hash
     mov %r8, $itr2
     call poly_hash_ad_internal\n";
     &xor_stream($A2,$B2,$C2,$D2,"0*16");
     &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
     cmp \$12*16, $inl
     ja 1f
     mov \$8*16, $itr1
     sub \$8*16, $inl
     lea 8*16($inp), $inp
     jmp seal_sse_128_seal_hash
 1:  \n";
     &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
     mov \$12*16, $itr1
     sub \$12*16, $inl
     lea 12*16($inp), $inp
     mov \$2, $itr1
     mov \$8, $itr2
     cmp \$4*16, $inl
     jbe seal_sse_tail_64
     cmp \$8*16, $inl
     jbe seal_sse_tail_128
     cmp \$12*16, $inl
     jbe seal_sse_tail_192

 1:  \n";
     # The main loop
         &prep_state(4); $code.="
 2:  \n";
             &emit_body(20);
             &poly_add("0($oup)");
             &emit_body(20);
             &poly_stage1();
             &emit_body(20);
             &poly_stage2();
             &emit_body(20);
             &poly_stage3();
             &emit_body(20);
             &poly_reduce_stage();
             foreach $l (@loop_body) {$code.=$l."\n";}
             @loop_body = split /\n/, $chacha_body; $code.="
             lea 16($oup), $oup
             dec $itr2
         jge 2b\n";
             &poly_add("0*8($oup)");
             &poly_mul(); $code.="
             lea 16($oup), $oup
             dec $itr1
         jg 2b\n";

         &finalize_state(4);$code.="
         movdqa $D2, $tmp_store\n";
         &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
         movdqa $tmp_store, $D2\n";
         &xor_stream($A2,$B2,$C2,$D2, 4*16);
         &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
         cmp \$16*16, $inl
         ja 3f

         mov \$12*16, $itr1
         sub \$12*16, $inl
         lea 12*16($inp), $inp
         jmp seal_sse_128_seal_hash
 3:  \n";
         &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
         lea 16*16($inp), $inp
         sub \$16*16, $inl
         mov \$6, $itr1
         mov \$4, $itr2
         cmp \$12*16, $inl
     jg 1b
     mov $inl, $itr1
     test $inl, $inl
     je seal_sse_128_seal_hash
     mov \$6, $itr1
     cmp \$4*16, $inl
     jg 3f
 ###############################################################################
 seal_sse_tail_64:\n";
     &prep_state(1); $code.="
 1:  \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 16($oup), $oup
 2:  \n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 16($oup), $oup
     dec $itr1
     jg 1b
     dec $itr2
     jge 2b\n";
     &finalize_state(1); $code.="
     jmp seal_sse_128_seal
 3:
     cmp \$8*16, $inl
     jg 3f
 ###############################################################################
 seal_sse_tail_128:\n";
     &prep_state(2); $code.="
 1:  \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 16($oup), $oup
 2:  \n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &poly_add("0($oup)");
         &poly_mul();
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
         lea 16($oup), $oup
     dec $itr1
     jg 1b
     dec $itr2
     jge 2b\n";
     &finalize_state(2);
     &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
     mov \$4*16, $itr1
     sub \$4*16, $inl
     lea 4*16($inp), $inp
     jmp seal_sse_128_seal_hash
 3:
 ###############################################################################
 seal_sse_tail_192:\n";
     &prep_state(3); $code.="
 1:  \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 16($oup), $oup
 2:  \n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
         &poly_add("0($oup)");
         &poly_mul();
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         lea 16($oup), $oup
     dec $itr1
     jg 1b
     dec $itr2
     jge 2b\n";
     &finalize_state(3);
     &xor_stream($A2,$B2,$C2,$D2,0*16);
     &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
     mov \$8*16, $itr1
     sub \$8*16, $inl
     lea 8*16($inp), $inp
 ###############################################################################
 seal_sse_128_seal_hash:
         cmp \$16, $itr1
         jb seal_sse_128_seal\n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         sub \$16, $itr1
         lea 16($oup), $oup
     jmp seal_sse_128_seal_hash

 seal_sse_128_seal:
         cmp \$16, $inl
         jb seal_sse_tail_16
         sub \$16, $inl
         # Load for decryption
         movdqu 0*16($inp), $T0
         pxor $T0, $A0
         movdqu $A0, 0*16($oup)
         # Then hash
         add 0*8($oup), $acc0
         adc 1*8($oup), $acc1
         adc \$1, $acc2
         lea 1*16($inp), $inp
         lea 1*16($oup), $oup\n";
         &poly_mul(); $code.="
         # Shift the stream left
         movdqa $B0, $A0
         movdqa $C0, $B0
         movdqa $D0, $C0
         movdqa $A1, $D0
         movdqa $B1, $A1
         movdqa $C1, $B1
         movdqa $D1, $C1
     jmp seal_sse_128_seal

 seal_sse_tail_16:
     test $inl, $inl
     jz process_blocks_of_extra_in
     # We can only load the PT one byte at a time to avoid buffer overread
     mov $inl, $itr2
     mov $inl, $itr1
     lea -1($inp, $inl), $inp
     pxor $T3, $T3
 1:
         pslldq \$1, $T3
         pinsrb \$0, ($inp), $T3
         lea -1($inp), $inp
         dec $itr1
         jne 1b

     # XOR the keystream with the plaintext.
     pxor $A0, $T3

     # Write ciphertext out, byte-by-byte.
     movq $inl, $itr1
     movdqu $T3, $A0
 2:
         pextrb \$0, $A0, ($oup)
         psrldq \$1, $A0
         add \$1, $oup
         sub \$1, $itr1
         jnz 2b

     # $T3 contains the final (partial, non-empty) block of ciphertext which
     # needs to be fed into the Poly1305 state. The right-most $inl bytes of it
     # are valid. We need to fill it with extra_in bytes until full, or until we
     # run out of bytes.
     #
     # $keyp points to the tag output, which is actually a struct with the
     # extra_in pointer and length at offset 48.
     movq 288+32(%rsp), $keyp
     movq 56($keyp), $t1  # extra_in_len
     movq 48($keyp), $t0  # extra_in
     test $t1, $t1
     jz process_partial_block  # Common case: no bytes of extra_in

     movq \$16, $t2
     subq $inl, $t2  # 16-$inl is the number of bytes that fit into $T3.
     cmpq $t2, $t1   # if extra_in_len < 16-$inl, only copy extra_in_len
                     # (note that AT&T syntax reverses the arguments)
     jge load_extra_in
     movq $t1, $t2

 load_extra_in:
     # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
     # into $T3. They are loaded in reverse order.
     leaq -1($t0, $t2), $inp
     # Update extra_in and extra_in_len to reflect the bytes that are about to
     # be read.
     addq $t2, $t0
     subq $t2, $t1
     movq $t0, 48($keyp)
     movq $t1, 56($keyp)

     # Update $itr2, which is used to select the mask later on, to reflect the
     # extra bytes about to be added.
     addq $t2, $itr2

     # Load $t2 bytes of extra_in into $T2.
     pxor $T2, $T2
 3:
         pslldq \$1, $T2
         pinsrb \$0, ($inp), $T2
         lea -1($inp), $inp
         sub \$1, $t2
         jnz 3b

     # Shift $T2 up the length of the remainder from the main encryption. Sadly,
     # the shift for an XMM register has to be a constant, thus we loop to do
     # this.
     movq $inl, $t2

 4:
         pslldq \$1, $T2
         sub \$1, $t2
         jnz 4b

     # Mask $T3 (the remainder from the main encryption) so that superfluous
     # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
     # disjoint and so we can merge them with an OR.
     lea .and_masks(%rip), $t2
     shl \$4, $inl
     pand -16($t2, $inl), $T3

     # Merge $T2 into $T3, forming the remainder block.
     por $T2, $T3

     # The block of ciphertext + extra_in is ready to be included in the
     # Poly1305 state.
     movq $T3, $t0
     pextrq \$1, $T3, $t1
     add $t0, $acc0
     adc $t1, $acc1
     adc \$1, $acc2\n";
     &poly_mul(); $code.="

 process_blocks_of_extra_in:
     # There may be additional bytes of extra_in to process.
     movq 288+32(%rsp), $keyp
     movq 48($keyp), $inp   # extra_in
     movq 56($keyp), $itr2  # extra_in_len
     movq $itr2, $itr1
     shr \$4, $itr2         # number of blocks

 5:
         jz process_extra_in_trailer\n";
         &poly_add("0($inp)");
         &poly_mul(); $code.="
         leaq 16($inp), $inp
         subq \$1, $itr2
         jmp 5b

 process_extra_in_trailer:
     andq \$15, $itr1       # remaining num bytes (<16) of extra_in
     movq $itr1, $inl
     jz do_length_block
     leaq -1($inp, $itr1), $inp

 6:
         pslldq \$1, $T3
         pinsrb \$0, ($inp), $T3
         lea -1($inp), $inp
         sub \$1, $itr1
         jnz 6b

 process_partial_block:
     # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
     lea .and_masks(%rip), $t2
     shl \$4, $inl
     pand -16($t2, $inl), $T3
     movq $T3, $t0
     pextrq \$1, $T3, $t1
     add $t0, $acc0
     adc $t1, $acc1
     adc \$1, $acc2\n";
     &poly_mul(); $code.="

 do_length_block:\n";
     &poly_add($len_store);
     &poly_mul(); $code.="
     # Final reduce
     mov $acc0, $t0
     mov $acc1, $t1
     mov $acc2, $t2
     sub \$-5, $acc0
     sbb \$-1, $acc1
     sbb \$3, $acc2
     cmovc $t0, $acc0
     cmovc $t1, $acc1
     cmovc $t2, $acc2
     # Add in s part of the key
     add 0+$s_store, $acc0
     adc 8+$s_store, $acc1

     add \$288 + 32, %rsp
 .cfi_adjust_cfa_offset -(288 + 32)
     pop $keyp
 .cfi_adjust_cfa_offset -8
     mov $acc0, 0*8($keyp)
     mov $acc1, 1*8($keyp)

     pop %r15
 .cfi_adjust_cfa_offset -8
     pop %r14
 .cfi_adjust_cfa_offset -8
     pop %r13
 .cfi_adjust_cfa_offset -8
     pop %r12
 .cfi_adjust_cfa_offset -8
     pop %rbx
 .cfi_adjust_cfa_offset -8
     pop %rbp
 .cfi_adjust_cfa_offset -8
     ret
 .cfi_adjust_cfa_offset (8 * 6) + 288 + 32
 ################################################################################
 seal_sse_128:
     movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
     movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
     movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
     movdqu 2*16($keyp), $D2
     movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
     movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
     movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
     mov \$10, $acc0
 1:\n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         dec $acc0
     jnz 1b
     paddd .chacha20_consts(%rip), $A0
     paddd .chacha20_consts(%rip), $A1
     paddd .chacha20_consts(%rip), $A2
     paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
     paddd $T2, $C0\npaddd $T2, $C1
     paddd $T3, $D0
     paddd .sse_inc(%rip), $T3
     paddd $T3, $D1
     # Clamp and store the key
     pand .clamp(%rip), $A2
     movdqa $A2, $r_store
     movdqa $B2, $s_store
     # Hash
     mov %r8, $itr2
     call poly_hash_ad_internal
     jmp seal_sse_128_seal
 .size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
 }

 # There should have been a cfi_endproc at the end of that function, but the two
 # following blocks of code are jumped to without a stack frame and the CFI
 # context which they are used in happens to match the CFI context at the end of
 # the previous function. So the CFI table is just extended to the end of them.

 if ($avx>1) {

 # AVX2 register allocation: the names A0-A3, B0-B3, C0-C3 and D0-D3 are bound,
 # in that order, to %ymm0 .. %ymm15.
 ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
 # Matching %xmm names (the low 128 bits of the same sixteen registers).
 my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
 # The scratch registers T0-T3 are aliases of A3/B3/C3/D3, so they are only free
 # while the fourth state is not in use.
 ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
 # 32-byte stack slots, addressed relative to %rbp.
 $state1_store="2*32(%rbp)";
 $state2_store="3*32(%rbp)";
 $tmp_store="4*32(%rbp)";
 $ctr0_store="5*32(%rbp)";
 $ctr1_store="6*32(%rbp)";
 $ctr2_store="7*32(%rbp)";
 $ctr3_store="8*32(%rbp)";

 # Append one ChaCha quarter-round, operating on AVX2 registers, to $code.
 #
 # Arguments (positional):
 #   1-4  registers holding the a, b, c and d rows of the state
 #   5    scratch register used by the two shift-based rotates
 #   6    mode string; each keyword is matched as a substring:
 #          "store"  save the scratch register to $tmp_store before the round
 #          "left"   afterwards rotate d/c/b by 12/8/4 bytes with vpalignr
 #          "right"  afterwards rotate d/c/b by 4/8/12 bytes with vpalignr
 #          "load"   reload the scratch register from $tmp_store afterwards
 sub chacha_qr_avx2 {
 my ($va,$vb,$vc,$vd,$scratch,$mode)=@_;

 # Optionally park the scratch register so the caller's value survives.
 if ($mode =~ /store/) {
     $code.=<<___;
     vmovdqa $scratch, $tmp_store
___
 }

 # The round proper: rotations by 16 and 8 use byte shuffles (.rol16/.rol8),
 # rotations by 12 and 7 use a shift pair merged through the scratch register.
 $code.=<<___;
     vpaddd $vb, $va, $va
     vpxor $va, $vd, $vd
     vpshufb .rol16(%rip), $vd, $vd
     vpaddd $vd, $vc, $vc
     vpxor $vc, $vb, $vb
     vpsrld \$20, $vb, $scratch
     vpslld \$12, $vb, $vb
     vpxor $scratch, $vb, $vb
     vpaddd $vb, $va, $va
     vpxor $va, $vd, $vd
     vpshufb .rol8(%rip), $vd, $vd
     vpaddd $vd, $vc, $vc
     vpxor $vc, $vb, $vb
     vpslld \$7, $vb, $scratch
     vpsrld \$25, $vb, $vb
     vpxor $scratch, $vb, $vb
___

 # Re-align the rows for the next half of the double round.
 if ($mode =~ /left/) {
     $code.=<<___;
     vpalignr \$12, $vd, $vd, $vd
     vpalignr \$8, $vc, $vc, $vc
     vpalignr \$4, $vb, $vb, $vb
___
 }
 if ($mode =~ /right/) {
     $code.=<<___;
     vpalignr \$4, $vd, $vd, $vd
     vpalignr \$8, $vc, $vc, $vc
     vpalignr \$12, $vb, $vb, $vb
___
 }

 # Optionally restore the scratch register parked by "store".
 if ($mode =~ /load/) {
     $code.=<<___;
     vmovdqa $tmp_store, $scratch
___
 }
 }

 # Append code to $code that sets up $n (1..4) fresh ChaCha states in the AVX2
 # registers.
 #
 # State 0 rows A/B/C are loaded from .chacha20_consts, $state1_store and
 # $state2_store; states 1..$n-1 are register copies of state 0. The D rows are
 # derived from $ctr0_store by repeatedly adding .avx2_inc: the highest-numbered
 # state gets ctr0+inc and state 0 gets ctr0+$n*inc. Every new D row is written
 # back to its $ctrN_store slot (which is what finalize_state_avx2 adds back).
 #
 # $n is a count, so it is compared numerically. The original used the string
 # operators ge/eq, which only agree with numeric comparison for single-digit
 # integer spellings.
 sub prep_state_avx2 {
 my ($n)=@_;
 $code.=<<___;
     vmovdqa .chacha20_consts(%rip), $A0
     vmovdqa $state1_store, $B0
     vmovdqa $state2_store, $C0
___
 $code.=<<___ if ($n >= 2);
     vmovdqa $A0, $A1
     vmovdqa $B0, $B1
     vmovdqa $C0, $C1
___
 $code.=<<___ if ($n >= 3);
     vmovdqa $A0, $A2
     vmovdqa $B0, $B2
     vmovdqa $C0, $C2
___
 $code.=<<___ if ($n >= 4);
     vmovdqa $A0, $A3
     vmovdqa $B0, $B3
     vmovdqa $C0, $C3
___
 # Exactly one of the counter set-ups below is emitted, selected by $n.
 $code.=<<___ if ($n == 1);
     vmovdqa .avx2_inc(%rip), $D0
     vpaddd $ctr0_store, $D0, $D0
     vmovdqa $D0, $ctr0_store
___
 $code.=<<___ if ($n == 2);
     vmovdqa .avx2_inc(%rip), $D0
     vpaddd $ctr0_store, $D0, $D1
     vpaddd $D1, $D0, $D0
     vmovdqa $D0, $ctr0_store
     vmovdqa $D1, $ctr1_store
___
 $code.=<<___ if ($n == 3);
     vmovdqa .avx2_inc(%rip), $D0
     vpaddd $ctr0_store, $D0, $D2
     vpaddd $D2, $D0, $D1
     vpaddd $D1, $D0, $D0
     vmovdqa $D0, $ctr0_store
     vmovdqa $D1, $ctr1_store
     vmovdqa $D2, $ctr2_store
___
 $code.=<<___ if ($n == 4);
     vmovdqa .avx2_inc(%rip), $D0
     vpaddd $ctr0_store, $D0, $D3
     vpaddd $D3, $D0, $D2
     vpaddd $D2, $D0, $D1
     vpaddd $D1, $D0, $D0
     vmovdqa $D3, $ctr3_store
     vmovdqa $D2, $ctr2_store
     vmovdqa $D1, $ctr1_store
     vmovdqa $D0, $ctr0_store
___
 }

 # Append code to $code that finishes $n (1..4) ChaCha states by adding the
 # initial state back in: .chacha20_consts to the A rows, $state1_store and
 # $state2_store to the B and C rows, and the per-state $ctrN_store slot to the
 # D rows. States are emitted from the highest-numbered one down to state 0.
 #
 # $n is a count, so it is compared numerically. The original used the string
 # operators eq/ge, which only agree with numeric comparison for single-digit
 # integer spellings.
 sub finalize_state_avx2 {
 my ($n)=@_;
 $code.=<<___ if ($n == 4);
     vpaddd .chacha20_consts(%rip), $A3, $A3
     vpaddd $state1_store, $B3, $B3
     vpaddd $state2_store, $C3, $C3
     vpaddd $ctr3_store, $D3, $D3
___
 $code.=<<___ if ($n >= 3);
     vpaddd .chacha20_consts(%rip), $A2, $A2
     vpaddd $state1_store, $B2, $B2
     vpaddd $state2_store, $C2, $C2
     vpaddd $ctr2_store, $D2, $D2
___
 $code.=<<___ if ($n >= 2);
     vpaddd .chacha20_consts(%rip), $A1, $A1
     vpaddd $state1_store, $B1, $B1
     vpaddd $state2_store, $C1, $C1
     vpaddd $ctr1_store, $D1, $D1
___
 # State 0 is always finalized.
 $code.=<<___;
     vpaddd .chacha20_consts(%rip), $A0, $A0
     vpaddd $state1_store, $B0, $B0
     vpaddd $state2_store, $C0, $C0
     vpaddd $ctr0_store, $D0, $D0
___
 }

 # Append code to $code that XORs 128 bytes of key stream with the input at
 # $inp and stores the result at $oup.
 #
 # Arguments (positional):
 #   1-4  registers holding the A, B, C and D rows of one finalized state
 #   5    byte offset (assembler expression) added to both $inp and $oup
 #   6    helper register that receives the first 32 output bytes
 #
 # The four vperm2i128 instructions regroup the 128-bit halves of the rows;
 # the results land in helper, A, B, C (in output order), so the A, B and C
 # registers are overwritten.
 sub xor_stream_avx2 {
 my ($row_a, $row_b, $row_c, $row_d, $disp, $first)=@_;
 $code.=<<___;
     vperm2i128 \$0x02, $row_a, $row_b, $first
     vperm2i128 \$0x13, $row_a, $row_b, $row_b
     vperm2i128 \$0x02, $row_c, $row_d, $row_a
     vperm2i128 \$0x13, $row_c, $row_d, $row_c
     vpxor 0*32+$disp($inp), $first, $first
     vpxor 1*32+$disp($inp), $row_a, $row_a
     vpxor 2*32+$disp($inp), $row_b, $row_b
     vpxor 3*32+$disp($inp), $row_c, $row_c
     vmovdqu $first, 0*32+$disp($oup)
     vmovdqu $row_a, 1*32+$disp($oup)
     vmovdqu $row_b, 2*32+$disp($oup)
     vmovdqu $row_c, 3*32+$disp($oup)
___
 }

 # Append code to $code that regroups the 128-bit halves of one finalized state
 # (rows given as arguments 1-4) so that the registers A, B, C, D hold the key
 # stream in output order. Argument 5 is a helper register: it temporarily
 # holds the third 32-byte chunk, which is then copied into C.
 sub finish_stream_avx2 {
 my ($row_a, $row_b, $row_c, $row_d, $spare)=@_;
 $code.=<<___;
     vperm2i128 \$0x13, $row_a, $row_b, $spare
     vperm2i128 \$0x02, $row_a, $row_b, $row_a
     vperm2i128 \$0x02, $row_c, $row_d, $row_b
     vperm2i128 \$0x13, $row_c, $row_d, $row_d
     vmovdqa $spare, $row_c
___
 }

 # Append the first stage of the MULX-based Poly1305 multiply to $code.
 #
 # Loads the quadword at 0+$r_store into %rdx (the implicit MULX operand) and
 # keeps a copy in $t2. It then forms:
 #   $t1:$t0    = that quadword * $acc0
 #   %rdx:%rax  = that quadword * $acc1
 #   $t2        = that quadword * $acc2 (imulq, low 64 bits only)
 # and finally adds %rdx:%rax into $t2:$t1 with carry.
 # Takes no arguments; clobbers %rax and %rdx.
 sub poly_stage1_mulx {
 $code.=<<___;
     mov 0+$r_store, %rdx
     mov %rdx, $t2
     mulx $acc0, $t0, $t1
     mulx $acc1, %rax, %rdx
     imulq $acc2, $t2
     add %rax, $t1
     adc %rdx, $t2
___
 }

 # Append the second stage of the MULX-based Poly1305 multiply to $code.
 #
 # Loads the quadword at 8+$r_store into %rdx and multiplies it by $acc0
 # (low half overwrites $acc0, high half goes to %rax) and by $acc1 (low half
 # overwrites $acc1, high half goes to $t3). The two low halves are added into
 # $t1 and $t2 with the carry propagated into $t3. The final imulq leaves
 # $acc2 * that quadword in %rdx.
 # %rax and %rdx are left live for poly_stage3_mulx, which adds them in.
 sub poly_stage2_mulx {
 $code.=<<___;
     mov 8+$r_store, %rdx
     mulx $acc0, $acc0, %rax
     add $acc0, $t1
     mulx $acc1, $acc1, $t3
     adc $acc1, $t2
     adc \$0, $t3
     imulq $acc2, %rdx
___
 }

 # Append the third stage of the MULX-based Poly1305 multiply to $code: adds
 # %rax into $t2 and, with carry, %rdx into $t3. Both registers are expected to
 # still hold the values left by poly_stage2_mulx.
 sub poly_stage3_mulx {
 $code.=<<___;
     add %rax, $t2
     adc %rdx, $t3
___
 }

 # Append one complete MULX-based Poly1305 multiply to $code by emitting the
 # three multiply stages back to back, followed by poly_reduce_stage (defined
 # elsewhere in this file). Takes no arguments.
 sub poly_mul_mulx {
     poly_stage1_mulx();
     poly_stage2_mulx();
     poly_stage3_mulx();
     poly_reduce_stage();
 }

 # Build and RETURN (not append to $code) the text of one half of a ChaCha
 # quarter-round applied to all four AVX2 states at once.
 #
 # Arguments:
 #   $rot1   right-shift count of the shift-based rotate: 20 (rotate left by
 #           12) or 25 (rotate left by 7)
 #   $rot2   memory operand of the vpshufb rotate mask (.rol16 or .rol8)
 #   $shift  optional; if it contains "left" or "right" the B/C/D rows of all
 #           four states are re-aligned with vpalignr (by 4/8/12 or 12/8/4
 #           bytes) after the half-round
 #
 # All sixteen ymm registers hold state, so C0 doubles as the scratch register
 # and is spilled to $tmp_store around each use. With $rot1 == 20 the sub emits
 # the initial spill itself; with $rot1 == 25 it relies on the spill left
 # behind by the preceding $rot1 == 20 half.
 #
 # The returned text is later split on newlines and handed out by line count,
 # so the number of lines produced here must not change: 36 for (20, mask),
 # 48 for (25, mask, "left"/"right").
 #
 # Changes from the original: $rot1 is compared numerically (== instead of
 # eq), $s1..$s3 are lexical instead of leaking as package globals, and an
 # omitted $shift is normalized to "" before pattern matching.
 sub gen_chacha_round_avx2 {
 my ($rot1, $rot2, $shift)=@_;
 $shift="" unless defined $shift;
 my ($s1,$s2,$s3);
 my $round="";
 $round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 == 20);
 $round=$round ."vmovdqa $rot2, $C0
                 vpaddd $B3, $A3, $A3
                 vpaddd $B2, $A2, $A2
                 vpaddd $B1, $A1, $A1
                 vpaddd $B0, $A0, $A0
                 vpxor $A3, $D3, $D3
                 vpxor $A2, $D2, $D2
                 vpxor $A1, $D1, $D1
                 vpxor $A0, $D0, $D0
                 vpshufb $C0, $D3, $D3
                 vpshufb $C0, $D2, $D2
                 vpshufb $C0, $D1, $D1
                 vpshufb $C0, $D0, $D0
                 vmovdqa $tmp_store, $C0
                 vpaddd $D3, $C3, $C3
                 vpaddd $D2, $C2, $C2
                 vpaddd $D1, $C1, $C1
                 vpaddd $D0, $C0, $C0
                 vpxor $C3, $B3, $B3
                 vpxor $C2, $B2, $B2
                 vpxor $C1, $B1, $B1
                 vpxor $C0, $B0, $B0
                 vmovdqa $C0, $tmp_store
                 vpsrld \$$rot1, $B3, $C0
                 vpslld \$32-$rot1, $B3, $B3
                 vpxor $C0, $B3, $B3
                 vpsrld \$$rot1, $B2, $C0
                 vpslld \$32-$rot1, $B2, $B2
                 vpxor $C0, $B2, $B2
                 vpsrld \$$rot1, $B1, $C0
                 vpslld \$32-$rot1, $B1, $B1
                 vpxor $C0, $B1, $B1
                 vpsrld \$$rot1, $B0, $C0
                 vpslld \$32-$rot1, $B0, $B0
                 vpxor $C0, $B0, $B0\n";
 # Byte counts for the B/C/D re-alignment; only set (and only used) when a
 # direction was requested.
 ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
 ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
 $round=$round ."vmovdqa $tmp_store, $C0
                 vpalignr \$$s1, $B3, $B3, $B3
                 vpalignr \$$s2, $C3, $C3, $C3
                 vpalignr \$$s3, $D3, $D3, $D3
                 vpalignr \$$s1, $B2, $B2, $B2
                 vpalignr \$$s2, $C2, $C2, $C2
                 vpalignr \$$s3, $D2, $D2, $D2
                 vpalignr \$$s1, $B1, $B1, $B1
                 vpalignr \$$s2, $C1, $C1, $C1
                 vpalignr \$$s3, $D1, $D1, $D1
                 vpalignr \$$s1, $B0, $B0, $B0
                 vpalignr \$$s2, $C0, $C0, $C0
                 vpalignr \$$s3, $D0, $D0, $D0\n"
 if (($shift =~ /left/) || ($shift =~ /right/));
 return $round;
 };

 # Text of one full ChaCha double round over four AVX2 states: two half-rounds
 # followed by a "left" re-alignment, then two more followed by a "right" one.
 $chacha_body = join("", map { gen_chacha_round_avx2(@$_) } (
     [20, ".rol16(%rip)"],
     [25, ".rol8(%rip)", "left"],
     [20, ".rol16(%rip)"],
     [25, ".rol8(%rip)", "right"],
 ));

 # The same text as a list of single instructions, so that it can be emitted a
 # few lines at a time between other instructions.
 @loop_body = split(/\n/, $chacha_body);

 # Entry of chacha20_poly1305_open_avx2. The emitted code broadcasts the three
 # 16-byte rows at $keyp into both halves of B0/C0/D0, adds .avx2_init to the
 # counter row, and dispatches on the input length: at most 6*32 bytes go to
 # open_avx2_192, at most 10*32 bytes to open_avx2_320. Longer inputs save the
 # state rows and run ten double rounds on a single state.
 $code.="
 ###############################################################################
 .type chacha20_poly1305_open_avx2,\@function,2
 .align 64
 chacha20_poly1305_open_avx2:
     vzeroupper
     vmovdqa .chacha20_consts(%rip), $A0
     vbroadcasti128 0*16($keyp), $B0
     vbroadcasti128 1*16($keyp), $C0
     vbroadcasti128 2*16($keyp), $D0
     vpaddd .avx2_init(%rip), $D0, $D0
     cmp \$6*32, $inl
     jbe open_avx2_192
     cmp \$10*32, $inl
     jbe open_avx2_320

     vmovdqa $B0, $state1_store
     vmovdqa $C0, $state2_store
     vmovdqa $D0, $ctr0_store
     mov \$10, $acc0
 1:  \n";
         # Loop body: a "left" then a "right" quarter-round; executed 10 times.
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         dec $acc0
     jne 1b
     vpaddd .chacha20_consts(%rip), $A0, $A0
     vpaddd $state1_store, $B0, $B0
     vpaddd $state2_store, $C0, $C0
     vpaddd $ctr0_store, $D0, $D0

     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store key
     vpand .clamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for the first 64 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
     vperm2i128 \$0x13, $C0, $D0, $B0
     # Hash AD + first 64 bytes
     mov %r8, $itr2
     call poly_hash_ad_internal
     xor $itr1, $itr1
     # Hash first 64 bytes
 1:  \n";
        # One 16-byte Poly1305 block per iteration, read at offset $itr1.
        &poly_add("0($inp, $itr1)");
        &poly_mul(); $code.="
        add \$16, $itr1
        cmp \$2*32, $itr1
     jne 1b
     # Decrypt first 64 bytes
     vpxor 0*32($inp), $A0, $A0
     vpxor 1*32($inp), $B0, $B0
     vmovdqu $A0, 0*32($oup)
     vmovdqu $B0, 1*32($oup)
     lea 2*32($inp), $inp
     lea 2*32($oup), $oup
     sub \$2*32, $inl
 1:
         # Hash and decrypt 512 bytes each iteration
         cmp \$16*32, $inl
         jb 3f\n";
         # Body of the 512-byte main loop: four fresh ChaCha states.
         &prep_state_avx2(4); $code.="
         xor $itr1, $itr1
 2:  \n";
             # Each pass of this inner loop hashes three 16-byte blocks (at
             # $itr1, $itr1+16 and $itr1+32) while one double round of the four
             # states is emitted in between. emit_body is defined outside this
             # section; presumably emit_body(N) emits the next N instructions
             # of @loop_body - confirm against its definition.
             &poly_add("0*8($inp, $itr1)");
             &emit_body(10);
             &poly_stage1_mulx();
             &emit_body(9);
             &poly_stage2_mulx();
             &emit_body(12);
             &poly_stage3_mulx();
             &emit_body(10);
             &poly_reduce_stage();
             &emit_body(9);
             &poly_add("2*8($inp, $itr1)");
             &emit_body(8);
             &poly_stage1_mulx();
             &emit_body(18);
             &poly_stage2_mulx();
             &emit_body(18);
             &poly_stage3_mulx();
             &emit_body(9);
             &poly_reduce_stage();
             &emit_body(8);
             &poly_add("4*8($inp, $itr1)"); $code.="
             lea 6*8($itr1), $itr1\n";
             &emit_body(18);
             &poly_stage1_mulx();
             &emit_body(8);
             &poly_stage2_mulx();
             &emit_body(8);
             &poly_stage3_mulx();
             &emit_body(18);
             &poly_reduce_stage();
             # Emit whatever is left in @loop_body, then refill it for the next
             # user. The inner loop ends once $itr1 reaches 10*48 = 480 bytes.
             foreach $l (@loop_body) {$code.=$l."\n";}
             @loop_body = split /\n/, $chacha_body; $code.="
             cmp \$10*6*8, $itr1
         jne 2b\n";
         # Finish the four states. A0 is spilled because it serves as the helper
         # register for the first xor_stream_avx2 call.
         &finalize_state_avx2(4); $code.="
         vmovdqa $A0, $tmp_store\n";
         # The last two 16-byte blocks of this 512-byte chunk (offsets 480 and
         # 496) are hashed while the key stream is XORed in and stored.
         &poly_add("10*6*8($inp)");
         &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
         vmovdqa $tmp_store, $A0\n";
         &poly_mul();
         &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
         &poly_add("10*6*8+2*8($inp)");
         &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
         &poly_mul();
         &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
         lea 16*32($inp), $inp
         lea 16*32($oup), $oup
         sub \$16*32, $inl
     jmp 1b
 3:
     test $inl, $inl
     vzeroupper
     je open_sse_finalize
 3:
     cmp \$4*32, $inl
     ja 3f\n";
 ###############################################################################
     # 1-128 bytes left
     # A single state is enough. $itr1 is the input length rounded down to a
     # multiple of 16, $itr2 advances by 16 per iteration: a block is hashed
     # while $itr2 < $itr1, and the loop stops at $itr2 == 160, i.e. after ten
     # double rounds.
     &prep_state_avx2(1); $code.="
     xor $itr2, $itr2
     mov $inl, $itr1
     and \$-16, $itr1
     test $itr1, $itr1
     je 2f
 1:  \n";
         &poly_add("0*8($inp, $itr2)");
         &poly_mul(); $code.="
 2:
         add \$16, $itr2\n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         cmp $itr1, $itr2
     jb 1b
         cmp \$160, $itr2
     jne 2b\n";
     # Leave the 128 bytes of key stream in A0,B0,C0,D0 for open_avx2_tail_loop.
     &finalize_state_avx2(1);
     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
     jmp open_avx2_tail_loop
 3:
     cmp \$8*32, $inl
     ja 3f\n";
 ###############################################################################
     # 129-256 bytes left
     # Two states. The real length is parked in $tmp_store and $inl is reused
     # as the hashing pointer. $itr1 = min(10, (length - 128) / 16) is the
     # number of double rounds that are preceded by one 16-byte hash block.
     &prep_state_avx2(2); $code.="
     mov $inl, $tmp_store
     mov $inl, $itr1
     sub \$4*32, $itr1
     shr \$4, $itr1
     mov \$10, $itr2
     cmp \$10, $itr1
     cmovg $itr2, $itr1
     mov $inp, $inl
     xor $itr2, $itr2
 1:  \n";
         &poly_add("0*8($inl)");
         &poly_mul_mulx(); $code.="
         lea 16($inl), $inl
 2:  \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
         inc $itr2\n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         # NOTE(review): this path only prepares and finalizes states 0 and 1,
         # so the quarter-round on state 2 below looks like dead work - confirm
         # before removing.
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         cmp $itr1, $itr2
     jb 1b
         cmp \$10, $itr2
     jne 2b
     mov $inl, $itr2
     sub $inp, $inl
     mov $inl, $itr1
     mov $tmp_store, $inl
 1:
         add \$16, $itr1
         cmp $inl, $itr1
         jg 1f\n";
         # Hash the remaining whole 16-byte blocks that the round loop did not
         # get to; $itr2 is the hashing pointer, $itr1 the byte count so far.
         &poly_add("0*8($itr2)");
         &poly_mul_mulx(); $code.="
         lea 16($itr2), $itr2
     jmp 1b
 1:  \n";
     # State 1 decrypts the first 128 bytes here; state 0 is left in
     # A0,B0,C0,D0 for open_avx2_tail_loop.
     &finalize_state_avx2(2);
     &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
     &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
     lea 4*32($inp), $inp
     lea 4*32($oup), $oup
     sub \$4*32, $inl
     jmp open_avx2_tail_loop
 3:
     cmp \$12*32, $inl
     ja 3f\n";
 ###############################################################################
     # 257-384 bytes left (the dispatch above only skips this path when more
     # than 12*32 bytes remain)
     # Three states. As in the previous case the real length is parked in
     # $tmp_store and $inl is reused as the hashing pointer. Every double round
     # hashes one 16-byte block; the first $itr1 = min(10, (length - 256) / 16
     # + 6) of them hash a second block beforehand.
     &prep_state_avx2(3); $code.="
     mov $inl, $tmp_store
     mov $inl, $itr1
     sub \$8*32, $itr1
     shr \$4, $itr1
     add \$6, $itr1
     mov \$10, $itr2
     cmp \$10, $itr1
     cmovg $itr2, $itr1
     mov $inp, $inl
     xor $itr2, $itr2
 1:  \n";
         &poly_add("0*8($inl)");
         &poly_mul_mulx(); $code.="
         lea 16($inl), $inl
 2:  \n";
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &poly_add("0*8($inl)");
         &poly_mul(); $code.="
         lea 16($inl), $inl
         inc $itr2\n";
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         cmp $itr1, $itr2
     jb 1b
         cmp \$10, $itr2
     jne 2b
     mov $inl, $itr2
     sub $inp, $inl
     mov $inl, $itr1
     mov $tmp_store, $inl
 1:
         add \$16, $itr1
         cmp $inl, $itr1
         jg 1f\n";
         # Hash the remaining whole 16-byte blocks; $itr2 is the hashing
         # pointer, $itr1 the byte count so far.
         &poly_add("0*8($itr2)");
         &poly_mul_mulx(); $code.="
         lea 16($itr2), $itr2
     jmp 1b
 1:  \n";
     # States 2 and 1 decrypt the first 256 bytes here; state 0 is left in
     # A0,B0,C0,D0 for open_avx2_tail_loop.
     &finalize_state_avx2(3);
     &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
     &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
     &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
     lea 8*32($inp), $inp
     lea 8*32($oup), $oup
     sub \$8*32, $inl
     jmp open_avx2_tail_loop
 3:  \n";
 ###############################################################################
     # 385-511 bytes left (512 or more bytes are taken by the main loop, 384 or
     # fewer by the previous cases)
     # Four states; $itr2 is the hashing pointer and $itr1 counts double rounds.
     &prep_state_avx2(4); $code.="
     xor $itr1, $itr1
     mov $inp, $itr2
 1:  \n";
         # Extra 16-byte block, hashed on entry and before double rounds 2-4.
         &poly_add("0*8($itr2)");
         &poly_mul(); $code.="
         lea 2*8($itr2), $itr2
 2:  \n";
         # One double round of the four states with two 16-byte hash blocks in
         # between. emit_body is defined outside this section; presumably
         # emit_body(N) emits the next N instructions of @loop_body - confirm.
         &emit_body(37);
         &poly_add("0*8($itr2)");
         &poly_mul_mulx();
         &emit_body(48);
         &poly_add("2*8($itr2)");
         &poly_mul_mulx(); $code.="
         lea 4*8($itr2), $itr2\n";
         # Emit the rest of the double round and refill @loop_body.
         foreach $l (@loop_body) {$code.=$l."\n";}
         @loop_body = split /\n/, $chacha_body; $code.="
         inc $itr1
         cmp \$4, $itr1
     jl  1b
         cmp \$10, $itr1
     jne 2b
     mov $inl, $itr1
     sub \$12*32, $itr1
     and \$-16, $itr1
 1:
         test $itr1, $itr1
         je 1f\n";
         # The round loop hashed 12*32 bytes; hash the whole 16-byte blocks
         # that remain beyond that.
         &poly_add("0*8($itr2)");
         &poly_mul_mulx(); $code.="
         lea 2*8($itr2), $itr2
         sub \$2*8, $itr1
     jmp 1b
 1:  \n";
     # A0 is spilled because it serves as the helper register for the first
     # xor_stream_avx2 call.
     &finalize_state_avx2(4); $code.="
     vmovdqa $A0, $tmp_store\n";
     &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
     vmovdqa $tmp_store, $A0\n";
     # Remainder of the four-state tail: states 2 and 1 decrypt bytes 128-383,
     # state 0 is left in A0,B0,C0,D0. The string that follows also contains
     # open_avx2_tail_loop (32 bytes per pass, shifting the stream registers
     # down), open_avx2_tail (one optional 16-byte step before handing over to
     # open_sse_tail_16) and the set-up of the two-state open_avx2_192 path.
     &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
     &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
     &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
     lea 12*32($inp), $inp
     lea 12*32($oup), $oup
     sub \$12*32, $inl
 open_avx2_tail_loop:
     cmp \$32, $inl
     jb open_avx2_tail
         sub \$32, $inl
         vpxor ($inp), $A0, $A0
         vmovdqu $A0, ($oup)
         lea 1*32($inp), $inp
         lea 1*32($oup), $oup
         vmovdqa $B0, $A0
         vmovdqa $C0, $B0
         vmovdqa $D0, $C0
     jmp open_avx2_tail_loop
 open_avx2_tail:
     cmp \$16, $inl
     vmovdqa $A0x, $A1x
     jb 1f
     sub \$16, $inl
     #load for decryption
     vpxor ($inp), $A0x, $A1x
     vmovdqu $A1x, ($oup)
     lea 1*16($inp), $inp
     lea 1*16($oup), $oup
     vperm2i128 \$0x11, $A0, $A0, $A0
     vmovdqa $A0x, $A1x
 1:
     vzeroupper
     jmp open_sse_tail_16
 ###############################################################################
 open_avx2_192:
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $B0, $B1
     vmovdqa $B0, $B2
     vmovdqa $C0, $C1
     vmovdqa $C0, $C2
     vpaddd .avx2_inc(%rip), $D0, $D1
     vmovdqa $D0, $T2
     vmovdqa $D1, $T3
     mov \$10, $acc0
 1:  \n";
         # open_avx2_192 round loop: one double round over two states per
         # iteration, ten iterations. The initial rows are kept in registers
         # (A2, B2, C2, T2, T3) and added back in by the string that follows,
         # which also contains the shared open_avx2_short path.
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
         dec $acc0
     jne 1b
     vpaddd $A2, $A0, $A0
     vpaddd $A2, $A1, $A1
     vpaddd $B2, $B0, $B0
     vpaddd $B2, $B1, $B1
     vpaddd $C2, $C0, $C0
     vpaddd $C2, $C1, $C1
     vpaddd $T2, $D0, $D0
     vpaddd $T3, $D1, $D1
     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store the key
     vpand .clamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for up to 192 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
     vperm2i128 \$0x13, $C0, $D0, $B0
     vperm2i128 \$0x02, $A1, $B1, $C0
     vperm2i128 \$0x02, $C1, $D1, $D0
     vperm2i128 \$0x13, $A1, $B1, $A1
     vperm2i128 \$0x13, $C1, $D1, $B1
 open_avx2_short:
     mov %r8, $itr2
     call poly_hash_ad_internal
 open_avx2_hash_and_xor_loop:
         cmp \$32, $inl
         jb open_avx2_short_tail_32
         sub \$32, $inl\n";
         # Load + hash
         &poly_add("0*8($inp)");
         &poly_mul();
         &poly_add("2*8($inp)");
         &poly_mul(); $code.="
         # Load + decrypt
         vpxor ($inp), $A0, $A0
         vmovdqu $A0, ($oup)
         lea 1*32($inp), $inp
         lea 1*32($oup), $oup
         # Shift stream
         vmovdqa $B0, $A0
         vmovdqa $C0, $B0
         vmovdqa $D0, $C0
         vmovdqa $A1, $D0
         vmovdqa $B1, $A1
         vmovdqa $C1, $B1
         vmovdqa $D1, $C1
         vmovdqa $A2, $D1
         vmovdqa $B2, $A2
     jmp open_avx2_hash_and_xor_loop
 open_avx2_short_tail_32:
     cmp \$16, $inl
     vmovdqa $A0x, $A1x
     jb 1f
     sub \$16, $inl\n";
     # open_avx2_short_tail_32 with at least 16 bytes left: hash one 16-byte
     # block, then (in the string that follows) decrypt it and move the upper
     # half of A0 into A1x for open_sse_tail_16. The same string goes on to set
     # up the three-state open_avx2_320 path.
     &poly_add("0*8($inp)");
     &poly_mul(); $code.="
     vpxor ($inp), $A0x, $A3x
     vmovdqu $A3x, ($oup)
     lea 1*16($inp), $inp
     lea 1*16($oup), $oup
     vextracti128 \$1, $A0, $A1x
 1:
     vzeroupper
     jmp open_sse_tail_16
 ###############################################################################
 open_avx2_320:
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $B0, $B1
     vmovdqa $B0, $B2
     vmovdqa $C0, $C1
     vmovdqa $C0, $C2
     vpaddd .avx2_inc(%rip), $D0, $D1
     vpaddd .avx2_inc(%rip), $D1, $D2
     vmovdqa $B0, $T1
     vmovdqa $C0, $T2
     vmovdqa $D0, $ctr0_store
     vmovdqa $D1, $ctr1_store
     vmovdqa $D2, $ctr2_store
     mov \$10, $acc0
 1:  \n";
         # open_avx2_320 round loop: one double round over three states per
         # iteration, ten iterations. The string that follows adds the initial
         # rows back in (B and C from T1/T2, D from the $ctrN_store slots),
         # derives the Poly1305 key, lays out the key stream and jumps to
         # open_avx2_short. After the .size directive the same string starts
         # chacha20_poly1305_seal_avx2: key broadcast, short-input dispatch,
         # four-state set-up and the head of its first round loop.
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         dec $acc0
     jne 1b
     vpaddd .chacha20_consts(%rip), $A0, $A0
     vpaddd .chacha20_consts(%rip), $A1, $A1
     vpaddd .chacha20_consts(%rip), $A2, $A2
     vpaddd $T1, $B0, $B0
     vpaddd $T1, $B1, $B1
     vpaddd $T1, $B2, $B2
     vpaddd $T2, $C0, $C0
     vpaddd $T2, $C1, $C1
     vpaddd $T2, $C2, $C2
     vpaddd $ctr0_store, $D0, $D0
     vpaddd $ctr1_store, $D1, $D1
     vpaddd $ctr2_store, $D2, $D2
     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store the key
     vpand .clamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for up to 320 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
     vperm2i128 \$0x13, $C0, $D0, $B0
     vperm2i128 \$0x02, $A1, $B1, $C0
     vperm2i128 \$0x02, $C1, $D1, $D0
     vperm2i128 \$0x13, $A1, $B1, $A1
     vperm2i128 \$0x13, $C1, $D1, $B1
     vperm2i128 \$0x02, $A2, $B2, $C1
     vperm2i128 \$0x02, $C2, $D2, $D1
     vperm2i128 \$0x13, $A2, $B2, $A2
     vperm2i128 \$0x13, $C2, $D2, $B2
     jmp open_avx2_short
 .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
 ###############################################################################
 ###############################################################################
 .type chacha20_poly1305_seal_avx2,\@function,2
 .align 64
 chacha20_poly1305_seal_avx2:
     vzeroupper
     vmovdqa .chacha20_consts(%rip), $A0
     vbroadcasti128 0*16($keyp), $B0
     vbroadcasti128 1*16($keyp), $C0
     vbroadcasti128 2*16($keyp), $D0
     vpaddd .avx2_init(%rip), $D0, $D0
     cmp \$6*32, $inl
     jbe seal_avx2_192
     cmp \$10*32, $inl
     jbe seal_avx2_320
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $A0, $A3
     vmovdqa $B0, $B1
     vmovdqa $B0, $B2
     vmovdqa $B0, $B3
     vmovdqa $B0, $state1_store
     vmovdqa $C0, $C1
     vmovdqa $C0, $C2
     vmovdqa $C0, $C3
     vmovdqa $C0, $state2_store
     vmovdqa $D0, $D3
     vpaddd .avx2_inc(%rip), $D3, $D2
     vpaddd .avx2_inc(%rip), $D2, $D1
     vpaddd .avx2_inc(%rip), $D1, $D0
     vmovdqa $D0, $ctr0_store
     vmovdqa $D1, $ctr1_store
     vmovdqa $D2, $ctr2_store
     vmovdqa $D3, $ctr3_store
     mov \$10, $acc0
 1:  \n";
         foreach $l (@loop_body) {$code.=$l."\n";}
         @loop_body = split /\n/, $chacha_body; $code.="
         dec $acc0
         jnz 1b\n";
     &finalize_state_avx2(4); $code.="
     vperm2i128 \$0x13, $C3, $D3, $C3
     vperm2i128 \$0x02, $A3, $B3, $D3
     vperm2i128 \$0x13, $A3, $B3, $A3
     vpand .clamp(%rip), $D3, $D3
     vmovdqa $D3, $r_store
     mov %r8, $itr2
     call poly_hash_ad_internal
     # Safely store 320 bytes (otherwise would handle with optimized call)
     vpxor 0*32($inp), $A3, $A3
     vpxor 1*32($inp), $C3, $C3
     vmovdqu $A3, 0*32($oup)
     vmovdqu $C3, 1*32($oup)\n";
     &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
     &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
     &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
     lea 10*32($inp), $inp
     sub \$10*32, $inl
     mov \$10*32, $itr1
     cmp \$4*32, $inl
     jbe seal_avx2_hash
     vpxor 0*32($inp), $A0, $A0
     vpxor 1*32($inp), $B0, $B0
     vpxor 2*32($inp), $C0, $C0
     vpxor 3*32($inp), $D0, $D0
     vmovdqu $A0, 10*32($oup)
     vmovdqu $B0, 11*32($oup)
     vmovdqu $C0, 12*32($oup)
     vmovdqu $D0, 13*32($oup)
     lea 4*32($inp), $inp
     sub \$4*32, $inl
     mov \$8, $itr1
     mov \$2, $itr2
     cmp \$4*32, $inl
     jbe seal_avx2_tail_128
     cmp \$8*32, $inl
     jbe seal_avx2_tail_256
     cmp \$12*32, $inl
     jbe seal_avx2_tail_384
     cmp \$16*32, $inl
     jbe seal_avx2_tail_512\n";
     # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
     &prep_state_avx2(4);
     foreach $l (@loop_body) {$code.=$l."\n";}
     @loop_body = split /\n/, $chacha_body;
     &emit_body(41);
     @loop_body = split /\n/, $chacha_body; $code.="
     sub \$16, $oup
     mov \$9, $itr1
     jmp 4f
 1:  \n";
         &prep_state_avx2(4); $code.="
         mov \$10, $itr1
 2:  \n";
             &poly_add("0*8($oup)");
             &emit_body(10);
             &poly_stage1_mulx();
             &emit_body(9);
             &poly_stage2_mulx();
             &emit_body(12);
             &poly_stage3_mulx();
             &emit_body(10);
             &poly_reduce_stage(); $code.="
 4:  \n";
             &emit_body(9);
             &poly_add("2*8($oup)");
             &emit_body(8);
             &poly_stage1_mulx();
             &emit_body(18);
             &poly_stage2_mulx();
             &emit_body(18);
             &poly_stage3_mulx();
             &emit_body(9);
             &poly_reduce_stage();
             &emit_body(8);
             &poly_add("4*8($oup)"); $code.="
             lea 6*8($oup), $oup\n";
             &emit_body(18);
             &poly_stage1_mulx();
             &emit_body(8);
             &poly_stage2_mulx();
             &emit_body(8);
             &poly_stage3_mulx();
             &emit_body(18);
             &poly_reduce_stage();
             foreach $l (@loop_body) {$code.=$l."\n";}
             @loop_body = split /\n/, $chacha_body; $code.="
             dec $itr1
         jne 2b\n";
         &finalize_state_avx2(4); $code.="
         lea 4*8($oup), $oup
         vmovdqa $A0, $tmp_store\n";
         &poly_add("-4*8($oup)");
         &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
         vmovdqa $tmp_store, $A0\n";
         &poly_mul();
         &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
         &poly_add("-2*8($oup)");
         &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
         &poly_mul();
         &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
         lea 16*32($inp), $inp
         sub \$16*32, $inl
         cmp \$16*32, $inl
     jg 1b\n";
     &poly_add("0*8($oup)");
     &poly_mul();
     &poly_add("2*8($oup)");
     &poly_mul(); $code.="
     lea 4*8($oup), $oup
     mov \$10, $itr1
     xor $itr2, $itr2
     cmp \$4*32, $inl
     ja 3f
 ###############################################################################
 seal_avx2_tail_128:\n";
     &prep_state_avx2(1); $code.="
 1:  \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 2*8($oup), $oup
 2:  \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &poly_add("0*8($oup)");
         &poly_mul();
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &poly_add("2*8($oup)");
         &poly_mul(); $code.="
         lea 4*8($oup), $oup
         dec $itr1
     jg 1b
         dec $itr2
     jge 2b\n";
     &finalize_state_avx2(1);
     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
     jmp seal_avx2_short_loop
 3:
     cmp \$8*32, $inl
     ja 3f
 ###############################################################################
 seal_avx2_tail_256:\n";
     &prep_state_avx2(2); $code.="
 1:  \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 2*8($oup), $oup
 2:  \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &poly_add("0*8($oup)");
         &poly_mul();
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         &poly_add("2*8($oup)");
         &poly_mul(); $code.="
         lea 4*8($oup), $oup
         dec $itr1
     jg 1b
         dec $itr2
     jge 2b\n";
     &finalize_state_avx2(2);
     &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
     mov \$4*32, $itr1
     lea 4*32($inp), $inp
     sub \$4*32, $inl
     jmp seal_avx2_hash
 3:
     cmp \$12*32, $inl
     ja seal_avx2_tail_512
 ###############################################################################
 seal_avx2_tail_384:\n";
     &prep_state_avx2(3); $code.="
 1:  \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 2*8($oup), $oup
 2:  \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &poly_add("0*8($oup)");
         &poly_mul();
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &poly_add("2*8($oup)");
         &poly_mul();
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         lea 4*8($oup), $oup
         dec $itr1
     jg 1b
         dec $itr2
     jge 2b\n";
     &finalize_state_avx2(3);
     &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
     &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
     mov \$8*32, $itr1
     lea 8*32($inp), $inp
     sub \$8*32, $inl
     jmp seal_avx2_hash
 ###############################################################################
 seal_avx2_tail_512:\n";
     &prep_state_avx2(4); $code.="
 1:  \n";
         &poly_add("0($oup)");
         &poly_mul_mulx(); $code.="
         lea 2*8($oup), $oup
 2:  \n";
         &emit_body(20);
         &poly_add("0*8($oup)");
         &emit_body(20);
         &poly_stage1_mulx();
         &emit_body(20);
         &poly_stage2_mulx();
         &emit_body(20);
         &poly_stage3_mulx();
         &emit_body(20);
         &poly_reduce_stage();
         &emit_body(20);
         &poly_add("2*8($oup)");
         &emit_body(20);
         &poly_stage1_mulx();
         &emit_body(20);
         &poly_stage2_mulx();
         &emit_body(20);
         &poly_stage3_mulx();
         &emit_body(20);
         &poly_reduce_stage();
         foreach $l (@loop_body) {$code.=$l."\n";}
         @loop_body = split /\n/, $chacha_body; $code.="
         lea 4*8($oup), $oup
         dec $itr1
     jg 1b
         dec $itr2
     jge 2b\n";
     &finalize_state_avx2(4); $code.="
     vmovdqa $A0, $tmp_store\n";
     &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
     vmovdqa $tmp_store, $A0\n";
     &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
     &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
     mov \$12*32, $itr1
     lea 12*32($inp), $inp
     sub \$12*32, $inl
     jmp seal_avx2_hash
 ################################################################################
 seal_avx2_320:
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $B0, $B1
     vmovdqa $B0, $B2
     vmovdqa $C0, $C1
     vmovdqa $C0, $C2
     vpaddd .avx2_inc(%rip), $D0, $D1
     vpaddd .avx2_inc(%rip), $D1, $D2
     vmovdqa $B0, $T1
     vmovdqa $C0, $T2
     vmovdqa $D0, $ctr0_store
     vmovdqa $D1, $ctr1_store
     vmovdqa $D2, $ctr2_store
     mov \$10, $acc0
 1:  \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         dec $acc0
     jne 1b
     vpaddd .chacha20_consts(%rip), $A0, $A0
     vpaddd .chacha20_consts(%rip), $A1, $A1
     vpaddd .chacha20_consts(%rip), $A2, $A2
     vpaddd $T1, $B0, $B0
     vpaddd $T1, $B1, $B1
     vpaddd $T1, $B2, $B2
     vpaddd $T2, $C0, $C0
     vpaddd $T2, $C1, $C1
     vpaddd $T2, $C2, $C2
     vpaddd $ctr0_store, $D0, $D0
     vpaddd $ctr1_store, $D1, $D1
     vpaddd $ctr2_store, $D2, $D2
     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store the key
     vpand .clamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for up to 320 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
     vperm2i128 \$0x13, $C0, $D0, $B0
     vperm2i128 \$0x02, $A1, $B1, $C0
     vperm2i128 \$0x02, $C1, $D1, $D0
     vperm2i128 \$0x13, $A1, $B1, $A1
     vperm2i128 \$0x13, $C1, $D1, $B1
     vperm2i128 \$0x02, $A2, $B2, $C1
     vperm2i128 \$0x02, $C2, $D2, $D1
     vperm2i128 \$0x13, $A2, $B2, $A2
     vperm2i128 \$0x13, $C2, $D2, $B2
     jmp seal_avx2_short
 ################################################################################
 seal_avx2_192:
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $B0, $B1
     vmovdqa $B0, $B2
     vmovdqa $C0, $C1
     vmovdqa $C0, $C2
     vpaddd .avx2_inc(%rip), $D0, $D1
     vmovdqa $D0, $T2
     vmovdqa $D1, $T3
     mov \$10, $acc0
 1:  \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
         dec $acc0
     jne 1b
     vpaddd $A2, $A0, $A0
     vpaddd $A2, $A1, $A1
     vpaddd $B2, $B0, $B0
     vpaddd $B2, $B1, $B1
     vpaddd $C2, $C0, $C0
     vpaddd $C2, $C1, $C1
     vpaddd $T2, $D0, $D0
     vpaddd $T3, $D1, $D1
     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store the key
     vpand .clamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for up to 192 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
     vperm2i128 \$0x13, $C0, $D0, $B0
     vperm2i128 \$0x02, $A1, $B1, $C0
     vperm2i128 \$0x02, $C1, $D1, $D0
     vperm2i128 \$0x13, $A1, $B1, $A1
     vperm2i128 \$0x13, $C1, $D1, $B1
 seal_avx2_short:
     mov %r8, $itr2
     call poly_hash_ad_internal
     xor $itr1, $itr1
 seal_avx2_hash:
         cmp \$16, $itr1
         jb seal_avx2_short_loop\n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         sub \$16, $itr1
         add \$16, $oup
     jmp seal_avx2_hash
 seal_avx2_short_loop:
         cmp \$32, $inl
         jb seal_avx2_short_tail
         sub \$32, $inl
         # Encrypt
         vpxor ($inp), $A0, $A0
         vmovdqu $A0, ($oup)
         lea 1*32($inp), $inp
         # Load + hash\n";
         &poly_add("0*8($oup)");
         &poly_mul();
         &poly_add("2*8($oup)");
         &poly_mul(); $code.="
         lea 1*32($oup), $oup
         # Shift stream
         vmovdqa $B0, $A0
         vmovdqa $C0, $B0
         vmovdqa $D0, $C0
         vmovdqa $A1, $D0
         vmovdqa $B1, $A1
         vmovdqa $C1, $B1
         vmovdqa $D1, $C1
         vmovdqa $A2, $D1
         vmovdqa $B2, $A2
     jmp seal_avx2_short_loop
 seal_avx2_short_tail:
     cmp \$16, $inl
     jb 1f
     sub \$16, $inl
     vpxor ($inp), $A0x, $A3x
     vmovdqu $A3x, ($oup)
     lea 1*16($inp), $inp\n";
     &poly_add("0*8($oup)");
     &poly_mul(); $code.="
     lea 1*16($oup), $oup
     vextracti128 \$1, $A0, $A0x
 1:
     vzeroupper
     jmp seal_sse_tail_16
 .cfi_endproc
 ";
 }

 # Emit the generated assembly. STDOUT was aliased near the top of this script
 # to a pipe feeding x86_64-xlate.pl, so everything printed here goes through
 # the translator before reaching the output file.
 if (!$win64) {
   # Replace every `...` span in the accumulated code with the result of
   # evaluating its contents as Perl.
   $code =~ s/\`([^\`]*)\`/eval $1/gem;
   print $code;
 } else {
   # Win64 flavours receive only a stub symbol instead of the generated code.
   print <<___;
 .globl dummy_chacha20_poly1305_asm
 .type dummy_chacha20_poly1305_asm,\@abi-omnipotent
 dummy_chacha20_poly1305_asm:
     ret
 ___
 }

 # Closing a piped handle waits for the child and returns false if a write
 # failed or the child exited with a non-zero status (in which case $! is 0
 # and the status is in $?). Fail loudly so a broken translation step cannot
 # leave behind a truncated or empty assembly file while this script exits 0.
 close STDOUT or die "error closing STDOUT: $! (child exit status: $?)";