blob: aa360293864b5294cd9469efa39dc33b7308b0f1 [file] [log] [blame]
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% than code generated by vendor
# compiler.
$cnt="v0"; # $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3"; # $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7"; # $8
#################
$Xi="a0"; # $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4"; # $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10"; # $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT"; # $28
{ my $N;
sub loop() {
$N++;
$code.=<<___;
.align 4
extbl $Xlo,7,$nlo
and $nlo,0xf0,$nhi
sll $nlo,4,$nlo
and $nlo,0xf0,$nlo
addq $nlo,$Htbl,$nlo
ldq $Zlo,8($nlo)
addq $nhi,$Htbl,$nhi
ldq $Zhi,0($nlo)
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
lda $cnt,6(zero)
extbl $Xlo,6,$nlo
ldq $Tlo1,8($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $Thi1,0($nhi)
srl $Zlo,4,$Zlo
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
and $nlo,0xf0,$nhi
xor $Tlo1,$Zlo,$Zlo
sll $nlo,4,$nlo
xor $Thi1,$Zhi,$Zhi
and $nlo,0xf0,$nlo
addq $nlo,$Htbl,$nlo
ldq $Tlo0,8($nlo)
addq $nhi,$Htbl,$nhi
ldq $Thi0,0($nlo)
.Looplo$N:
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
subq $cnt,1,$cnt
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xlo,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
bne $cnt,.Looplo$N
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
lda $cnt,7(zero)
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xhi,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
unop
.Loophi$N:
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
subq $cnt,1,$cnt
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xhi,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
bne $cnt,.Loophi$N
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
xor $Tlo0,$Zlo,$Zlo
xor $Thi0,$Zhi,$Zhi
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
xor $t0,$Zlo,$Zlo
xor $rem,$Zhi,$Zhi
___
}}
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl gcm_gmult_4bit
.align 4
.ent gcm_gmult_4bit
gcm_gmult_4bit:
.frame sp,0,ra
.prologue 0
ldq $Xlo,8($Xi)
ldq $Xhi,0($Xi)
bsr $t0,picmeup
nop
___
&loop();
$code.=<<___;
srl $Zlo,24,$t0 # byte swap
srl $Zlo,8,$t1
sll $Zlo,8,$t2
sll $Zlo,24,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
zapnot $Zlo,0x88,$Zlo
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zlo,$t0,$Zlo
srl $Zhi,24,$t0
srl $Zhi,8,$t1
or $Zlo,$t2,$Zlo
sll $Zhi,8,$t2
sll $Zhi,24,$Zhi
srl $Zlo,32,$Xlo
sll $Zlo,32,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
stq $Xlo,8($Xi)
stq $Xhi,0($Xi)
ret (ra)
.end gcm_gmult_4bit
___
$inhi="s0";
$inlo="s1";
$code.=<<___;
.globl gcm_ghash_4bit
.align 4
.ent gcm_ghash_4bit
gcm_ghash_4bit:
lda sp,-32(sp)
stq ra,0(sp)
stq s0,8(sp)
stq s1,16(sp)
.mask 0x04000600,-32
.frame sp,32,ra
.prologue 0
ldq_u $inhi,0($inp)
ldq_u $Thi0,7($inp)
ldq_u $inlo,8($inp)
ldq_u $Tlo0,15($inp)
ldq $Xhi,0($Xi)
ldq $Xlo,8($Xi)
bsr $t0,picmeup
nop
.Louter:
extql $inhi,$inp,$inhi
extqh $Thi0,$inp,$Thi0
or $inhi,$Thi0,$inhi
lda $inp,16($inp)
extql $inlo,$inp,$inlo
extqh $Tlo0,$inp,$Tlo0
or $inlo,$Tlo0,$inlo
subq $len,16,$len
xor $Xlo,$inlo,$Xlo
xor $Xhi,$inhi,$Xhi
___
&loop();
$code.=<<___;
srl $Zlo,24,$t0 # byte swap
srl $Zlo,8,$t1
sll $Zlo,8,$t2
sll $Zlo,24,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
zapnot $Zlo,0x88,$Zlo
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zlo,$t0,$Zlo
srl $Zhi,24,$t0
srl $Zhi,8,$t1
or $Zlo,$t2,$Zlo
sll $Zhi,8,$t2
sll $Zhi,24,$Zhi
srl $Zlo,32,$Xlo
sll $Zlo,32,$Zlo
beq $len,.Ldone
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
ldq_u $inhi,0($inp)
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
ldq_u $Thi0,7($inp)
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
ldq_u $inlo,8($inp)
ldq_u $Tlo0,15($inp)
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
br zero,.Louter
.Ldone:
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
stq $Xlo,8($Xi)
stq $Xhi,0($Xi)
.set noreorder
/*ldq ra,0(sp)*/
ldq s0,8(sp)
ldq s1,16(sp)
lda sp,32(sp)
ret (ra)
.end gcm_ghash_4bit
.align 4
.ent picmeup
picmeup:
.frame sp,0,$t0
.prologue 0
br $rem_4bit,.Lpic
.Lpic: lda $rem_4bit,12($rem_4bit)
ret ($t0)
.end picmeup
nop
rem_4bit:
.long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
.long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
.long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
.long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;