#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled with respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated by
# the vendor compiler.
|
|
# Symbolic names for the Alpha registers referenced by the assembly
# templates below. Each variable holds a register mnemonic that is
# interpolated into the heredocs; the "$n" notes are the original
# author's register-number annotations.
$cnt="v0";		# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";		# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";		# $8
#################
$Xi="a0";		# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";		# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";		# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";		# $28
|
|
|
# loop() appends one copy of the multiplication instruction stream to
# $code.  The emitted code consumes the value held in $Xlo/$Xhi one byte
# at a time (byte index 7 down to 0 of $Xlo, then 7 down to 0 of $Xhi),
# splits every byte into two nibbles that are scaled to 16-byte offsets
# into $Htbl, and accumulates the product in $Zlo/$Zhi, folding in
# entries fetched from the table addressed by $rem_4bit.
#
# $N counts how many times the template has been expanded; it is pasted
# into the .Looplo/.Loophi labels so that each expansion gets its own
# local labels.
#
# No comments are placed inside the heredoc on purpose: its text is the
# generated assembly and is emitted verbatim.
{ my $N;
sub loop() {

    $N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}
|
|
|
# Start of the generated file and entry sequence of gcm_gmult_4bit.
# The emitted code selects the register-definition headers, declares the
# function with an empty stack frame, loads the two 64-bit halves of the
# block addressed by $Xi into $Xhi/$Xlo and calls picmeup with $t0 as
# the return-address register (picmeup sets up $rem_4bit).
# This assignment (re)initialises $code; everything else appends to it.
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

# Expand the multiplication body for gcm_gmult_4bit.
loop();
|
|
|
# Tail of gcm_gmult_4bit: byte-swaps the product left in $Zlo/$Zhi
# (shift/zapnot/or sequences followed by a 32-bit half exchange), stores
# the two halves back to the block addressed by $Xi and returns.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___
|
|
|
# Two extra registers, used only by gcm_ghash_4bit, that hold the
# current 16-byte input block. The generated prologue saves s0/s1 on the
# stack before using them.
$inhi="s0";
$inlo="s1";

# Entry sequence and top of the per-block loop of gcm_ghash_4bit.
# The emitted code opens a 32-byte frame (ra, s0, s1 saved), fetches the
# first input block with ldq_u pairs, loads $Xhi/$Xlo from the block
# addressed by $Xi and calls picmeup.  At .Louter the ldq_u pairs are
# merged with extql/extqh/or (so $inp need not be 8-byte aligned), $inp
# is advanced by 16, $len is reduced by 16 and the input is xor-ed into
# $Xlo/$Xhi ahead of the multiplication body.
$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

# Expand the multiplication body for gcm_ghash_4bit.
loop();
|
|
|
# Bottom of the gcm_ghash_4bit loop, its exit path, and the shared
# helper and table.
#
# * The byte swap of $Zlo/$Zhi into $Xlo/$Xhi is the same sequence as in
#   gcm_gmult_4bit.  On the looping path it is interleaved with the
#   ldq_u loads of the next input block before branching back to
#   .Louter; when $len has reached zero control goes to .Ldone, which
#   finishes the swap without loading more input.
# * .Ldone stores $Xlo/$Xhi to the block addressed by $Xi, reloads s0/s1,
#   releases the 32-byte frame and returns.  The reload of ra is left
#   commented out in the emitted code, as in the original.
# * picmeup uses "br" to capture the address of .Lpic in $rem_4bit and
#   adds 12 to step over the lda, ret and nop that precede the rem_4bit
#   table, then returns through $t0.
# * rem_4bit is the 16-entry table indexed via s8addq in the
#   multiplication body.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
|
# Write the generated assembly. If a (true) first command-line argument
# is present it names the output file and STDOUT is redirected to it;
# otherwise the text goes to the inherited STDOUT.
$output = shift;
if ($output) {
    # Three-argument open: the file name is never parsed for a mode or a
    # pipe, and a failure to create the file is fatal instead of silently
    # sending the assembly to the original STDOUT.
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
# Buffered write errors (e.g. a full disk) only surface at close.
close STDOUT or die "error closing STDOUT: $!";
|
|
|
|