You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
314 lines
6.6 KiB
314 lines
6.6 KiB
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# RC4 for PA-RISC.

# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.
|
|
|
# Directory part of the script path (including the trailing separator),
# or "" when the script was started without any path component.  Testing
# the match result avoids picking up an undefined/stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> <output file>
$flavour = shift;
$output  = shift;

# Send the generated assembly to the output file.  Without an output
# name (or with "-") keep the inherited STDOUT.  A failed open is fatal
# instead of being silently ignored.
if (defined($output) && length($output) && $output ne "-") {
	open(STDOUT, ">", $output) or die "can't open $output: $!";
}
|
|
|
# ABI-dependent parameters.  A flavour containing "64" selects .LEVEL 2.0W
# with 8-byte SIZE_T and the doubleword store/load mnemonics; anything
# else selects .LEVEL 1.0 with 4-byte SIZE_T and the word mnemonics.
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP, $PUSH, $PUSHMA, $POP, $POPMB) =
	($flavour =~ /64/)
	? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb")
	: ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm");

# 4 saved regs + frame marker
#			[+ argument transfer]
$FRAME = $FRAME_MARKER + 4*$SIZE_T;
|
# Size in bytes of one key-schedule element.  Defaults to 1 (RC4_CHAR);
# becomes 4 when the first "#define RC4_INT ..." line found in
# opensslconf.h does not end in "char".  A missing header keeps the default.
$SZ = 1;
if (open CONF,"<${dir}../../opensslconf.h") {
	while (<CONF>) {
		next unless m/#\s*define\s+RC4_INT\s+(.*)/;
		$SZ = 4 unless ($1 =~ /char$/);
		last;			# only the first definition counts
	}
	close CONF;
}
|
|
|
# Mnemonics used to access the key schedule, chosen by element size:
# plain load, indexed load, index-to-address add, and store.
($LD, $LDX, $MKX, $ST) = ($SZ == 1)
	? ("ldb", "ldbx",   "addl",    "stb")	# RC4_CHAR
	: ("ldw", "ldwx,s", "sh2addl", "stw");	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
|
|
|
# Registers holding the four RC4 arguments.
($key, $len, $inp, $out) = ("%r26", "%r25", "%r24", "%r23");

# Register pairs that unrolledloopbody rotates between steps, plus the
# registers used for the second index and the value loaded through it.
@XX = ("%r19", "%r20");
@TX = ("%r21", "%r22");
($YY, $TY) = ("%r28", "%r29");

# Scratch registers; %r2-%r6 are the ones the RC4 prologue saves.
($acc, $ix, $iy)     = ("%r1", "%r2", "%r3");
($dat0, $dat1, $rem) = ("%r4", "%r5", "%r6");
$mask = "%r31";
|
|
|
# Append four consecutive copies of the RC4 step to $code.
#
# Takes no arguments and works entirely on file-level state: the register
# aliases, the $LDX/$MKX/$ST mnemonics, and the @XX/@TX pairs, which are
# rotated after every copy so each copy uses the registers the previous
# one prepared.  Four rotations of a two-element list restore the
# original order, so the sub can be called repeatedly.
#
# The backquoted sprintf expressions are only expanded later, when the
# whole of $code is post-processed.  For copies 1..3 they emit the load
# of the previous step's output element into $dat1 and the zdep/dep that
# inserts it into $acc; for copy 0 they expand to nothing.
#
# Instructions must not start in column 1 (that column is for labels),
# hence the leading tab on every emitted line.
sub unrolledloopbody {
	for my $i (0 .. 3) {
		$code.=<<___;
	ldo	1($XX[0]),$XX[1]
	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($i>0)`
	and	$mask,$XX[1],$XX[1]
	$LDX	$YY($key),$TY
	$MKX	$YY,$key,$ix
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[0],$key,$iy
	$ST	$TX[0],0($ix)
	comclr,<> $XX[1],$YY,%r0	; conditional
	copy	$TX[0],$TX[1]		; move
	`sprintf("%sdep	%$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
	$ST	$TY,0($iy)
	addl	$TX[0],$TY,$TY
	addl	$TX[1],$YY,$YY
	and	$mask,$TY,$TY
	and	$mask,$YY,$YY
___
		# "rotate" registers
		push(@TX,shift(@TX));
		push(@XX,shift(@XX));
	}
}
|
|
|
# Append a one-byte-per-iteration RC4 loop to $code.
#
#   $label - assembler label placed at the top of the loop (column 1)
#            and used as the target of the closing addib
#   $count - register that addib decrements by one on every pass; the
#            loop repeats while the result is non-zero
#
# Each pass reads one input byte with "ldbx $inp($out)", xors it with the
# element loaded into $acc, and stores the result at -1($out) after $out
# has been advanced.  The final "and" follows the branch and is kept as
# the last line on purpose.  Note that "$count" inside the assembler
# comment is interpolated as well.
sub foldedloop {
	my ($label,$count)=@_;

	$code.=<<___;
$label
	$MKX	$YY,$key,$iy
	$LDX	$YY($key),$TY
	$MKX	$XX[0],$key,$ix
	$ST	$TX[0],0($iy)
	ldo	1($XX[0]),$XX[0]
	$ST	$TY,0($ix)
	addl	$TX[0],$TY,$TY
	ldbx	$inp($out),$dat1
	and	$mask,$TY,$TY
	and	$mask,$XX[0],$XX[0]
	$LDX	$TY($key),$acc
	$LDX	$XX[0]($key),$TX[0]
	ldo	1($out),$out
	xor	$dat1,$acc,$acc
	addl	$TX[0],$YY,$YY
	stb	$acc,-1($out)
	addib,<> -1,$count,$label	; $count is always small
	and	$mask,$YY,$YY
___
}
|
|
|
# ---------------------------------------------------------------------
# RC4 procedure.
#
# The text is accumulated in $code; backquoted expressions are expanded
# later when $code is post-processed.  Labels start in column 1 and
# instructions are indented.  The order of the emitted lines is
# significant, in particular the instruction following each branch.
#
# Entry: save %r2-%r6, leave through L$abort when $len is zero, turn
# $inp into the distance between $inp and $out, load the first two
# elements of the key structure into $XX[0]/$YY and advance $key past
# them.  Short inputs (the "6 >>= $len" test) go straight to the
# byte-wise loop at L$oop1.
# ---------------------------------------------------------------------
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
	.PROC
	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)

	cmpib,*=	0,$len,L\$abort
	sub		$inp,$out,$inp		; distance between $inp and $out

	$LD	`0*$SZ`($key),$XX[0]
	$LD	`1*$SZ`($key),$YY
	ldo	`2*$SZ`($key),$key

	ldi	0xff,$mask
	ldi	3,$dat0

	ldo	1($XX[0]),$XX[0]	; warm up loop
	and	$mask,$XX[0],$XX[0]
	$LDX	$XX[0]($key),$TX[0]
	addl	$TX[0],$YY,$YY
	cmpib,*>>=	6,$len,L\$oop1		; is $len large enough to bother?
	and	$mask,$YY,$YY

	and,<>	$out,$dat0,$rem		; is $out aligned?
	b	L\$alignedout
	subi	4,$rem,$rem
	sub	$len,$rem,$len
___
# Byte-wise loop that runs $rem times, i.e. until $out is word aligned.
foldedloop("L\$alignout",$rem);		# process till $out is aligned

# With $out aligned, pick the word-wise loop: L$oop4 when $inp is aligned
# as well, otherwise set up the shift amount for vshd and fall into
# L$oop4misalignedinp.
$code.=<<___;
L\$alignedout				; $len is at least 4 here
	and,<>	$inp,$dat0,$acc		; is $inp aligned?
	b	L\$oop4
	sub	$inp,$acc,$rem		; align $inp

	sh3addl	$acc,%r0,$acc
	subi	32,$acc,$acc
	mtctl	$acc,%cr11		; load %sar with vshd align factor
	ldwx	$rem($out),$dat0
	ldo	4($rem),$rem
L\$oop4misalignedinp
___
unrolledloopbody();
# Tail of the misaligned-input loop: fetch the next input word, combine
# it with the previous one via vshd, xor and store one output word.
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$rem($out),$dat1
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	vshd	$dat0,$dat1,$iy		; align data
	copy	$dat1,$dat0
	xor	$iy,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4misalignedinp
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
	b	L\$oop1
	nop

	.ALIGN	8
L\$oop4
___
unrolledloopbody();
# Tail of the aligned loop: xor one input word and store it.
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$inp($out),$dat0
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	xor	$dat0,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
___
# Byte-wise loop for whatever is left in $len.
foldedloop("L\$oop1",$len);
# Exit: undo the warm-up step, store the two state elements back in front
# of the table, restore the saved registers and return.
$code.=<<___;
L\$done
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
	ldo	-1($XX[0]),$XX[0]	; chill out loop
	sub	$YY,$TX[0],$YY
	and	$mask,$XX[0],$XX[0]
	and	$mask,$YY,$YY
	$ST	$XX[0],`-2*$SZ`($key)
	$ST	$YY,`-1*$SZ`($key)
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
___
|
|
|
# ---------------------------------------------------------------------
# private_RC4_set_key and RC4_options.
#
# private_RC4_set_key zeroes the first two elements of the key structure,
# fills the following 256 elements with 0..255 (loop L$1st), then runs
# 256 rounds of swaps driven by the key bytes (loop L$2nd).  %r23 is a
# negative index into the key; the nullifying addi restarts it at -$len
# when it reaches zero, so the key bytes are reused cyclically.
#
# RC4_options returns in %r28 the address of the L$opts string, computed
# relative to L$pic.
#
# "@XX[0]"/"@TX[0]" are one-element slices and interpolate to the same
# register names as $XX[0]/$TX[0].  Labels start in column 1,
# instructions are indented; line order matters because of the
# instruction following each branch.
# ---------------------------------------------------------------------
$code.=<<___;

	.EXPORT	private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	8
private_RC4_set_key
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	$ST	%r0,`0*$SZ`($key)
	$ST	%r0,`1*$SZ`($key)
	ldo	`2*$SZ`($key),$key
	copy	%r0,@XX[0]
L\$1st
	$ST	@XX[0],0($key)
	ldo	1(@XX[0]),@XX[0]
	bb,>=	@XX[0],`31-8`,L\$1st	; @XX[0]<256
	ldo	$SZ($key),$key

	ldo	`-256*$SZ`($key),$key	; rewind $key
	addl	$len,$inp,$inp		; $inp to point at the end
	sub	%r0,$len,%r23		; inverse index
	copy	%r0,@XX[0]
	copy	%r0,@XX[1]
	ldi	0xff,$mask

L\$2nd
	$LDX	@XX[0]($key),@TX[0]
	ldbx	%r23($inp),@TX[1]
	addi,nuv	1,%r23,%r23	; increment and conditional
	sub	%r0,$len,%r23		; inverse index
	addl	@TX[0],@XX[1],@XX[1]
	addl	@TX[1],@XX[1],@XX[1]
	and	$mask,@XX[1],@XX[1]
	$MKX	@XX[0],$key,$TY
	$LDX	@XX[1]($key),@TX[1]
	$MKX	@XX[1],$key,$YY
	ldo	1(@XX[0]),@XX[0]
	$ST	@TX[0],0($YY)
	bb,>=	@XX[0],`31-8`,L\$2nd	; @XX[0]<256
	$ST	@TX[1],0($TY)

	bv,n	(%r2)
	.EXIT
	nop
	.PROCEND

	.EXPORT	RC4_options,ENTRY
	.ALIGN	8
RC4_options
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	blr	%r0,%r28
	ldi	3,%r1
L\$pic
	andcm	%r28,%r1,%r28
	bv	(%r2)
	.EXIT
	ldo	L\$opts-L\$pic(%r28),%r28
	.PROCEND
	.ALIGN	8
L\$opts
	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
|
# Turn accumulated generator text into final assembly.
#
#   $text   - text as built up in $code
#   $size_t - pointer size in bytes (4 or 8)
#
# Returns the processed text:
#   * every `...` span is replaced by the result of eval'ing its contents
#     (the spans are produced by this script itself, never by user input);
#   * for 4-byte pointers every "cmpib,*" is rewritten to "comib,";
#   * for 8-byte pointers the word "bv" is rewritten to "bve".
sub _finalize_asm {
	my ($text, $size_t) = @_;
	$text   = "" unless defined $text;
	$size_t = 0  unless defined $size_t;

	$text =~ s/\`([^\`]*)\`/eval $1/gem;
	$text =~ s/cmpib,\*/comib,/gm	if ($size_t==4);
	$text =~ s/\bbv\b/bve/gm	if ($size_t==8);

	return $text;
}

$code = _finalize_asm($code, $SIZE_T);

print $code;
# Buffered write errors only show up when the handle is closed; treat
# them as fatal so a truncated output file is never mistaken for success.
close STDOUT or die "error closing STDOUT: $!";
|
|
|