|  | #!/usr/bin/env perl | 
|  | # | 
|  | # ==================================================================== | 
|  | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 
|  | # project. The module is, however, dual licensed under OpenSSL and | 
|  | # CRYPTOGAMS licenses depending on where you obtain it. For further | 
|  | # details see http://www.openssl.org/~appro/cryptogams/. | 
|  | # ==================================================================== | 
|  | # | 
|  | # February 2009 | 
|  | # | 
|  | # Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to | 
|  | # "cluster" Address Generation Interlocks, so that one pipeline stall | 
|  | # resolves several dependencies. | 
|  |  | 
|  | # November 2010. | 
|  | # | 
|  | # Adapt for -m31 build. If kernel supports what's called "highgprs" | 
|  | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | 
|  | # instructions and achieve "64-bit" performance even in 31-bit legacy | 
|  | # application context. The feature is not specific to any particular | 
|  | # processor, as long as it's "z-CPU". Latter implies that the code | 
|  | # remains z/Architecture specific. On z990 it was measured to perform | 
|  | # 50% better than code generated by gcc 4.3. | 
|  |  | 
|  | $flavour = shift; | 
|  |  | 
|  | if ($flavour =~ /3[12]/) { | 
|  | $SIZE_T=4; | 
|  | $g=""; | 
|  | } else { | 
|  | $SIZE_T=8; | 
|  | $g="g"; | 
|  | } | 
|  |  | 
|  | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 
|  | open STDOUT,">$output"; | 
|  |  | 
|  | $rp="%r14"; | 
|  | $sp="%r15"; | 
|  | $code=<<___; | 
|  | .text | 
|  |  | 
|  | ___ | 
|  |  | 
|  | # void RC4(RC4_KEY *key,size_t len,const void *inp,void *out) | 
|  | { | 
|  | $acc="%r0"; | 
|  | $cnt="%r1"; | 
|  | $key="%r2"; | 
|  | $len="%r3"; | 
|  | $inp="%r4"; | 
|  | $out="%r5"; | 
|  |  | 
|  | @XX=("%r6","%r7"); | 
|  | @TX=("%r8","%r9"); | 
|  | $YY="%r10"; | 
|  | $TY="%r11"; | 
|  |  | 
|  | $code.=<<___; | 
|  | .globl	RC4 | 
|  | .type	RC4,\@function | 
|  | .align	64 | 
|  | RC4: | 
|  | stm${g}	%r6,%r11,6*$SIZE_T($sp) | 
|  | ___ | 
|  | $code.=<<___ if ($flavour =~ /3[12]/); | 
|  | llgfr	$len,$len | 
|  | ___ | 
|  | $code.=<<___; | 
|  | llgc	$XX[0],0($key) | 
|  | llgc	$YY,1($key) | 
|  | la	$XX[0],1($XX[0]) | 
|  | nill	$XX[0],0xff | 
|  | srlg	$cnt,$len,3 | 
|  | ltgr	$cnt,$cnt | 
|  | llgc	$TX[0],2($XX[0],$key) | 
|  | jz	.Lshort | 
|  | j	.Loop8 | 
|  |  | 
|  | .align	64 | 
|  | .Loop8: | 
|  | ___ | 
|  | for ($i=0;$i<8;$i++) { | 
|  | $code.=<<___; | 
|  | la	$YY,0($YY,$TX[0])	# $i | 
|  | nill	$YY,255 | 
|  | la	$XX[1],1($XX[0]) | 
|  | nill	$XX[1],255 | 
|  | ___ | 
|  | $code.=<<___ if ($i==1); | 
|  | llgc	$acc,2($TY,$key) | 
|  | ___ | 
|  | $code.=<<___ if ($i>1); | 
|  | sllg	$acc,$acc,8 | 
|  | ic	$acc,2($TY,$key) | 
|  | ___ | 
|  | $code.=<<___; | 
|  | llgc	$TY,2($YY,$key) | 
|  | stc	$TX[0],2($YY,$key) | 
|  | llgc	$TX[1],2($XX[1],$key) | 
|  | stc	$TY,2($XX[0],$key) | 
|  | cr	$XX[1],$YY | 
|  | jne	.Lcmov$i | 
|  | la	$TX[1],0($TX[0]) | 
|  | .Lcmov$i: | 
|  | la	$TY,0($TY,$TX[0]) | 
|  | nill	$TY,255 | 
|  | ___ | 
|  | push(@TX,shift(@TX)); push(@XX,shift(@XX));     # "rotate" registers | 
|  | } | 
|  |  | 
|  | $code.=<<___; | 
|  | lg	$TX[1],0($inp) | 
|  | sllg	$acc,$acc,8 | 
|  | la	$inp,8($inp) | 
|  | ic	$acc,2($TY,$key) | 
|  | xgr	$acc,$TX[1] | 
|  | stg	$acc,0($out) | 
|  | la	$out,8($out) | 
|  | brctg	$cnt,.Loop8 | 
|  |  | 
|  | .Lshort: | 
|  | lghi	$acc,7 | 
|  | ngr	$len,$acc | 
|  | jz	.Lexit | 
|  | j	.Loop1 | 
|  |  | 
|  | .align	16 | 
|  | .Loop1: | 
|  | la	$YY,0($YY,$TX[0]) | 
|  | nill	$YY,255 | 
|  | llgc	$TY,2($YY,$key) | 
|  | stc	$TX[0],2($YY,$key) | 
|  | stc	$TY,2($XX[0],$key) | 
|  | ar	$TY,$TX[0] | 
|  | ahi	$XX[0],1 | 
|  | nill	$TY,255 | 
|  | nill	$XX[0],255 | 
|  | llgc	$acc,0($inp) | 
|  | la	$inp,1($inp) | 
|  | llgc	$TY,2($TY,$key) | 
|  | llgc	$TX[0],2($XX[0],$key) | 
|  | xr	$acc,$TY | 
|  | stc	$acc,0($out) | 
|  | la	$out,1($out) | 
|  | brct	$len,.Loop1 | 
|  |  | 
|  | .Lexit: | 
|  | ahi	$XX[0],-1 | 
|  | stc	$XX[0],0($key) | 
|  | stc	$YY,1($key) | 
|  | lm${g}	%r6,%r11,6*$SIZE_T($sp) | 
|  | br	$rp | 
|  | .size	RC4,.-RC4 | 
|  | .string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 
|  |  | 
|  | ___ | 
|  | } | 
|  |  | 
|  | # void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp) | 
|  | { | 
|  | $cnt="%r0"; | 
|  | $idx="%r1"; | 
|  | $key="%r2"; | 
|  | $len="%r3"; | 
|  | $inp="%r4"; | 
|  | $acc="%r5"; | 
|  | $dat="%r6"; | 
|  | $ikey="%r7"; | 
|  | $iinp="%r8"; | 
|  |  | 
|  | $code.=<<___; | 
|  | .globl	RC4_set_key | 
|  | .type	RC4_set_key,\@function | 
|  | .align	64 | 
|  | RC4_set_key: | 
|  | stm${g}	%r6,%r8,6*$SIZE_T($sp) | 
|  | lhi	$cnt,256 | 
|  | la	$idx,0(%r0) | 
|  | sth	$idx,0($key) | 
|  | .align	4 | 
|  | .L1stloop: | 
|  | stc	$idx,2($idx,$key) | 
|  | la	$idx,1($idx) | 
|  | brct	$cnt,.L1stloop | 
|  |  | 
|  | lghi	$ikey,-256 | 
|  | lr	$cnt,$len | 
|  | la	$iinp,0(%r0) | 
|  | la	$idx,0(%r0) | 
|  | .align	16 | 
|  | .L2ndloop: | 
|  | llgc	$acc,2+256($ikey,$key) | 
|  | llgc	$dat,0($iinp,$inp) | 
|  | la	$idx,0($idx,$acc) | 
|  | la	$ikey,1($ikey) | 
|  | la	$idx,0($idx,$dat) | 
|  | nill	$idx,255 | 
|  | la	$iinp,1($iinp) | 
|  | tml	$ikey,255 | 
|  | llgc	$dat,2($idx,$key) | 
|  | stc	$dat,2+256-1($ikey,$key) | 
|  | stc	$acc,2($idx,$key) | 
|  | jz	.Ldone | 
|  | brct	$cnt,.L2ndloop | 
|  | lr	$cnt,$len | 
|  | la	$iinp,0(%r0) | 
|  | j	.L2ndloop | 
|  | .Ldone: | 
|  | lm${g}	%r6,%r8,6*$SIZE_T($sp) | 
|  | br	$rp | 
|  | .size	RC4_set_key,.-RC4_set_key | 
|  |  | 
|  | ___ | 
|  | } | 
|  |  | 
|  | # const char *RC4_options() | 
|  | $code.=<<___; | 
|  | .globl	RC4_options | 
|  | .type	RC4_options,\@function | 
|  | .align	16 | 
|  | RC4_options: | 
|  | larl	%r2,.Loptions | 
|  | br	%r14 | 
|  | .size	RC4_options,.-RC4_options | 
|  | .section	.rodata | 
|  | .Loptions: | 
|  | .align	8 | 
|  | .string	"rc4(8x,char)" | 
|  | ___ | 
|  |  | 
|  | print $code; | 
|  | close STDOUT;	# force flush |