|  | #! /usr/bin/env perl | 
|  | # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. | 
|  | # | 
|  | # Licensed under the OpenSSL license (the "License").  You may not use | 
|  | # this file except in compliance with the License.  You can obtain a copy | 
|  | # in the file LICENSE in the source distribution or at | 
|  | # https://www.openssl.org/source/license.html | 
|  |  | 
|  | # | 
|  | # ==================================================================== | 
|  | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | 
|  | # project. The module is, however, dual licensed under OpenSSL and | 
|  | # CRYPTOGAMS licenses depending on where you obtain it. For further | 
|  | # details see http://www.openssl.org/~appro/cryptogams/. | 
|  | # ==================================================================== | 
|  | # | 
|  | # March 2010 | 
|  | # | 
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+ a 128-byte shared
# table]. Even though the loops are aggressively modulo-scheduled
# with respect to Htbl references and Z.hi updates, for 8 cycles per
# processed byte, measured performance is ~12 cycles per processed
# byte on a 21264 CPU. This appears to be a dynamic scheduling
# "glitch," because uprofile(1) indicates a uniform sample
# distribution, as if all instruction bundles executed in 1.5 cycles;
# in other words it could have been even faster. Still, 12 cycles is
# ~60% better than gcc-generated code and ~80% better than code
# generated by the vendor compiler.
|  |  | 
|  | $cnt="v0";	# $0 | 
|  | $t0="t0"; | 
|  | $t1="t1"; | 
|  | $t2="t2"; | 
|  | $Thi0="t3";	# $4 | 
|  | $Tlo0="t4"; | 
|  | $Thi1="t5"; | 
|  | $Tlo1="t6"; | 
|  | $rem="t7";	# $8 | 
|  | ################# | 
|  | $Xi="a0";	# $16, input argument block | 
|  | $Htbl="a1"; | 
|  | $inp="a2"; | 
|  | $len="a3"; | 
|  | $nlo="a4";	# $20 | 
|  | $nhi="a5"; | 
|  | $Zhi="t8"; | 
|  | $Zlo="t9"; | 
|  | $Xhi="t10";	# $24 | 
|  | $Xlo="t11"; | 
|  | $remp="t12"; | 
|  | $rem_4bit="AT";	# $28 | 
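# Each Htbl entry is 16 bytes (hi half at offset 0, lo half at offset
# 8), so nibble indices are kept pre-multiplied by 16 and added to
# Htbl as plain byte offsets.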
|  |  | 
|  | { my $N; | 
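# Emit the body of one 16-byte GHASH multiplication; the closure
# variable $N keeps the emitted labels unique across the two call
# sites below.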
|  | sub loop() { | 
|  |  | 
|  | $N++; | 
|  | $code.=<<___; | 
|  | .align	4 | 
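# Prologue: split byte 7 of Xi.lo into two nibble indices (each
# pre-multiplied by 16) and start the first pair of table lookups,
# low nibble first.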
|  | extbl	$Xlo,7,$nlo | 
|  | and	$nlo,0xf0,$nhi | 
|  | sll	$nlo,4,$nlo | 
|  | and	$nlo,0xf0,$nlo | 
|  |  | 
|  | addq	$nlo,$Htbl,$nlo | 
|  | ldq	$Zlo,8($nlo) | 
|  | addq	$nhi,$Htbl,$nhi | 
|  | ldq	$Zhi,0($nlo) | 
|  |  | 
|  | and	$Zlo,0x0f,$remp | 
|  | sll	$Zhi,60,$t0 | 
|  | lda	$cnt,6(zero) | 
|  | extbl	$Xlo,6,$nlo | 
|  |  | 
|  | ldq	$Tlo1,8($nhi) | 
|  | s8addq	$remp,$rem_4bit,$remp | 
|  | ldq	$Thi1,0($nhi) | 
|  | srl	$Zlo,4,$Zlo | 
|  |  | 
|  | ldq	$rem,0($remp) | 
|  | srl	$Zhi,4,$Zhi | 
|  | xor	$t0,$Zlo,$Zlo | 
|  | and	$nlo,0xf0,$nhi | 
|  |  | 
|  | xor	$Tlo1,$Zlo,$Zlo | 
|  | sll	$nlo,4,$nlo | 
|  | xor	$Thi1,$Zhi,$Zhi | 
|  | and	$nlo,0xf0,$nlo | 
|  |  | 
|  | addq	$nlo,$Htbl,$nlo | 
|  | ldq	$Tlo0,8($nlo) | 
|  | addq	$nhi,$Htbl,$nhi | 
|  | ldq	$Thi0,0($nlo) | 
|  |  | 
|  | .Looplo$N: | 
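# Software-pipelined: each iteration folds in the table words fetched
# for the previous byte while extracting and looking up the next byte
# of Xi.lo.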
|  | and	$Zlo,0x0f,$remp | 
|  | sll	$Zhi,60,$t0 | 
|  | subq	$cnt,1,$cnt | 
|  | srl	$Zlo,4,$Zlo | 
|  |  | 
|  | ldq	$Tlo1,8($nhi) | 
|  | xor	$rem,$Zhi,$Zhi | 
|  | ldq	$Thi1,0($nhi) | 
|  | s8addq	$remp,$rem_4bit,$remp | 
|  |  | 
|  | ldq	$rem,0($remp) | 
|  | srl	$Zhi,4,$Zhi | 
|  | xor	$t0,$Zlo,$Zlo | 
|  | extbl	$Xlo,$cnt,$nlo | 
|  |  | 
|  | and	$nlo,0xf0,$nhi | 
|  | xor	$Thi0,$Zhi,$Zhi | 
|  | xor	$Tlo0,$Zlo,$Zlo | 
|  | sll	$nlo,4,$nlo | 
|  |  | 
|  |  | 
|  | and	$Zlo,0x0f,$remp | 
|  | sll	$Zhi,60,$t0 | 
|  | and	$nlo,0xf0,$nlo | 
|  | srl	$Zlo,4,$Zlo | 
|  |  | 
|  | s8addq	$remp,$rem_4bit,$remp | 
|  | xor	$rem,$Zhi,$Zhi | 
|  | addq	$nlo,$Htbl,$nlo | 
|  | addq	$nhi,$Htbl,$nhi | 
|  |  | 
|  | ldq	$rem,0($remp) | 
|  | srl	$Zhi,4,$Zhi | 
|  | ldq	$Tlo0,8($nlo) | 
|  | xor	$t0,$Zlo,$Zlo | 
|  |  | 
|  | xor	$Tlo1,$Zlo,$Zlo | 
|  | xor	$Thi1,$Zhi,$Zhi | 
|  | ldq	$Thi0,0($nlo) | 
|  | bne	$cnt,.Looplo$N | 
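# End of the Xi.lo loop; the block below drains the pipeline for its
# last byte and primes it with byte 7 of Xi.hi (counter reset to 7).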
|  |  | 
|  |  | 
|  | and	$Zlo,0x0f,$remp | 
|  | sll	$Zhi,60,$t0 | 
|  | lda	$cnt,7(zero) | 
|  | srl	$Zlo,4,$Zlo | 
|  |  | 
|  | ldq	$Tlo1,8($nhi) | 
|  | xor	$rem,$Zhi,$Zhi | 
|  | ldq	$Thi1,0($nhi) | 
|  | s8addq	$remp,$rem_4bit,$remp | 
|  |  | 
|  | ldq	$rem,0($remp) | 
|  | srl	$Zhi,4,$Zhi | 
|  | xor	$t0,$Zlo,$Zlo | 
|  | extbl	$Xhi,$cnt,$nlo | 
|  |  | 
|  | and	$nlo,0xf0,$nhi | 
|  | xor	$Thi0,$Zhi,$Zhi | 
|  | xor	$Tlo0,$Zlo,$Zlo | 
|  | sll	$nlo,4,$nlo | 
|  |  | 
|  | and	$Zlo,0x0f,$remp | 
|  | sll	$Zhi,60,$t0 | 
|  | and	$nlo,0xf0,$nlo | 
|  | srl	$Zlo,4,$Zlo | 
|  |  | 
|  | s8addq	$remp,$rem_4bit,$remp | 
|  | xor	$rem,$Zhi,$Zhi | 
|  | addq	$nlo,$Htbl,$nlo | 
|  | addq	$nhi,$Htbl,$nhi | 
|  |  | 
|  | ldq	$rem,0($remp) | 
|  | srl	$Zhi,4,$Zhi | 
|  | ldq	$Tlo0,8($nlo) | 
|  | xor	$t0,$Zlo,$Zlo | 
|  |  | 
|  | xor	$Tlo1,$Zlo,$Zlo | 
|  | xor	$Thi1,$Zhi,$Zhi | 
|  | ldq	$Thi0,0($nlo) | 
|  | unop | 
|  |  | 
|  |  | 
|  | .Loophi$N: | 
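# Same pipelined loop, now over the remaining bytes of Xi.hi.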
|  | and	$Zlo,0x0f,$remp | 
|  | sll	$Zhi,60,$t0 | 
|  | subq	$cnt,1,$cnt | 
|  | srl	$Zlo,4,$Zlo | 
|  |  | 
|  | ldq	$Tlo1,8($nhi) | 
|  | xor	$rem,$Zhi,$Zhi | 
|  | ldq	$Thi1,0($nhi) | 
|  | s8addq	$remp,$rem_4bit,$remp | 
|  |  | 
|  | ldq	$rem,0($remp) | 
|  | srl	$Zhi,4,$Zhi | 
|  | xor	$t0,$Zlo,$Zlo | 
|  | extbl	$Xhi,$cnt,$nlo | 
|  |  | 
|  | and	$nlo,0xf0,$nhi | 
|  | xor	$Thi0,$Zhi,$Zhi | 
|  | xor	$Tlo0,$Zlo,$Zlo | 
|  | sll	$nlo,4,$nlo | 
|  |  | 
|  |  | 
|  | and	$Zlo,0x0f,$remp | 
|  | sll	$Zhi,60,$t0 | 
|  | and	$nlo,0xf0,$nlo | 
|  | srl	$Zlo,4,$Zlo | 
|  |  | 
|  | s8addq	$remp,$rem_4bit,$remp | 
|  | xor	$rem,$Zhi,$Zhi | 
|  | addq	$nlo,$Htbl,$nlo | 
|  | addq	$nhi,$Htbl,$nhi | 
|  |  | 
|  | ldq	$rem,0($remp) | 
|  | srl	$Zhi,4,$Zhi | 
|  | ldq	$Tlo0,8($nlo) | 
|  | xor	$t0,$Zlo,$Zlo | 
|  |  | 
|  | xor	$Tlo1,$Zlo,$Zlo | 
|  | xor	$Thi1,$Zhi,$Zhi | 
|  | ldq	$Thi0,0($nlo) | 
|  | bne	$cnt,.Loophi$N | 
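# End of the Xi.hi loop; the tail below folds in the final byte's
# lookups and performs the last two reduction steps.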
|  |  | 
|  |  | 
|  | and	$Zlo,0x0f,$remp | 
|  | sll	$Zhi,60,$t0 | 
|  | srl	$Zlo,4,$Zlo | 
|  |  | 
|  | ldq	$Tlo1,8($nhi) | 
|  | xor	$rem,$Zhi,$Zhi | 
|  | ldq	$Thi1,0($nhi) | 
|  | s8addq	$remp,$rem_4bit,$remp | 
|  |  | 
|  | ldq	$rem,0($remp) | 
|  | srl	$Zhi,4,$Zhi | 
|  | xor	$t0,$Zlo,$Zlo | 
|  |  | 
|  | xor	$Tlo0,$Zlo,$Zlo | 
|  | xor	$Thi0,$Zhi,$Zhi | 
|  |  | 
|  | and	$Zlo,0x0f,$remp | 
|  | sll	$Zhi,60,$t0 | 
|  | srl	$Zlo,4,$Zlo | 
|  |  | 
|  | s8addq	$remp,$rem_4bit,$remp | 
|  | xor	$rem,$Zhi,$Zhi | 
|  |  | 
|  | ldq	$rem,0($remp) | 
|  | srl	$Zhi,4,$Zhi | 
|  | xor	$Tlo1,$Zlo,$Zlo | 
|  | xor	$Thi1,$Zhi,$Zhi | 
|  | xor	$t0,$Zlo,$Zlo | 
|  | xor	$rem,$Zhi,$Zhi | 
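# Z.hi:Z.lo now holds the 128-bit product Xi*H.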
|  | ___ | 
|  | }} | 
|  |  | 
|  | $code=<<___; | 
|  | #ifdef __linux__ | 
|  | #include <asm/regdef.h> | 
|  | #else | 
|  | #include <asm.h> | 
|  | #include <regdef.h> | 
|  | #endif | 
|  |  | 
|  | .text | 
|  |  | 
|  | .set	noat | 
|  | .set	noreorder | 
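# void gcm_gmult_4bit(Xi, Htbl): Xi = Xi*H, a single GF(2^128)
# multiplication using the per-key 4-bit table.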
|  | .globl	gcm_gmult_4bit | 
|  | .align	4 | 
|  | .ent	gcm_gmult_4bit | 
|  | gcm_gmult_4bit: | 
|  | .frame	sp,0,ra | 
|  | .prologue 0 | 
|  |  | 
|  | ldq	$Xlo,8($Xi) | 
|  | ldq	$Xhi,0($Xi) | 
|  |  | 
bsr	$t0,picmeup		# returns address of rem_4bit in AT
|  | nop | 
|  | ___ | 
|  |  | 
|  | &loop(); | 
|  |  | 
|  | $code.=<<___; | 
|  | srl	$Zlo,24,$t0	# byte swap | 
|  | srl	$Zlo,8,$t1 | 
|  |  | 
|  | sll	$Zlo,8,$t2 | 
|  | sll	$Zlo,24,$Zlo | 
|  | zapnot	$t0,0x11,$t0 | 
|  | zapnot	$t1,0x22,$t1 | 
|  |  | 
|  | zapnot	$Zlo,0x88,$Zlo | 
|  | or	$t0,$t1,$t0 | 
|  | zapnot	$t2,0x44,$t2 | 
|  |  | 
|  | or	$Zlo,$t0,$Zlo | 
|  | srl	$Zhi,24,$t0 | 
|  | srl	$Zhi,8,$t1 | 
|  |  | 
|  | or	$Zlo,$t2,$Zlo | 
|  | sll	$Zhi,8,$t2 | 
|  | sll	$Zhi,24,$Zhi | 
|  |  | 
|  | srl	$Zlo,32,$Xlo | 
|  | sll	$Zlo,32,$Zlo | 
|  |  | 
|  | zapnot	$t0,0x11,$t0 | 
|  | zapnot	$t1,0x22,$t1 | 
|  | or	$Zlo,$Xlo,$Xlo | 
|  |  | 
|  | zapnot	$Zhi,0x88,$Zhi | 
|  | or	$t0,$t1,$t0 | 
|  | zapnot	$t2,0x44,$t2 | 
|  |  | 
|  | or	$Zhi,$t0,$Zhi | 
|  | or	$Zhi,$t2,$Zhi | 
|  |  | 
|  | srl	$Zhi,32,$Xhi | 
|  | sll	$Zhi,32,$Zhi | 
|  |  | 
|  | or	$Zhi,$Xhi,$Xhi | 
|  | stq	$Xlo,8($Xi) | 
|  | stq	$Xhi,0($Xi) | 
|  |  | 
|  | ret	(ra) | 
|  | .end	gcm_gmult_4bit | 
|  | ___ | 
|  |  | 
|  | $inhi="s0"; | 
|  | $inlo="s1"; | 
|  |  | 
|  | $code.=<<___; | 
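# void gcm_ghash_4bit(Xi, Htbl, inp, len): for each 16-byte block,
# Xi = (Xi ^ block) * H; len is assumed to be a multiple of 16.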
|  | .globl	gcm_ghash_4bit | 
|  | .align	4 | 
|  | .ent	gcm_ghash_4bit | 
|  | gcm_ghash_4bit: | 
|  | lda	sp,-32(sp) | 
|  | stq	ra,0(sp) | 
|  | stq	s0,8(sp) | 
|  | stq	s1,16(sp) | 
|  | .mask	0x04000600,-32 | 
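# saved registers: ra (bit 26) at 0(sp), s0 (bit 9) at 8(sp),
# s1 (bit 10) at 16(sp)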
|  | .frame	sp,32,ra | 
|  | .prologue 0 | 
|  |  | 
|  | ldq_u	$inhi,0($inp) | 
|  | ldq_u	$Thi0,7($inp) | 
|  | ldq_u	$inlo,8($inp) | 
|  | ldq_u	$Tlo0,15($inp) | 
|  | ldq	$Xhi,0($Xi) | 
|  | ldq	$Xlo,8($Xi) | 
|  |  | 
bsr	$t0,picmeup		# returns address of rem_4bit in AT
|  | nop | 
|  |  | 
|  | .Louter: | 
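# Gather a possibly misaligned 16-byte block with the canonical
# ldq_u/extql/extqh sequence and xor it into Xi.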
|  | extql	$inhi,$inp,$inhi | 
|  | extqh	$Thi0,$inp,$Thi0 | 
|  | or	$inhi,$Thi0,$inhi | 
|  | lda	$inp,16($inp) | 
|  |  | 
|  | extql	$inlo,$inp,$inlo | 
|  | extqh	$Tlo0,$inp,$Tlo0 | 
|  | or	$inlo,$Tlo0,$inlo | 
|  | subq	$len,16,$len | 
|  |  | 
|  | xor	$Xlo,$inlo,$Xlo | 
|  | xor	$Xhi,$inhi,$Xhi | 
|  | ___ | 
|  |  | 
|  | &loop(); | 
|  |  | 
|  | $code.=<<___; | 
|  | srl	$Zlo,24,$t0	# byte swap | 
|  | srl	$Zlo,8,$t1 | 
|  |  | 
|  | sll	$Zlo,8,$t2 | 
|  | sll	$Zlo,24,$Zlo | 
|  | zapnot	$t0,0x11,$t0 | 
|  | zapnot	$t1,0x22,$t1 | 
|  |  | 
|  | zapnot	$Zlo,0x88,$Zlo | 
|  | or	$t0,$t1,$t0 | 
|  | zapnot	$t2,0x44,$t2 | 
|  |  | 
|  | or	$Zlo,$t0,$Zlo | 
|  | srl	$Zhi,24,$t0 | 
|  | srl	$Zhi,8,$t1 | 
|  |  | 
|  | or	$Zlo,$t2,$Zlo | 
|  | sll	$Zhi,8,$t2 | 
|  | sll	$Zhi,24,$Zhi | 
|  |  | 
|  | srl	$Zlo,32,$Xlo | 
|  | sll	$Zlo,32,$Zlo | 
|  | beq	$len,.Ldone | 
|  |  | 
|  | zapnot	$t0,0x11,$t0 | 
|  | zapnot	$t1,0x22,$t1 | 
|  | or	$Zlo,$Xlo,$Xlo | 
|  | ldq_u	$inhi,0($inp) | 
|  |  | 
|  | zapnot	$Zhi,0x88,$Zhi | 
|  | or	$t0,$t1,$t0 | 
|  | zapnot	$t2,0x44,$t2 | 
|  | ldq_u	$Thi0,7($inp) | 
|  |  | 
|  | or	$Zhi,$t0,$Zhi | 
|  | or	$Zhi,$t2,$Zhi | 
|  | ldq_u	$inlo,8($inp) | 
|  | ldq_u	$Tlo0,15($inp) | 
|  |  | 
|  | srl	$Zhi,32,$Xhi | 
|  | sll	$Zhi,32,$Zhi | 
|  |  | 
|  | or	$Zhi,$Xhi,$Xhi | 
|  | br	zero,.Louter | 
|  |  | 
|  | .Ldone: | 
|  | zapnot	$t0,0x11,$t0 | 
|  | zapnot	$t1,0x22,$t1 | 
|  | or	$Zlo,$Xlo,$Xlo | 
|  |  | 
|  | zapnot	$Zhi,0x88,$Zhi | 
|  | or	$t0,$t1,$t0 | 
|  | zapnot	$t2,0x44,$t2 | 
|  |  | 
|  | or	$Zhi,$t0,$Zhi | 
|  | or	$Zhi,$t2,$Zhi | 
|  |  | 
|  | srl	$Zhi,32,$Xhi | 
|  | sll	$Zhi,32,$Zhi | 
|  |  | 
|  | or	$Zhi,$Xhi,$Xhi | 
|  |  | 
|  | stq	$Xlo,8($Xi) | 
|  | stq	$Xhi,0($Xi) | 
|  |  | 
|  | .set	noreorder | 
/*ldq	ra,0(sp)*/	# ra was saved but never clobbered (picmeup links through t0)
|  | ldq	s0,8(sp) | 
|  | ldq	s1,16(sp) | 
|  | lda	sp,32(sp) | 
|  | ret	(ra) | 
|  | .end	gcm_ghash_4bit | 
|  |  | 
|  | .align	4 | 
|  | .ent	picmeup | 
|  | picmeup: | 
|  | .frame	sp,0,$t0 | 
|  | .prologue 0 | 
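# br deposits the address of .Lpic in the AT register; the lda then
# steps over the 12 bytes (lda, ret, nop: three 4-byte instructions)
# separating .Lpic from the rem_4bit table below.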
|  | br	$rem_4bit,.Lpic | 
|  | .Lpic:	lda	$rem_4bit,12($rem_4bit) | 
|  | ret	($t0) | 
|  | .end	picmeup | 
|  | nop | 
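# Shared reduction table, one quadword per nibble value; each 16-bit
# constant lands in the top bits of its little-endian quadword (the
# second .long of each pair, shifted left by a further 16 bits).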
|  | rem_4bit: | 
|  | .long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16 | 
|  | .long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16 | 
|  | .long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16 | 
|  | .long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16 | 
|  | .ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | 
|  | .align	4 | 
|  |  | 
|  | ___ | 
|  | $output=pop and open STDOUT,">$output"; | 
|  | print $code; | 
close STDOUT or die "error closing STDOUT: $!";
|  |  |