| #! /usr/bin/env perl |
| # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License 2.0 (the "License"). You may not use |
| # this file except in compliance with the License. You can obtain a copy |
| # in the file LICENSE in the source distribution or at |
| # https://www.openssl.org/source/license.html |
| |
| # |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| # ==================================================================== |
| # |
| # March 2010 |
| # |
| # The module implements the "4-bit" GCM GHASH function and the
| # underlying single multiplication operation in GF(2^128). "4-bit"
| # means that it uses a 256-byte per-key table [+128-byte shared table].
| # Even though loops are aggressively modulo-scheduled with respect to
| # references to Htbl and Z.hi updates for 8 cycles per byte, measured
| # performance is ~12 cycles per processed byte on a 21264 CPU. It
| # seems to be a dynamic scheduling "glitch," because uprofile(1)
| # indicates a uniform sample distribution, as if all instruction
| # bundles execute in 1.5 cycles. This means it could have been even
| # faster, yet 12 cycles is ~60% better than gcc-generated code and
| # ~80% better than code generated by the vendor compiler.
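| #
| # For documentation only, below is a minimal Perl model of the same
| # "4-bit" multiplication. It is a sketch that is never called by this
| # generator: it assumes a 64-bit perl and a hypothetical Htbl layout
| # of 16 [hi,lo] quadword pairs (the 256-byte per-key table mentioned
| # above). Like the assembly, it consumes the raw little-endian
| # quadwords and leaves the final byte swap to the caller.
| sub gf128_gmult_4bit_ref {
|     my ($Xhi, $Xlo, $Htbl) = @_;  # quadwords as loaded from Xi+0, Xi+8
|     my @rem_4bit = map { $_ << 48 }  # shared table, cf. rem_4bit below
|         (0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
|          0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);
|     my ($Zhi,$Zlo) = (0,0);  # first iteration reduces to Z = Htbl[nlo]
|     for my $half ($Xlo, $Xhi) {  # byte 7 of Xlo, i.e. Xi[15], first
|         for (my $cnt=7; $cnt>=0; $cnt--) {  # mirrors extbl half,cnt
|             my $byte = ($half >> 8*$cnt) & 0xff;
|             for my $n ($byte & 0x0f, $byte >> 4) {  # nlo, then nhi
|                 my $rem = $Zlo & 0x0f;  # nibble shifted out of Z.lo
|                 $Zlo = (($Zhi << 60) | ($Zlo >> 4)) & 0xFFFFFFFFFFFFFFFF;
|                 $Zhi = ($Zhi >> 4) ^ $rem_4bit[$rem];  # reduce
|                 $Zhi ^= $Htbl->[$n][0];  # Z ^= H*n
|                 $Zlo ^= $Htbl->[$n][1];
|             }
|         }
|     }
|     return ($Zhi, $Zlo);
| }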
| |
| $cnt="v0"; # $0 |
| $t0="t0"; |
| $t1="t1"; |
| $t2="t2"; |
| $Thi0="t3"; # $4 |
| $Tlo0="t4"; |
| $Thi1="t5"; |
| $Tlo1="t6"; |
| $rem="t7"; # $8 |
| ################# |
| $Xi="a0"; # $16, input argument block |
| $Htbl="a1"; |
| $inp="a2"; |
| $len="a3"; |
| $nlo="a4"; # $20 |
| $nhi="a5"; |
| $Zhi="t8"; |
| $Zlo="t9"; |
| $Xhi="t10"; # $24 |
| $Xlo="t11"; |
| $remp="t12"; |
| $rem_4bit="AT"; # $28 |
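| # rem_4bit lives in AT, the assembler temporary register, hence the
| # ".set noat" in the generated code.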
| |
| { my $N; |
| sub loop() { |
| |
| $N++; |
| $code.=<<___; |
| .align 4 |
| extbl $Xlo,7,$nlo |
| and $nlo,0xf0,$nhi |
| sll $nlo,4,$nlo |
| and $nlo,0xf0,$nlo |
| |
| addq $nlo,$Htbl,$nlo |
| ldq $Zlo,8($nlo) |
| addq $nhi,$Htbl,$nhi |
| ldq $Zhi,0($nlo) |
| |
| and $Zlo,0x0f,$remp |
| sll $Zhi,60,$t0 |
| lda $cnt,6(zero) |
| extbl $Xlo,6,$nlo |
| |
| ldq $Tlo1,8($nhi) |
| s8addq $remp,$rem_4bit,$remp |
| ldq $Thi1,0($nhi) |
| srl $Zlo,4,$Zlo |
| |
| ldq $rem,0($remp) |
| srl $Zhi,4,$Zhi |
| xor $t0,$Zlo,$Zlo |
| and $nlo,0xf0,$nhi |
| |
| xor $Tlo1,$Zlo,$Zlo |
| sll $nlo,4,$nlo |
| xor $Thi1,$Zhi,$Zhi |
| and $nlo,0xf0,$nlo |
| |
| addq $nlo,$Htbl,$nlo |
| ldq $Tlo0,8($nlo) |
| addq $nhi,$Htbl,$nhi |
| ldq $Thi0,0($nlo) |
| |
| .Looplo$N: |
| and $Zlo,0x0f,$remp |
| sll $Zhi,60,$t0 |
| subq $cnt,1,$cnt |
| srl $Zlo,4,$Zlo |
| |
| ldq $Tlo1,8($nhi) |
| xor $rem,$Zhi,$Zhi |
| ldq $Thi1,0($nhi) |
| s8addq $remp,$rem_4bit,$remp |
| |
| ldq $rem,0($remp) |
| srl $Zhi,4,$Zhi |
| xor $t0,$Zlo,$Zlo |
| extbl $Xlo,$cnt,$nlo |
| |
| and $nlo,0xf0,$nhi |
| xor $Thi0,$Zhi,$Zhi |
| xor $Tlo0,$Zlo,$Zlo |
| sll $nlo,4,$nlo |
| |
| |
| and $Zlo,0x0f,$remp |
| sll $Zhi,60,$t0 |
| and $nlo,0xf0,$nlo |
| srl $Zlo,4,$Zlo |
| |
| s8addq $remp,$rem_4bit,$remp |
| xor $rem,$Zhi,$Zhi |
| addq $nlo,$Htbl,$nlo |
| addq $nhi,$Htbl,$nhi |
| |
| ldq $rem,0($remp) |
| srl $Zhi,4,$Zhi |
| ldq $Tlo0,8($nlo) |
| xor $t0,$Zlo,$Zlo |
| |
| xor $Tlo1,$Zlo,$Zlo |
| xor $Thi1,$Zhi,$Zhi |
| ldq $Thi0,0($nlo) |
| bne $cnt,.Looplo$N |
| |
| |
| and $Zlo,0x0f,$remp |
| sll $Zhi,60,$t0 |
| lda $cnt,7(zero) |
| srl $Zlo,4,$Zlo |
| |
| ldq $Tlo1,8($nhi) |
| xor $rem,$Zhi,$Zhi |
| ldq $Thi1,0($nhi) |
| s8addq $remp,$rem_4bit,$remp |
| |
| ldq $rem,0($remp) |
| srl $Zhi,4,$Zhi |
| xor $t0,$Zlo,$Zlo |
| extbl $Xhi,$cnt,$nlo |
| |
| and $nlo,0xf0,$nhi |
| xor $Thi0,$Zhi,$Zhi |
| xor $Tlo0,$Zlo,$Zlo |
| sll $nlo,4,$nlo |
| |
| and $Zlo,0x0f,$remp |
| sll $Zhi,60,$t0 |
| and $nlo,0xf0,$nlo |
| srl $Zlo,4,$Zlo |
| |
| s8addq $remp,$rem_4bit,$remp |
| xor $rem,$Zhi,$Zhi |
| addq $nlo,$Htbl,$nlo |
| addq $nhi,$Htbl,$nhi |
| |
| ldq $rem,0($remp) |
| srl $Zhi,4,$Zhi |
| ldq $Tlo0,8($nlo) |
| xor $t0,$Zlo,$Zlo |
| |
| xor $Tlo1,$Zlo,$Zlo |
| xor $Thi1,$Zhi,$Zhi |
| ldq $Thi0,0($nlo) |
| unop |
| |
| |
| .Loophi$N: |
| and $Zlo,0x0f,$remp |
| sll $Zhi,60,$t0 |
| subq $cnt,1,$cnt |
| srl $Zlo,4,$Zlo |
| |
| ldq $Tlo1,8($nhi) |
| xor $rem,$Zhi,$Zhi |
| ldq $Thi1,0($nhi) |
| s8addq $remp,$rem_4bit,$remp |
| |
| ldq $rem,0($remp) |
| srl $Zhi,4,$Zhi |
| xor $t0,$Zlo,$Zlo |
| extbl $Xhi,$cnt,$nlo |
| |
| and $nlo,0xf0,$nhi |
| xor $Thi0,$Zhi,$Zhi |
| xor $Tlo0,$Zlo,$Zlo |
| sll $nlo,4,$nlo |
| |
| |
| and $Zlo,0x0f,$remp |
| sll $Zhi,60,$t0 |
| and $nlo,0xf0,$nlo |
| srl $Zlo,4,$Zlo |
| |
| s8addq $remp,$rem_4bit,$remp |
| xor $rem,$Zhi,$Zhi |
| addq $nlo,$Htbl,$nlo |
| addq $nhi,$Htbl,$nhi |
| |
| ldq $rem,0($remp) |
| srl $Zhi,4,$Zhi |
| ldq $Tlo0,8($nlo) |
| xor $t0,$Zlo,$Zlo |
| |
| xor $Tlo1,$Zlo,$Zlo |
| xor $Thi1,$Zhi,$Zhi |
| ldq $Thi0,0($nlo) |
| bne $cnt,.Loophi$N |
| |
| |
| and $Zlo,0x0f,$remp |
| sll $Zhi,60,$t0 |
| srl $Zlo,4,$Zlo |
| |
| ldq $Tlo1,8($nhi) |
| xor $rem,$Zhi,$Zhi |
| ldq $Thi1,0($nhi) |
| s8addq $remp,$rem_4bit,$remp |
| |
| ldq $rem,0($remp) |
| srl $Zhi,4,$Zhi |
| xor $t0,$Zlo,$Zlo |
| |
| xor $Tlo0,$Zlo,$Zlo |
| xor $Thi0,$Zhi,$Zhi |
| |
| and $Zlo,0x0f,$remp |
| sll $Zhi,60,$t0 |
| srl $Zlo,4,$Zlo |
| |
| s8addq $remp,$rem_4bit,$remp |
| xor $rem,$Zhi,$Zhi |
| |
| ldq $rem,0($remp) |
| srl $Zhi,4,$Zhi |
| xor $Tlo1,$Zlo,$Zlo |
| xor $Thi1,$Zhi,$Zhi |
| xor $t0,$Zlo,$Zlo |
| xor $rem,$Zhi,$Zhi |
| ___ |
| }} |
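| # loop() is instantiated twice, once per entry point below; $N only
| # keeps the .Looplo/.Loophi label names unique between the two copies.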
| |
| $code=<<___; |
| #ifdef __linux__ |
| #include <asm/regdef.h> |
| #else |
| #include <asm.h> |
| #include <regdef.h> |
| #endif |
| |
| .text |
| |
| .set noat |
| .set noreorder |
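| # void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
| # (calling convention as in crypto/modes/gcm128.c)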
| .globl gcm_gmult_4bit |
| .align 4 |
| .ent gcm_gmult_4bit |
| gcm_gmult_4bit: |
| .frame sp,0,ra |
| .prologue 0 |
| |
| ldq $Xlo,8($Xi) |
| ldq $Xhi,0($Xi) |
| |
| bsr $t0,picmeup |
| nop |
| ___ |
| |
| &loop(); |
| |
| $code.=<<___; |
| srl $Zlo,24,$t0 # byte swap |
| srl $Zlo,8,$t1 |
| |
| sll $Zlo,8,$t2 |
| sll $Zlo,24,$Zlo |
| zapnot $t0,0x11,$t0 |
| zapnot $t1,0x22,$t1 |
| |
| zapnot $Zlo,0x88,$Zlo |
| or $t0,$t1,$t0 |
| zapnot $t2,0x44,$t2 |
| |
| or $Zlo,$t0,$Zlo |
| srl $Zhi,24,$t0 |
| srl $Zhi,8,$t1 |
| |
| or $Zlo,$t2,$Zlo |
| sll $Zhi,8,$t2 |
| sll $Zhi,24,$Zhi |
| |
| srl $Zlo,32,$Xlo |
| sll $Zlo,32,$Zlo |
| |
| zapnot $t0,0x11,$t0 |
| zapnot $t1,0x22,$t1 |
| or $Zlo,$Xlo,$Xlo |
| |
| zapnot $Zhi,0x88,$Zhi |
| or $t0,$t1,$t0 |
| zapnot $t2,0x44,$t2 |
| |
| or $Zhi,$t0,$Zhi |
| or $Zhi,$t2,$Zhi |
| |
| srl $Zhi,32,$Xhi |
| sll $Zhi,32,$Zhi |
| |
| or $Zhi,$Xhi,$Xhi |
| stq $Xlo,8($Xi) |
| stq $Xhi,0($Xi) |
| |
| ret (ra) |
| .end gcm_gmult_4bit |
| ___ |
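| # The zapnot/sll/srl sequences above perform a 64-bit byte swap (the
| # masks 0x11/0x22/0x44/0x88 reverse the bytes within each 32-bit word,
| # and the final srl/sll/or exchanges the two words). For reference
| # only, a hypothetical Perl equivalent of one half; it is never called
| # here and assumes a 64-bit perl:
| sub bswap64_ref {
|     my ($v) = @_;
|     return unpack("Q", scalar reverse pack("Q", $v));  # reverse 8 bytes
| }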
| |
| $inhi="s0"; # $9, callee-saved
| $inlo="s1"; # $10, callee-saved
| |
| $code.=<<___; |
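| # void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
| #                     const u8 *inp, size_t len)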
| .globl gcm_ghash_4bit |
| .align 4 |
| .ent gcm_ghash_4bit |
| gcm_ghash_4bit: |
| lda sp,-32(sp) |
| stq ra,0(sp) |
| stq s0,8(sp) |
| stq s1,16(sp) |
| .mask 0x04000600,-32 |
| .frame sp,32,ra |
| .prologue 0 |
| |
| ldq_u $inhi,0($inp) |
| ldq_u $Thi0,7($inp) |
| ldq_u $inlo,8($inp) |
| ldq_u $Tlo0,15($inp) |
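| # the ldq_u pairs above together with extql/extqh at .Louter
| # implement unaligned quadword loads: the inp pointer is not
| # required to be 8-byte aligned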
| ldq $Xhi,0($Xi) |
| ldq $Xlo,8($Xi) |
| |
| bsr $t0,picmeup |
| nop |
| |
| .Louter: |
| extql $inhi,$inp,$inhi |
| extqh $Thi0,$inp,$Thi0 |
| or $inhi,$Thi0,$inhi |
| lda $inp,16($inp) |
| |
| extql $inlo,$inp,$inlo |
| extqh $Tlo0,$inp,$Tlo0 |
| or $inlo,$Tlo0,$inlo |
| subq $len,16,$len |
| |
| xor $Xlo,$inlo,$Xlo |
| xor $Xhi,$inhi,$Xhi |
| ___ |
| |
| &loop(); |
| |
| $code.=<<___; |
| srl $Zlo,24,$t0 # byte swap |
| srl $Zlo,8,$t1 |
| |
| sll $Zlo,8,$t2 |
| sll $Zlo,24,$Zlo |
| zapnot $t0,0x11,$t0 |
| zapnot $t1,0x22,$t1 |
| |
| zapnot $Zlo,0x88,$Zlo |
| or $t0,$t1,$t0 |
| zapnot $t2,0x44,$t2 |
| |
| or $Zlo,$t0,$Zlo |
| srl $Zhi,24,$t0 |
| srl $Zhi,8,$t1 |
| |
| or $Zlo,$t2,$Zlo |
| sll $Zhi,8,$t2 |
| sll $Zhi,24,$Zhi |
| |
| srl $Zlo,32,$Xlo |
| sll $Zlo,32,$Zlo |
| beq $len,.Ldone |
| |
| zapnot $t0,0x11,$t0 |
| zapnot $t1,0x22,$t1 |
| or $Zlo,$Xlo,$Xlo |
| ldq_u $inhi,0($inp) |
| |
| zapnot $Zhi,0x88,$Zhi |
| or $t0,$t1,$t0 |
| zapnot $t2,0x44,$t2 |
| ldq_u $Thi0,7($inp) |
| |
| or $Zhi,$t0,$Zhi |
| or $Zhi,$t2,$Zhi |
| ldq_u $inlo,8($inp) |
| ldq_u $Tlo0,15($inp) |
| |
| srl $Zhi,32,$Xhi |
| sll $Zhi,32,$Zhi |
| |
| or $Zhi,$Xhi,$Xhi |
| br zero,.Louter |
| |
| .Ldone: |
| zapnot $t0,0x11,$t0 |
| zapnot $t1,0x22,$t1 |
| or $Zlo,$Xlo,$Xlo |
| |
| zapnot $Zhi,0x88,$Zhi |
| or $t0,$t1,$t0 |
| zapnot $t2,0x44,$t2 |
| |
| or $Zhi,$t0,$Zhi |
| or $Zhi,$t2,$Zhi |
| |
| srl $Zhi,32,$Xhi |
| sll $Zhi,32,$Zhi |
| |
| or $Zhi,$Xhi,$Xhi |
| |
| stq $Xlo,8($Xi) |
| stq $Xhi,0($Xi) |
| |
| .set noreorder |
| /*ldq ra,0(sp)*/ |
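| # (ra was never clobbered: picmeup is entered via "bsr t0", so the
| # saved copy does not need to be reloaded)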
| ldq s0,8(sp) |
| ldq s1,16(sp) |
| lda sp,32(sp) |
| ret (ra) |
| .end gcm_ghash_4bit |
| |
| .align 4 |
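| # picmeup locates rem_4bit position-independently: br deposits the
| # address of .Lpic into AT, and rem_4bit itself sits 12 bytes past
| # .Lpic (lda + ret + nop)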
| .ent picmeup |
| picmeup: |
| .frame sp,0,$t0 |
| .prologue 0 |
| br $rem_4bit,.Lpic |
| .Lpic: lda $rem_4bit,12($rem_4bit) |
| ret ($t0) |
| .end picmeup |
| nop |
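| # reduction table: entry i is XORed into Z.hi each time nibble i is
| # shifted out of Z.lo; each 16-bit constant is stored pre-shifted
| # into bits 48-63 (low longword 0, high longword constant<<16)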
| rem_4bit: |
| .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16 |
| .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16 |
| .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16 |
| .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16 |
| .ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" |
| .align 4 |
| |
| ___ |
| $output=pop and open STDOUT,">$output"; |
| print $code; |
| close STDOUT or die "error closing STDOUT: $!"; |
| |