| #! /usr/bin/env perl |
| # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License 2.0 (the "License"). You may not use |
| # this file except in compliance with the License. You can obtain a copy |
| # in the file LICENSE in the source distribution or at |
| # https://www.openssl.org/source/license.html |
| |
| # |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| # ==================================================================== |
| # |
| # February 2012 |
| # |
| # The module implements bn_GF2m_mul_2x2 polynomial multiplication |
| # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from |
| # C for the time being... The subroutine runs in 37 cycles, which is |
| # 4.5x faster than compiler-generated code. Though the comparison is |
| # totally unfair, because this module utilizes the Galois Field Multiply |
| # instruction. |
| |
# Output redirection: the last command-line argument, when present and true,
# names the file that receives the generated assembly; otherwise the output
# stays on the inherited STDOUT.
$output = pop;
if ($output) {
    # Three-argument open keeps characters in the file name from being
    # interpreted as an open mode, and a failed open is fatal instead of
    # silently leaving the assembly on the original STDOUT.
    open STDOUT, '>', $output or die "can't open $output: $!";
}

($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");	# argument vector

# Scratch registers shared by the mul_1x1_* code emitters:
#   $Alo/$Ahi          low and high halfword of the first operand
#   $Alox0..$Alox3     partial products of $Alo with operand bytes 0..3
#   $Ahix0..$Ahix3     partial products of $Ahi with operand bytes 0..3
($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
# The four bytes of the second operand.
($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
# File-level operand registers; note that they are the very same registers
# as $Alo and $B_1, so code that splits $A/$B overwrites them in place.
($A,$B)=($Alo,$B_1);
# Register that the generated code loads with the 0xFF byte mask.
$xFF="B1";
| |
# mul_1x1_upper($A,$B) - append to $code the opening half of one polynomial
# multiplication of two registers.
#
#   $A, $B   names of the registers holding the two operands
#
# The emitted code splits $B into four bytes ($B_0..$B_3) and $A into two
# halfwords ($Alo,$Ahi), then issues the eight 16x8-bit XORMPY products into
# $Alox0..$Alox3 and $Ahix0..$Ahix3.  Nothing is combined here: folding the
# partial products into a result is left to the code that mul_1x1_merged or
# mul_1x1_lower emits afterwards.
#
# The lexical $A/$B declared here shadow the file-level variables of the same
# name.  The heredoc is a fixed instruction schedule ("||" joins an
# instruction to the preceding execute packet); do not reorder its lines.
| sub mul_1x1_upper { |
| my ($A,$B)=@_; |
| $code.=<<___; |
| EXTU $B,8,24,$B_2 ; smash $B to 4 bytes |
| || AND $B,$xFF,$B_0 |
| || SHRU $B,24,$B_3 |
| SHRU $A,16, $Ahi ; smash $A to two halfwords |
| || EXTU $A,16,16,$Alo |
| |
| XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits multiplication |
| || XORMPY $Ahi,$B_2,$Ahix2 |
| || EXTU $B,16,24,$B_1 |
| XORMPY $Alo,$B_0,$Alox0 |
| || XORMPY $Ahi,$B_0,$Ahix0 |
| XORMPY $Alo,$B_3,$Alox3 |
| || XORMPY $Ahi,$B_3,$Ahix3 |
| XORMPY $Alo,$B_1,$Alox1 |
| || XORMPY $Ahi,$B_1,$Ahix1 |
| ___ |
| } |
# mul_1x1_merged($OUTlo,$OUThi,$A,$B) - append to $code the finishing half of
# the multiplication already in flight, interleaved with the opening half of
# the next one.
#
#   $OUTlo,$OUThi   registers that receive the low and high word of the
#                   product whose partial products are in $Alox*/$Ahix*
#   $A, $B          registers holding the operands of the next multiplication
#
# The combining instructions (XOR/SHL/SHRU/MV) are the same ones that
# mul_1x1_lower emits, and the operand split plus the eight XORMPY products
# are the same ones that mul_1x1_upper emits; here both sequences share
# execute packets.  The new $Alo x $B_0 product is written to A1 and only
# moved into $Alox0 in the last packet, because the combining code still
# reads the previous $Alox0 after that XORMPY has been issued.
#
# NOTE(review): the schedule presumably relies on XORMPY delivering its
# result a few cycles after issue, so that the old partial products can still
# be read after their replacements were issued - confirm against the
# instruction's documented latency before changing the order of any line.
| sub mul_1x1_merged { |
| my ($OUTlo,$OUThi,$A,$B)=@_; |
| $code.=<<___; |
| EXTU $B,8,24,$B_2 ; smash $B to 4 bytes |
| || AND $B,$xFF,$B_0 |
| || SHRU $B,24,$B_3 |
| SHRU $A,16, $Ahi ; smash $A to two halfwords |
| || EXTU $A,16,16,$Alo |
| |
| XOR $Ahix0,$Alox2,$Ahix0 |
| || MV $Ahix2,$OUThi |
| || XORMPY $Alo,$B_2,$Alox2 |
| XORMPY $Ahi,$B_2,$Ahix2 |
| || EXTU $B,16,24,$B_1 |
| || XORMPY $Alo,$B_0,A1 ; $Alox0 |
| XOR $Ahix1,$Alox3,$Ahix1 |
| || SHL $Ahix0,16,$OUTlo |
| || SHRU $Ahix0,16,$Ahix0 |
| XOR $Alox0,$OUTlo,$OUTlo |
| || XOR $Ahix0,$OUThi,$OUThi |
| || XORMPY $Ahi,$B_0,$Ahix0 |
| || XORMPY $Alo,$B_3,$Alox3 |
| || SHL $Alox1,8,$Alox1 |
| || SHL $Ahix3,8,$Ahix3 |
| XOR $Alox1,$OUTlo,$OUTlo |
| || XOR $Ahix3,$OUThi,$OUThi |
| || XORMPY $Ahi,$B_3,$Ahix3 |
| || SHL $Ahix1,24,$Alox1 |
| || SHRU $Ahix1,8, $Ahix1 |
| XOR $Alox1,$OUTlo,$OUTlo |
| || XOR $Ahix1,$OUThi,$OUThi |
| || XORMPY $Alo,$B_1,$Alox1 |
| || XORMPY $Ahi,$B_1,$Ahix1 |
| || MV A1,$Alox0 |
| ___ |
| } |
# mul_1x1_lower($OUTlo,$OUThi) - append to $code the finishing half of a
# multiplication: fold the eight partial products left in $Alox0..$Alox3 and
# $Ahix0..$Ahix3 into a two-word result.
#
#   $OUTlo,$OUThi   registers that receive the low and high word of the result
#
# Values computed by the emitted instructions (assuming instructions joined
# by "||" read their sources before any of them writes its destination):
#   m02    = $Ahix0 ^ $Alox2
#   m13    = $Ahix1 ^ $Alox3
#   $OUTlo = $Alox0 ^ ($Alox1 << 8) ^ (m02 << 16) ^ (m13 << 24)
#   $OUThi = $Ahix2 ^ ($Ahix3 << 8) ^ (m02 >> 16) ^ (m13 >> 8)
# The scratch registers $Ahix0, $Ahix1, $Ahix3 and $Alox1 are overwritten.
#
# The first heredoc line is an assembly comment (";NOP"), not an instruction;
# the later NOP is a real instruction and part of the schedule.
| sub mul_1x1_lower { |
| my ($OUTlo,$OUThi)=@_; |
| $code.=<<___; |
| ;NOP |
| XOR $Ahix0,$Alox2,$Ahix0 |
| || MV $Ahix2,$OUThi |
| NOP |
| XOR $Ahix1,$Alox3,$Ahix1 |
| || SHL $Ahix0,16,$OUTlo |
| || SHRU $Ahix0,16,$Ahix0 |
| XOR $Alox0,$OUTlo,$OUTlo |
| || XOR $Ahix0,$OUThi,$OUThi |
| || SHL $Alox1,8,$Alox1 |
| || SHL $Ahix3,8,$Ahix3 |
| XOR $Alox1,$OUTlo,$OUTlo |
| || XOR $Ahix3,$OUThi,$OUThi |
| || SHL $Ahix1,24,$Alox1 |
| || SHRU $Ahix1,8, $Ahix1 |
| XOR $Alox1,$OUTlo,$OUTlo |
| || XOR $Ahix1,$OUThi,$OUThi |
| ___ |
| } |
# Main body: emit the bn_GF2m_mul_2x2 routine.  Three 1x1 multiplications are
# chained - a0·b0, a1·b1 and (a0+a1)·(b0+b1) - and combined Karatsuba-style
# into the four result words stored through $rp.
#
# A heredoc below that starts with "||" continues the last execute packet
# emitted by the preceding sub call, so the order of these Perl statements is
# part of the generated instruction schedule.
#
# Prologue: section and symbol directives, then load the 0xFF byte mask that
# the operand-splitting code ANDs with.
| $code.=<<___; |
| .text |
| |
| .if .ASSEMBLER_VERSION<7000000 |
| .asg 0,__TI_EABI__ |
| .endif |
| .if __TI_EABI__ |
| .asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2 |
| .endif |
| |
| .global _bn_GF2m_mul_2x2 |
| _bn_GF2m_mul_2x2: |
| .asmfunc |
| MVK 0xFF,$xFF |
| ___ |
| &mul_1x1_upper($a0,$b0); # a0·b0 |
# Copy a1/b1 into the file-level $A/$B, which are the same registers as
# $Alo and $B_1 and are split in place by the next emitter.
| $code.=<<___; |
| || MV $b1,$B |
| MV $a1,$A |
| ___ |
# Finish a0·b0 into A28 (low word) / B28 (high word) while starting a1·b1.
| &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1 |
# Form the operand sums a0+a1 and b0+b1 (XOR) for the third multiplication.
| $code.=<<___; |
| || XOR $b0,$b1,$B |
| XOR $a0,$a1,$A |
| ___ |
# Finish a1·b1 into A31/B31 while starting (a0+a1)·(b0+b1).
| &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1) |
| $code.=<<___; |
| XOR A28,A31,A29 |
| || XOR B28,B31,B29 ; a0·b0+a1·b1 |
| ___ |
# Finish the third product into A30/B30.
| &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1) |
# Branch to B3 and complete the combination; the words stored to
# rp[0..3] are A28, A30, A31 and B31.
# NOTE(review): B3 is presumably the return address and the five packets
# after BNOP presumably execute in its branch delay slots - confirm against
# the C64x+ documentation before adding or removing instructions here.
| $code.=<<___; |
| || BNOP B3 |
| XOR A29,A30,A30 |
| || XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1 |
| XOR B28,A30,A30 |
| || STW A28,*${rp}[0] |
| XOR B30,A31,A31 |
| || STW A30,*${rp}[1] |
| STW A31,*${rp}[2] |
| STW B31,*${rp}[3] |
| .endasmfunc |
| ___ |
| |
# Write out the accumulated assembly; close is checked so that an error
# reported when STDOUT is closed aborts the script.
| print $code; |
| close STDOUT or die "error closing STDOUT: $!"; |