crypto/bn/asm/c64xplus-gf2m.pl - third_party/openssl - Git at Google

 #! /usr/bin/env perl
 # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html

 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # February 2012
 #
 # The module implements bn_GF2m_mul_2x2 polynomial multiplication
 # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
 # C for the time being... The subroutine runs in 37 cycles, which is
 # 4.5x faster than compiler-generated code. Though the comparison is
 # totally unfair, because this module utilizes the Galois Field
 # Multiply instruction.

 # The last command-line argument, if present (and true), names the output
 # file; STDOUT is redirected to it so that the final "print $code" lands
 # there.  With no argument the generated code goes to the inherited STDOUT.
 # Three-argument open keeps characters in the file name from being parsed
 # as an open mode, and a failed open is reported at once with the file name.
 $output = pop;
 if ($output) {
     open STDOUT, '>', $output or die "can't open $output: $!";
 }

 # Registers carrying the incoming arguments (the argument vector).  $rp is
 # the pointer through which the four result words are stored.
 ($rp, $a1, $a0, $b1, $b0) = qw(A4 B4 A6 B6 A8);

 # Working registers: the two halfwords of the multiplicand followed by the
 # four partial products built from each, A16..A20 and B16..B20.
 ($Alo, $Alox0, $Alox1, $Alox2, $Alox3) = map { "A$_" } 16 .. 20;
 ($Ahi, $Ahix0, $Ahix1, $Ahix2, $Ahix3) = map { "B$_" } 16 .. 20;

 # Registers receiving the four bytes of the multiplier.
 ($B_0, $B_1, $B_2, $B_3) = qw(B5 A5 A7 B7);

 # Staging registers for the operands of the 2nd and 3rd multiplication;
 # they are the same registers as $Alo and $B_1.
 ($A, $B) = ($Alo, $B_1);

 # Register that is loaded with the 0xFF byte mask.
 $xFF = 'B1';

 # mul_1x1_upper($A,$B) - append to $code the first stage of one 32x32-bit
 # polynomial multiplication of the registers named by $A and $B.
 #
 # The template splits $B into four bytes ($B_0..$B_3) and $A into two
 # halfwords ($Alo,$Ahi), then issues eight XORMPY instructions that leave
 # the 16x8-bit partial products in $Alox0..$Alox3 and $Ahix0..$Ahix3.
 # Nothing is combined here; the shift/XOR stage is emitted by
 # mul_1x1_merged or mul_1x1_lower.
 #
 # Returns nothing useful; the only effect is the text appended to $code.
 sub mul_1x1_upper {
 # Lexical $A/$B shadow the file-level variables of the same name inside
 # the template below.
 my ($A,$B)=@_;
 $code.=<<___;
 	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
 ||	AND	$B,$xFF,$B_0
 ||	SHRU	$B,24,$B_3
 	SHRU	$A,16,   $Ahi		; smash $A to two halfwords
 ||	EXTU	$A,16,16,$Alo

 	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits multiplication
 ||	XORMPY	$Ahi,$B_2,$Ahix2
 ||	EXTU	$B,16,24,$B_1
 	XORMPY	$Alo,$B_0,$Alox0
 ||	XORMPY	$Ahi,$B_0,$Ahix0
 	XORMPY	$Alo,$B_3,$Alox3
 ||	XORMPY	$Ahi,$B_3,$Ahix3
 	XORMPY	$Alo,$B_1,$Alox1
 ||	XORMPY	$Ahi,$B_1,$Ahix1
 ___
 }
 # mul_1x1_merged($OUTlo,$OUThi,$A,$B) - append to $code the second stage
 # of the multiplication already in flight, interleaved with the first
 # stage of the next one.
 #
 # The XOR/SHL/SHRU instructions and the MV into $OUThi fold the partial
 # products currently held in $Alox0..3/$Ahix0..3 into the register pair
 # $OUThi:$OUTlo; it is the same instruction sequence mul_1x1_lower emits.
 # The instructions written with one extra space of indentation split the
 # new operands $A and $B and issue the XORMPYs for the next product, like
 # mul_1x1_upper.
 #
 # The $Alo x $B_0 product is written to A1 rather than $Alox0 and is only
 # moved into $Alox0 by the final MV, because the previous $Alox0 is still
 # read by the folding code in between.
 #
 # Returns nothing useful; the only effect is the text appended to $code.
 sub mul_1x1_merged {
 # Lexical $A/$B shadow the file-level variables of the same name inside
 # the template below.
 my ($OUTlo,$OUThi,$A,$B)=@_;
 $code.=<<___;
 	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
 ||	 AND	$B,$xFF,$B_0
 ||	 SHRU	$B,24,$B_3
 	 SHRU	$A,16,   $Ahi		; smash $A to two halfwords
 ||	 EXTU	$A,16,16,$Alo

 	XOR	$Ahix0,$Alox2,$Ahix0
 ||	MV	$Ahix2,$OUThi
 ||	 XORMPY	$Alo,$B_2,$Alox2
 	 XORMPY	$Ahi,$B_2,$Ahix2
 ||	 EXTU	$B,16,24,$B_1
 ||	 XORMPY	$Alo,$B_0,A1		; $Alox0
 	XOR	$Ahix1,$Alox3,$Ahix1
 ||	SHL	$Ahix0,16,$OUTlo
 ||	SHRU	$Ahix0,16,$Ahix0
 	XOR	$Alox0,$OUTlo,$OUTlo
 ||	XOR	$Ahix0,$OUThi,$OUThi
 ||	 XORMPY	$Ahi,$B_0,$Ahix0
 ||	 XORMPY	$Alo,$B_3,$Alox3
 ||	SHL	$Alox1,8,$Alox1
 ||	SHL	$Ahix3,8,$Ahix3
 	XOR	$Alox1,$OUTlo,$OUTlo
 ||	XOR	$Ahix3,$OUThi,$OUThi
 ||	 XORMPY	$Ahi,$B_3,$Ahix3
 ||	SHL	$Ahix1,24,$Alox1
 ||	SHRU	$Ahix1,8, $Ahix1
 	XOR	$Alox1,$OUTlo,$OUTlo
 ||	XOR	$Ahix1,$OUThi,$OUThi
 ||	 XORMPY	$Alo,$B_1,$Alox1
 ||	 XORMPY	$Ahi,$B_1,$Ahix1
 ||	 MV	A1,$Alox0
 ___
 }
 # mul_1x1_lower($OUTlo,$OUThi) - append to $code the final stage of a
 # multiplication: fold the eight partial products left in
 # $Alox0..3/$Ahix0..3 by the XORMPY stage into the register pair
 # $OUThi:$OUTlo using shifts and XORs.  No new multiplication is started.
 #
 # The first template line, ";NOP", begins with the ';' that the other
 # templates use to introduce assembly comments, so it emits a comment, not
 # an instruction; the plain NOP further down is a real instruction.
 #
 # Returns nothing useful; the only effect is the text appended to $code.
 sub mul_1x1_lower {
 my ($OUTlo,$OUThi)=@_;
 $code.=<<___;
 	;NOP
 	XOR	$Ahix0,$Alox2,$Ahix0
 ||	MV	$Ahix2,$OUThi
 	NOP
 	XOR	$Ahix1,$Alox3,$Ahix1
 ||	SHL	$Ahix0,16,$OUTlo
 ||	SHRU	$Ahix0,16,$Ahix0
 	XOR	$Alox0,$OUTlo,$OUTlo
 ||	XOR	$Ahix0,$OUThi,$OUThi
 ||	SHL	$Alox1,8,$Alox1
 ||	SHL	$Ahix3,8,$Ahix3
 	XOR	$Alox1,$OUTlo,$OUTlo
 ||	XOR	$Ahix3,$OUThi,$OUThi
 ||	SHL	$Ahix1,24,$Alox1
 ||	SHRU	$Ahix1,8, $Ahix1
 	XOR	$Alox1,$OUTlo,$OUTlo
 ||	XOR	$Ahix1,$OUThi,$OUThi
 ___
 }
 # Emit the bn_GF2m_mul_2x2 entry point.  The product is assembled
 # Karatsuba-style from three 32x32-bit polynomial products: a0·b0, a1·b1
 # and (a0+a1)·(b0+b1), where "+" is XOR.
 #
 # Prologue: when __TI_EABI__ is non-zero the underscore-prefixed label is
 # aliased to the plain name bn_GF2m_mul_2x2; for assembler versions below
 # 7000000 __TI_EABI__ is first forced to 0.  $xFF is loaded with 0xFF.
 $code.=<<___;
 	.text

 	.if	.ASSEMBLER_VERSION<7000000
 	.asg	0,__TI_EABI__
 	.endif
 	.if	__TI_EABI__
 	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
 	.endif

 	.global	_bn_GF2m_mul_2x2
 _bn_GF2m_mul_2x2:
 	.asmfunc
 	MVK	0xFF,$xFF
 ___
 	&mul_1x1_upper($a0,$b0);		# a0·b0
 # Copy a1/b1 into the staging registers $A/$B for the next product.  The
 # template starts with "||", so its first MV is attached to the last
 # instruction line emitted by the call above.
 $code.=<<___;
 ||	MV	$b1,$B
 	MV	$a1,$A
 ___
 # Finish a0·b0 into B28:A28 while starting a1·b1.
 	&mul_1x1_merged("A28","B28",$A,$B);	# a0·b0/a1·b1
 # Form b0+b1 and a0+a1 in the staging registers for the third product.
 $code.=<<___;
 ||	XOR	$b0,$b1,$B
 	XOR	$a0,$a1,$A
 ___
 # Finish a1·b1 into B31:A31 while starting (a0+a1)·(b0+b1).
 	&mul_1x1_merged("A31","B31",$A,$B);	# a1·b1/(a0+a1)·(b0+b1)
 # B29:A29 = a0·b0 + a1·b1, to be folded into the middle term below.
 $code.=<<___;
 	XOR	A28,A31,A29
 ||	XOR	B28,B31,B29			; a0·b0+a1·b1
 ___
 # Finish (a0+a1)·(b0+b1) into B30:A30.
 	&mul_1x1_lower("A30","B30");		# (a0+a1)·(b0+b1)
 # Epilogue: BNOP B3 branches to the address in B3 (presumably the caller's
 # return address - confirm against the C6000 calling convention).  The
 # middle term B30:A30 is completed and XORed into the overlapping words,
 # and the four result words are stored: r[0]=A28, r[1]=A30, r[2]=A31,
 # r[3]=B31.
 $code.=<<___;
 ||	BNOP	B3
 	XOR	A29,A30,A30
 ||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
 	XOR	B28,A30,A30
 ||	STW	A28,*${rp}[0]
 	XOR	B30,A31,A31
 ||	STW	A30,*${rp}[1]
 	STW	A31,*${rp}[2]
 	STW	B31,*${rp}[3]
 	.endasmfunc
 ___

 # Write out the accumulated assembly; close is checked because buffered
 # write errors only show up there.
 print $code;
 close STDOUT or die "error closing STDOUT: $!";
	#! /usr/bin/env perl
	# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
	#
	# Licensed under the Apache License 2.0 (the "License"). You may not use
	# this file except in compliance with the License. You can obtain a copy
	# in the file LICENSE in the source distribution or at
	# https://www.openssl.org/source/license.html

	#
	# ====================================================================
	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
	# project. The module is, however, dual licensed under OpenSSL and
	# CRYPTOGAMS licenses depending on where you obtain it. For further
	# details see http://www.openssl.org/~appro/cryptogams/.
	# ====================================================================
	#
	# February 2012
	#
	# The module implements bn_GF2m_mul_2x2 polynomial multiplication
	# used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
	# C for the time being... The subroutine runs in 37 cycles, which is
	# 4.5x faster than compiler-generated code. Though the comparison is
	# totally unfair, because this module utilizes the Galois Field
	# Multiply instruction.

	# The last command-line argument, if present (and true), names the output
	# file; STDOUT is redirected to it so that the final "print $code" lands
	# there.  With no argument the generated code goes to the inherited STDOUT.
	# Three-argument open keeps characters in the file name from being parsed
	# as an open mode, and a failed open is reported at once with the file name.
	$output = pop;
	if ($output) {
	    open STDOUT, '>', $output or die "can't open $output: $!";
	}

	# Registers carrying the incoming arguments (the argument vector).  $rp is
	# the pointer through which the four result words are stored.
	($rp, $a1, $a0, $b1, $b0) = qw(A4 B4 A6 B6 A8);

	# Working registers: the two halfwords of the multiplicand followed by the
	# four partial products built from each, A16..A20 and B16..B20.
	($Alo, $Alox0, $Alox1, $Alox2, $Alox3) = map { "A$_" } 16 .. 20;
	($Ahi, $Ahix0, $Ahix1, $Ahix2, $Ahix3) = map { "B$_" } 16 .. 20;

	# Registers receiving the four bytes of the multiplier.
	($B_0, $B_1, $B_2, $B_3) = qw(B5 A5 A7 B7);

	# Staging registers for the operands of the 2nd and 3rd multiplication;
	# they are the same registers as $Alo and $B_1.
	($A, $B) = ($Alo, $B_1);

	# Register that is loaded with the 0xFF byte mask.
	$xFF = 'B1';

	# mul_1x1_upper($A,$B) - append to $code the first stage of one 32x32-bit
	# polynomial multiplication of the registers named by $A and $B.
	#
	# The template splits $B into four bytes ($B_0..$B_3) and $A into two
	# halfwords ($Alo,$Ahi), then issues eight XORMPY instructions that leave
	# the 16x8-bit partial products in $Alox0..$Alox3 and $Ahix0..$Ahix3.
	# Nothing is combined here; the shift/XOR stage is emitted by
	# mul_1x1_merged or mul_1x1_lower.
	#
	# Returns nothing useful; the only effect is the text appended to $code.
	sub mul_1x1_upper {
	# Lexical $A/$B shadow the file-level variables of the same name inside
	# the template below.
	my ($A,$B)=@_;
	$code.=<<___;
	EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
	\|\| AND $B,$xFF,$B_0
	\|\| SHRU $B,24,$B_3
	SHRU $A,16, $Ahi ; smash $A to two halfwords
	\|\| EXTU $A,16,16,$Alo

	XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits multiplication
	\|\| XORMPY $Ahi,$B_2,$Ahix2
	\|\| EXTU $B,16,24,$B_1
	XORMPY $Alo,$B_0,$Alox0
	\|\| XORMPY $Ahi,$B_0,$Ahix0
	XORMPY $Alo,$B_3,$Alox3
	\|\| XORMPY $Ahi,$B_3,$Ahix3
	XORMPY $Alo,$B_1,$Alox1
	\|\| XORMPY $Ahi,$B_1,$Ahix1
	___
	}
	# mul_1x1_merged($OUTlo,$OUThi,$A,$B) - append to $code the second stage
	# of the multiplication already in flight, interleaved with the first
	# stage of the next one.
	#
	# The XOR/SHL/SHRU-by-16/SHRU-by-8 instructions and the MV into $OUThi
	# fold the partial products currently held in $Alox0..3/$Ahix0..3 into the
	# register pair $OUThi:$OUTlo; it is the same instruction sequence
	# mul_1x1_lower emits.  The EXTU/AND/SHRU instructions at the top and the
	# XORMPYs split the new operands $A and $B and start the next product,
	# like mul_1x1_upper.
	#
	# The $Alo x $B_0 product is written to A1 rather than $Alox0 and is only
	# moved into $Alox0 by the final MV, because the previous $Alox0 is still
	# read by the folding code in between.
	#
	# Returns nothing useful; the only effect is the text appended to $code.
	sub mul_1x1_merged {
	# Lexical $A/$B shadow the file-level variables of the same name inside
	# the template below.
	my ($OUTlo,$OUThi,$A,$B)=@_;
	$code.=<<___;
	EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
	\|\| AND $B,$xFF,$B_0
	\|\| SHRU $B,24,$B_3
	SHRU $A,16, $Ahi ; smash $A to two halfwords
	\|\| EXTU $A,16,16,$Alo

	XOR $Ahix0,$Alox2,$Ahix0
	\|\| MV $Ahix2,$OUThi
	\|\| XORMPY $Alo,$B_2,$Alox2
	XORMPY $Ahi,$B_2,$Ahix2
	\|\| EXTU $B,16,24,$B_1
	\|\| XORMPY $Alo,$B_0,A1 ; $Alox0
	XOR $Ahix1,$Alox3,$Ahix1
	\|\| SHL $Ahix0,16,$OUTlo
	\|\| SHRU $Ahix0,16,$Ahix0
	XOR $Alox0,$OUTlo,$OUTlo
	\|\| XOR $Ahix0,$OUThi,$OUThi
	\|\| XORMPY $Ahi,$B_0,$Ahix0
	\|\| XORMPY $Alo,$B_3,$Alox3
	\|\| SHL $Alox1,8,$Alox1
	\|\| SHL $Ahix3,8,$Ahix3
	XOR $Alox1,$OUTlo,$OUTlo
	\|\| XOR $Ahix3,$OUThi,$OUThi
	\|\| XORMPY $Ahi,$B_3,$Ahix3
	\|\| SHL $Ahix1,24,$Alox1
	\|\| SHRU $Ahix1,8, $Ahix1
	XOR $Alox1,$OUTlo,$OUTlo
	\|\| XOR $Ahix1,$OUThi,$OUThi
	\|\| XORMPY $Alo,$B_1,$Alox1
	\|\| XORMPY $Ahi,$B_1,$Ahix1
	\|\| MV A1,$Alox0
	___
	}
	# mul_1x1_lower($OUTlo,$OUThi) - append to $code the final stage of a
	# multiplication: fold the eight partial products left in
	# $Alox0..3/$Ahix0..3 by the XORMPY stage into the register pair
	# $OUThi:$OUTlo using shifts and XORs.  No new multiplication is started.
	#
	# The first template line, ";NOP", begins with the ';' that the other
	# templates use to introduce assembly comments, so it emits a comment, not
	# an instruction; the plain NOP further down is a real instruction.
	#
	# Returns nothing useful; the only effect is the text appended to $code.
	sub mul_1x1_lower {
	my ($OUTlo,$OUThi)=@_;
	$code.=<<___;
	;NOP
	XOR $Ahix0,$Alox2,$Ahix0
	\|\| MV $Ahix2,$OUThi
	NOP
	XOR $Ahix1,$Alox3,$Ahix1
	\|\| SHL $Ahix0,16,$OUTlo
	\|\| SHRU $Ahix0,16,$Ahix0
	XOR $Alox0,$OUTlo,$OUTlo
	\|\| XOR $Ahix0,$OUThi,$OUThi
	\|\| SHL $Alox1,8,$Alox1
	\|\| SHL $Ahix3,8,$Ahix3
	XOR $Alox1,$OUTlo,$OUTlo
	\|\| XOR $Ahix3,$OUThi,$OUThi
	\|\| SHL $Ahix1,24,$Alox1
	\|\| SHRU $Ahix1,8, $Ahix1
	XOR $Alox1,$OUTlo,$OUTlo
	\|\| XOR $Ahix1,$OUThi,$OUThi
	___
	}
	# Emit the bn_GF2m_mul_2x2 entry point.  The product is assembled
	# Karatsuba-style from three 32x32-bit polynomial products: a0·b0, a1·b1
	# and (a0+a1)·(b0+b1), where "+" is XOR.
	#
	# Prologue: when __TI_EABI__ is non-zero the underscore-prefixed label is
	# aliased to the plain name bn_GF2m_mul_2x2; for assembler versions below
	# 7000000 __TI_EABI__ is first forced to 0.  $xFF is loaded with 0xFF.
	$code.=<<___;
	.text

	.if .ASSEMBLER_VERSION<7000000
	.asg 0,__TI_EABI__
	.endif
	.if __TI_EABI__
	.asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
	.endif

	.global _bn_GF2m_mul_2x2
	_bn_GF2m_mul_2x2:
	.asmfunc
	MVK 0xFF,$xFF
	___
	&mul_1x1_upper($a0,$b0); # a0·b0
	# Copy a1/b1 into the staging registers $A/$B for the next product.  The
	# template's first MV is written as a continuation of the last
	# instruction line emitted by the call above.
	$code.=<<___;
	\|\| MV $b1,$B
	MV $a1,$A
	___
	# Finish a0·b0 into B28:A28 while starting a1·b1.
	&mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1
	# Form b0+b1 and a0+a1 in the staging registers for the third product.
	$code.=<<___;
	\|\| XOR $b0,$b1,$B
	XOR $a0,$a1,$A
	___
	# Finish a1·b1 into B31:A31 while starting (a0+a1)·(b0+b1).
	&mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1)
	# B29:A29 = a0·b0 + a1·b1, to be folded into the middle term below.
	$code.=<<___;
	XOR A28,A31,A29
	\|\| XOR B28,B31,B29 ; a0·b0+a1·b1
	___
	# Finish (a0+a1)·(b0+b1) into B30:A30.
	&mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1)
	# Epilogue: BNOP B3 branches to the address in B3 (presumably the caller's
	# return address - confirm against the C6000 calling convention).  The
	# middle term B30:A30 is completed and XORed into the overlapping words,
	# and the four result words are stored: r[0]=A28, r[1]=A30, r[2]=A31,
	# r[3]=B31.
	$code.=<<___;
	\|\| BNOP B3
	XOR A29,A30,A30
	\|\| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
	XOR B28,A30,A30
	\|\| STW A28,*${rp}[0]
	XOR B30,A31,A31
	\|\| STW A30,*${rp}[1]
	STW A31,*${rp}[2]
	STW B31,*${rp}[3]
	.endasmfunc
	___

	# Write out the accumulated assembly; close is checked because buffered
	# write errors only show up there.
	print $code;
	close STDOUT or die "error closing STDOUT: $!";