| #!/usr/bin/env perl | 
 | # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved. | 
 | # | 
 | # Licensed under the Apache License 2.0 (the "License").  You may not use | 
 | # this file except in compliance with the License.  You can obtain a copy | 
 | # in the file LICENSE in the source distribution or at | 
 | # https://www.openssl.org/source/license.html | 
 | # | 
 | # ==================================================================== | 
 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | 
 | # project. The module is, however, dual licensed under OpenSSL and | 
 | # CRYPTOGAMS licenses depending on where you obtain it. For further | 
 | # details see http://www.openssl.org/~appro/cryptogams/. | 
 | # ==================================================================== | 
 | # | 
 | # Keccak-1600 for ARMv8. | 
 | # | 
 | # June 2017. | 
 | # | 
 | # This is a straightforward KECCAK_1X_ALT implementation. It makes no | 
 | # sense to attempt a SIMD/NEON implementation for the following reason: | 
 | # 64-bit lanes of vector registers can't be addressed as easily as in | 
 | # 32-bit mode. This means that 64-bit NEON is bound to be slower than | 
 | # 32-bit NEON, and this implementation is faster than 32-bit NEON on | 
 | # the same processor. Even though it takes more scalar xor's and | 
 | # andn's, this is compensated by the availability of rotate. Not to | 
 | # forget that most processors achieve a higher issue rate with scalar | 
 | # instructions. | 
 | # | 
 | # February 2018. | 
 | # | 
 | # Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT | 
 | # variant with register permutation/rotation twist that allows to | 
 | # eliminate copies to temporary registers. If you look closely you'll | 
 | # notice that it uses only one lane of vector registers. The new | 
 | # instructions effectively facilitate parallel hashing, which we don't | 
 | # support [yet?]. But lowest-level core procedure is prepared for it. | 
 | # The inner round is 67 [vector] instructions, so it's not actually | 
 | # obvious that it will provide performance improvement [in serial | 
 | # hash] as long as vector instructions issue rate is limited to 1 per | 
 | # cycle... | 
 | # | 
 | ###################################################################### | 
 | # Numbers are cycles per processed byte. | 
 | # | 
 | #		r=1088(*) | 
 | # | 
 | # Cortex-A53	13 | 
 | # Cortex-A57	12 | 
 | # X-Gene	14 | 
 | # Mongoose	10 | 
 | # Kryo		12 | 
 | # Denver	7.8 | 
 | # Apple A7	7.2 | 
 | # ThunderX2	9.7 | 
 | # | 
 | # (*)	Corresponds to SHA3-256. No improvement coefficients are listed | 
 | #	because they vary too much from compiler to compiler. Newer | 
 | #	compiler does much better and improvement varies from 5% on | 
 | #	Cortex-A57 to 25% on Cortex-A53. While in comparison to older | 
 | #	compiler this code is at least 2x faster... | 
 |  | 
 | # $output is the last argument if it looks like a file (it has an extension) | 
 | # $flavour is the first argument if it doesn't look like a file | 
 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | 
 | $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; | 
 |  | 
 | # $dir is the directory part of this script's own path (including the | 
 | # trailing separator). arm-xlate.pl is looked up first next to this | 
 | # script and then in ../../perlasm/ relative to it. | 
 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | 
 | ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or | 
 | ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or | 
 | die "can't locate arm-xlate.pl"; | 
 |  | 
 | # Everything printed to STDOUT from here on is piped into arm-xlate.pl, | 
 | # which is started with the same perl binary ($^X) and receives $flavour | 
 | # and $output on its command line. | 
 | open OUT,"| \"$^X\" $xlate $flavour \"$output\"" | 
 |     or die "can't call $xlate: $!"; | 
 | *STDOUT=*OUT; | 
 |  | 
 | # Per-lane rotation amounts used by the Rho+Pi code below, indexed like | 
 | # the state lanes A[i][j].  They are always emitted as | 
 | # "#64-$rhotates[i][j]" operands of ror/xar, i.e. a rotate right by | 
 | # 64-n bits, which is the same as a rotate left by n bits. | 
 | my @rhotates = ([  0,  1, 62, 28, 27 ], | 
 |                 [ 36, 44,  6, 55, 20 ], | 
 |                 [  3, 10, 43, 25, 39 ], | 
 |                 [ 41, 45, 15, 21,  8 ], | 
 |                 [ 18,  2, 61, 56, 14 ]); | 
 |  | 
 | # Emit the file header and the "iotas" table: 24 64-bit constants, one of | 
 | # which is XOR-ed into A[0][0] per round.  The table is preceded by eight | 
 | # zero quads (64 bytes) right after ".align 8", and is itself 24*8 = 192 | 
 | # bytes long.  Assuming ".align 8" means 2^8-byte alignment (as in GNU as | 
 | # for AArch64 -- confirm for other assemblers), the address just past the | 
 | # last constant has its low 8 bits clear, which is what the scalar round | 
 | # loop tests ("tst ...,#255") to decide that all rounds are done. | 
 | $code.=<<___; | 
 | #include "arm_arch.h" | 
 |  | 
 | .text | 
 |  | 
 | .align 8	// strategic alignment and padding that allows to use | 
 | 		// address value as loop termination condition... | 
 | 	.quad	0,0,0,0,0,0,0,0 | 
 | .type	iotas,%object | 
 | iotas: | 
 | 	.quad	0x0000000000000001 | 
 | 	.quad	0x0000000000008082 | 
 | 	.quad	0x800000000000808a | 
 | 	.quad	0x8000000080008000 | 
 | 	.quad	0x000000000000808b | 
 | 	.quad	0x0000000080000001 | 
 | 	.quad	0x8000000080008081 | 
 | 	.quad	0x8000000000008009 | 
 | 	.quad	0x000000000000008a | 
 | 	.quad	0x0000000000000088 | 
 | 	.quad	0x0000000080008009 | 
 | 	.quad	0x000000008000000a | 
 | 	.quad	0x000000008000808b | 
 | 	.quad	0x800000000000008b | 
 | 	.quad	0x8000000000008089 | 
 | 	.quad	0x8000000000008003 | 
 | 	.quad	0x8000000000008002 | 
 | 	.quad	0x8000000000000080 | 
 | 	.quad	0x000000000000800a | 
 | 	.quad	0x800000008000000a | 
 | 	.quad	0x8000000080008081 | 
 | 	.quad	0x8000000000008080 | 
 | 	.quad	0x0000000080000001 | 
 | 	.quad	0x8000000080008008 | 
 | .size	iotas,.-iotas | 
 | ___ | 
 | 								{{{ | 
 | # Scalar (general-purpose register) code path. | 
 | # | 
 | # @A maps the 5x5 state lanes onto registers: A[i][j] lives in | 
 | # x(5*i+j), except A[3][3], which would be x18 and is moved to x25. | 
 | # @C holds four scratch registers (x26, x27, x28, x30).  Because x30 is | 
 | # used as scratch, KeccakF1600_int saves it at [sp,#24] on entry and | 
 | # reloads it before returning. | 
 | # | 
 | # KeccakF1600_int expects the whole state in the @A registers and uses | 
 | # the 32 bytes at [sp,#0] of its caller's frame: [sp,#0] for the | 
 | # offloaded A[0][4]/A[1][4] pair, [sp,#16] for the running iotas | 
 | # pointer and [sp,#24] for x30. | 
 | my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ], | 
 |             (0, 5, 10, 15, 20)); | 
 |    $A[3][3] = "x25"; # x18 is reserved | 
 |  | 
 | my @C = map("x$_", (26,27,28,30)); | 
 |  | 
 | $code.=<<___; | 
 | .type	KeccakF1600_int,%function | 
 | .align	5 | 
 | KeccakF1600_int: | 
 | 	adr	$C[2],iotas | 
 | 	AARCH64_SIGN_LINK_REGISTER | 
 | 	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine | 
 | 	b	.Loop | 
 | .align	4 | 
 | .Loop: | 
 | 	////////////////////////////////////////// Theta | 
 | 	eor	$C[0],$A[0][0],$A[1][0] | 
 | 	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair... | 
 | 	eor	$C[1],$A[0][1],$A[1][1] | 
 | 	eor	$C[2],$A[0][2],$A[1][2] | 
 | 	eor	$C[3],$A[0][3],$A[1][3] | 
 | ___ | 
 | 	# A[0][4] and A[1][4] have just been stored at [sp,#0], so their | 
 | 	# registers are reused as two extra temporaries, C[4] and C[5], | 
 | 	# for the remainder of Theta. | 
 | 	$C[4]=$A[0][4]; | 
 | 	$C[5]=$A[1][4]; | 
 | $code.=<<___; | 
 | 	eor	$C[4],$A[0][4],$A[1][4] | 
 | 	eor	$C[0],$C[0],$A[2][0] | 
 | 	eor	$C[1],$C[1],$A[2][1] | 
 | 	eor	$C[2],$C[2],$A[2][2] | 
 | 	eor	$C[3],$C[3],$A[2][3] | 
 | 	eor	$C[4],$C[4],$A[2][4] | 
 | 	eor	$C[0],$C[0],$A[3][0] | 
 | 	eor	$C[1],$C[1],$A[3][1] | 
 | 	eor	$C[2],$C[2],$A[3][2] | 
 | 	eor	$C[3],$C[3],$A[3][3] | 
 | 	eor	$C[4],$C[4],$A[3][4] | 
 | 	eor	$C[0],$C[0],$A[4][0] | 
 | 	eor	$C[2],$C[2],$A[4][2] | 
 | 	eor	$C[1],$C[1],$A[4][1] | 
 | 	eor	$C[3],$C[3],$A[4][3] | 
 | 	eor	$C[4],$C[4],$A[4][4] | 
 |  | 
 | 	eor	$C[5],$C[0],$C[2],ror#63 | 
 |  | 
 | 	eor	$A[0][1],$A[0][1],$C[5] | 
 | 	eor	$A[1][1],$A[1][1],$C[5] | 
 | 	eor	$A[2][1],$A[2][1],$C[5] | 
 | 	eor	$A[3][1],$A[3][1],$C[5] | 
 | 	eor	$A[4][1],$A[4][1],$C[5] | 
 |  | 
 | 	eor	$C[5],$C[1],$C[3],ror#63 | 
 | 	eor	$C[2],$C[2],$C[4],ror#63 | 
 | 	eor	$C[3],$C[3],$C[0],ror#63 | 
 | 	eor	$C[4],$C[4],$C[1],ror#63 | 
 |  | 
 | 	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2] | 
 | 	eor	$A[1][2],$A[1][2],$C[5] | 
 | 	eor	$A[2][2],$A[2][2],$C[5] | 
 | 	eor	$A[3][2],$A[3][2],$C[5] | 
 | 	eor	$A[4][2],$A[4][2],$C[5] | 
 |  | 
 | 	eor	$A[0][0],$A[0][0],$C[4] | 
 | 	eor	$A[1][0],$A[1][0],$C[4] | 
 | 	eor	$A[2][0],$A[2][0],$C[4] | 
 | 	eor	$A[3][0],$A[3][0],$C[4] | 
 | 	eor	$A[4][0],$A[4][0],$C[4] | 
 | ___ | 
 | 	# The C[4]/C[5] aliases end here; the next chunk starts by | 
 | 	# reloading A[0][4] and A[1][4] from [sp,#0] into those registers. | 
 | 	$C[4]=undef; | 
 | 	$C[5]=undef; | 
 | $code.=<<___; | 
 | 	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data | 
 | 	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3] | 
 | 	eor	$A[1][3],$A[1][3],$C[2] | 
 | 	eor	$A[2][3],$A[2][3],$C[2] | 
 | 	eor	$A[3][3],$A[3][3],$C[2] | 
 | 	eor	$A[4][3],$A[4][3],$C[2] | 
 |  | 
 | 	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4] | 
 | 	eor	$A[1][4],$A[1][4],$C[3] | 
 | 	eor	$A[2][4],$A[2][4],$C[3] | 
 | 	eor	$A[3][4],$A[3][4],$C[3] | 
 | 	eor	$A[4][4],$A[4][4],$C[3] | 
 |  | 
 | 	////////////////////////////////////////// Rho+Pi | 
 | 	mov	$C[3],$A[0][1] | 
 | 	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1] | 
 | 	//mov	$C[1],$A[0][2] | 
 | 	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2] | 
 | 	//mov	$C[0],$A[0][3] | 
 | 	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3] | 
 | 	//mov	$C[2],$A[0][4] | 
 | 	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4] | 
 |  | 
 | 	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4] | 
 | 	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3] | 
 | 	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2] | 
 | 	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1] | 
 |  | 
 | 	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2] | 
 | 	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4] | 
 | 	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1] | 
 | 	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3] | 
 |  | 
 | 	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4] | 
 | 	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3] | 
 | 	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2] | 
 | 	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1] | 
 |  | 
 | 	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0] | 
 | 	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0] | 
 | 	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0] | 
 | 	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0] | 
 |  | 
 | 	ror	$A[1][0],$C[0],#64-$rhotates[0][3] | 
 | 	ror	$A[2][0],$C[3],#64-$rhotates[0][1] | 
 | 	ror	$A[3][0],$C[2],#64-$rhotates[0][4] | 
 | 	ror	$A[4][0],$C[1],#64-$rhotates[0][2] | 
 |  | 
 | 	////////////////////////////////////////// Chi+Iota | 
 | 	bic	$C[0],$A[0][2],$A[0][1] | 
 | 	bic	$C[1],$A[0][3],$A[0][2] | 
 | 	bic	$C[2],$A[0][0],$A[0][4] | 
 | 	bic	$C[3],$A[0][1],$A[0][0] | 
 | 	eor	$A[0][0],$A[0][0],$C[0] | 
 | 	bic	$C[0],$A[0][4],$A[0][3] | 
 | 	eor	$A[0][1],$A[0][1],$C[1] | 
 | 	 ldr	$C[1],[sp,#16] | 
 | 	eor	$A[0][3],$A[0][3],$C[2] | 
 | 	eor	$A[0][4],$A[0][4],$C[3] | 
 | 	eor	$A[0][2],$A[0][2],$C[0] | 
 | 	 ldr	$C[3],[$C[1]],#8		// Iota[i++] | 
 |  | 
 | 	bic	$C[0],$A[1][2],$A[1][1] | 
 | 	 tst	$C[1],#255			// are we done? | 
 | 	 str	$C[1],[sp,#16] | 
 | 	bic	$C[1],$A[1][3],$A[1][2] | 
 | 	bic	$C[2],$A[1][0],$A[1][4] | 
 | 	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota | 
 | 	bic	$C[3],$A[1][1],$A[1][0] | 
 | 	eor	$A[1][0],$A[1][0],$C[0] | 
 | 	bic	$C[0],$A[1][4],$A[1][3] | 
 | 	eor	$A[1][1],$A[1][1],$C[1] | 
 | 	eor	$A[1][3],$A[1][3],$C[2] | 
 | 	eor	$A[1][4],$A[1][4],$C[3] | 
 | 	eor	$A[1][2],$A[1][2],$C[0] | 
 |  | 
 | 	bic	$C[0],$A[2][2],$A[2][1] | 
 | 	bic	$C[1],$A[2][3],$A[2][2] | 
 | 	bic	$C[2],$A[2][0],$A[2][4] | 
 | 	bic	$C[3],$A[2][1],$A[2][0] | 
 | 	eor	$A[2][0],$A[2][0],$C[0] | 
 | 	bic	$C[0],$A[2][4],$A[2][3] | 
 | 	eor	$A[2][1],$A[2][1],$C[1] | 
 | 	eor	$A[2][3],$A[2][3],$C[2] | 
 | 	eor	$A[2][4],$A[2][4],$C[3] | 
 | 	eor	$A[2][2],$A[2][2],$C[0] | 
 |  | 
 | 	bic	$C[0],$A[3][2],$A[3][1] | 
 | 	bic	$C[1],$A[3][3],$A[3][2] | 
 | 	bic	$C[2],$A[3][0],$A[3][4] | 
 | 	bic	$C[3],$A[3][1],$A[3][0] | 
 | 	eor	$A[3][0],$A[3][0],$C[0] | 
 | 	bic	$C[0],$A[3][4],$A[3][3] | 
 | 	eor	$A[3][1],$A[3][1],$C[1] | 
 | 	eor	$A[3][3],$A[3][3],$C[2] | 
 | 	eor	$A[3][4],$A[3][4],$C[3] | 
 | 	eor	$A[3][2],$A[3][2],$C[0] | 
 |  | 
 | 	bic	$C[0],$A[4][2],$A[4][1] | 
 | 	bic	$C[1],$A[4][3],$A[4][2] | 
 | 	bic	$C[2],$A[4][0],$A[4][4] | 
 | 	bic	$C[3],$A[4][1],$A[4][0] | 
 | 	eor	$A[4][0],$A[4][0],$C[0] | 
 | 	bic	$C[0],$A[4][4],$A[4][3] | 
 | 	eor	$A[4][1],$A[4][1],$C[1] | 
 | 	eor	$A[4][3],$A[4][3],$C[2] | 
 | 	eor	$A[4][4],$A[4][4],$C[3] | 
 | 	eor	$A[4][2],$A[4][2],$C[0] | 
 |  | 
 | 	bne	.Loop | 
 |  | 
 | 	ldr	x30,[sp,#24] | 
 | 	AARCH64_VALIDATE_LINK_REGISTER | 
 | 	ret | 
 | .size	KeccakF1600_int,.-KeccakF1600_int | 
 |  | 
 | .type	KeccakF1600,%function | 
 | .align	5 | 
 | KeccakF1600: | 
 | 	AARCH64_SIGN_LINK_REGISTER | 
 | 	stp	x29,x30,[sp,#-128]! | 
 | 	add	x29,sp,#0 | 
 | 	stp	x19,x20,[sp,#16] | 
 | 	stp	x21,x22,[sp,#32] | 
 | 	stp	x23,x24,[sp,#48] | 
 | 	stp	x25,x26,[sp,#64] | 
 | 	stp	x27,x28,[sp,#80] | 
 | 	sub	sp,sp,#48 | 
 |  | 
 | 	str	x0,[sp,#32]			// offload argument | 
 | 	mov	$C[0],x0 | 
 | 	ldp	$A[0][0],$A[0][1],[x0,#16*0] | 
 | 	ldp	$A[0][2],$A[0][3],[$C[0],#16*1] | 
 | 	ldp	$A[0][4],$A[1][0],[$C[0],#16*2] | 
 | 	ldp	$A[1][1],$A[1][2],[$C[0],#16*3] | 
 | 	ldp	$A[1][3],$A[1][4],[$C[0],#16*4] | 
 | 	ldp	$A[2][0],$A[2][1],[$C[0],#16*5] | 
 | 	ldp	$A[2][2],$A[2][3],[$C[0],#16*6] | 
 | 	ldp	$A[2][4],$A[3][0],[$C[0],#16*7] | 
 | 	ldp	$A[3][1],$A[3][2],[$C[0],#16*8] | 
 | 	ldp	$A[3][3],$A[3][4],[$C[0],#16*9] | 
 | 	ldp	$A[4][0],$A[4][1],[$C[0],#16*10] | 
 | 	ldp	$A[4][2],$A[4][3],[$C[0],#16*11] | 
 | 	ldr	$A[4][4],[$C[0],#16*12] | 
 |  | 
 | 	bl	KeccakF1600_int | 
 |  | 
 | 	ldr	$C[0],[sp,#32] | 
 | 	stp	$A[0][0],$A[0][1],[$C[0],#16*0] | 
 | 	stp	$A[0][2],$A[0][3],[$C[0],#16*1] | 
 | 	stp	$A[0][4],$A[1][0],[$C[0],#16*2] | 
 | 	stp	$A[1][1],$A[1][2],[$C[0],#16*3] | 
 | 	stp	$A[1][3],$A[1][4],[$C[0],#16*4] | 
 | 	stp	$A[2][0],$A[2][1],[$C[0],#16*5] | 
 | 	stp	$A[2][2],$A[2][3],[$C[0],#16*6] | 
 | 	stp	$A[2][4],$A[3][0],[$C[0],#16*7] | 
 | 	stp	$A[3][1],$A[3][2],[$C[0],#16*8] | 
 | 	stp	$A[3][3],$A[3][4],[$C[0],#16*9] | 
 | 	stp	$A[4][0],$A[4][1],[$C[0],#16*10] | 
 | 	stp	$A[4][2],$A[4][3],[$C[0],#16*11] | 
 | 	str	$A[4][4],[$C[0],#16*12] | 
 |  | 
 | 	ldp	x19,x20,[x29,#16] | 
 | 	add	sp,sp,#48 | 
 | 	ldp	x21,x22,[x29,#32] | 
 | 	ldp	x23,x24,[x29,#48] | 
 | 	ldp	x25,x26,[x29,#64] | 
 | 	ldp	x27,x28,[x29,#80] | 
 | 	ldp	x29,x30,[sp],#128 | 
 | 	AARCH64_VALIDATE_LINK_REGISTER | 
 | 	ret | 
 | .size	KeccakF1600,.-KeccakF1600 | 
 |  | 
 | .globl	SHA3_absorb | 
 | .type	SHA3_absorb,%function | 
 | .align	5 | 
 | SHA3_absorb: | 
 | 	AARCH64_SIGN_LINK_REGISTER | 
 | 	stp	x29,x30,[sp,#-128]! | 
 | 	add	x29,sp,#0 | 
 | 	stp	x19,x20,[sp,#16] | 
 | 	stp	x21,x22,[sp,#32] | 
 | 	stp	x23,x24,[sp,#48] | 
 | 	stp	x25,x26,[sp,#64] | 
 | 	stp	x27,x28,[sp,#80] | 
 | 	sub	sp,sp,#64 | 
 |  | 
 | 	stp	x0,x1,[sp,#32]			// offload arguments | 
 | 	stp	x2,x3,[sp,#48] | 
 |  | 
 | 	mov	$C[0],x0			// uint64_t A[5][5] | 
 | 	mov	$C[1],x1			// const void *inp | 
 | 	mov	$C[2],x2			// size_t len | 
 | 	mov	$C[3],x3			// size_t bsz | 
 | 	ldp	$A[0][0],$A[0][1],[$C[0],#16*0] | 
 | 	ldp	$A[0][2],$A[0][3],[$C[0],#16*1] | 
 | 	ldp	$A[0][4],$A[1][0],[$C[0],#16*2] | 
 | 	ldp	$A[1][1],$A[1][2],[$C[0],#16*3] | 
 | 	ldp	$A[1][3],$A[1][4],[$C[0],#16*4] | 
 | 	ldp	$A[2][0],$A[2][1],[$C[0],#16*5] | 
 | 	ldp	$A[2][2],$A[2][3],[$C[0],#16*6] | 
 | 	ldp	$A[2][4],$A[3][0],[$C[0],#16*7] | 
 | 	ldp	$A[3][1],$A[3][2],[$C[0],#16*8] | 
 | 	ldp	$A[3][3],$A[3][4],[$C[0],#16*9] | 
 | 	ldp	$A[4][0],$A[4][1],[$C[0],#16*10] | 
 | 	ldp	$A[4][2],$A[4][3],[$C[0],#16*11] | 
 | 	ldr	$A[4][4],[$C[0],#16*12] | 
 | 	b	.Loop_absorb | 
 |  | 
 | .align	4 | 
 | .Loop_absorb: | 
 | 	subs	$C[0],$C[2],$C[3]		// len - bsz | 
 | 	blo	.Labsorbed | 
 |  | 
 | 	str	$C[0],[sp,#48]			// save len - bsz | 
 | ___ | 
 | # Unrolled absorption of one block, two lanes per generated chunk.  Input | 
 | # word number $i is XOR-ed into lane A[$i/5][$i%5] (byte-reversed first on | 
 | # big-endian builds).  A single "cmp bsz,#8*($i+2)" serves both exits: | 
 | # "blo" leaves after word $i when bsz is smaller than 8*($i+2), and "beq" | 
 | # leaves after word $i+1 when bsz is exactly 8*($i+2). | 
 | for (my $i=0; $i<24; $i+=2) { | 
 | my $j = $i+1; | 
 | $code.=<<___; | 
 | 	ldr	$C[0],[$C[1]],#8		// *inp++ | 
 | #ifdef	__AARCH64EB__ | 
 | 	rev	$C[0],$C[0] | 
 | #endif | 
 | 	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0] | 
 | 	cmp	$C[3],#8*($i+2) | 
 | 	blo	.Lprocess_block | 
 | 	ldr	$C[0],[$C[1]],#8		// *inp++ | 
 | #ifdef	__AARCH64EB__ | 
 | 	rev	$C[0],$C[0] | 
 | #endif | 
 | 	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0] | 
 | 	beq	.Lprocess_block | 
 | ___ | 
 | } | 
 | # Fall-through case: the 25th word goes into A[4][4].  After the | 
 | # permutation, inp is reloaded from [sp,#40] and len/bsz from [sp,#48] | 
 | # (where len was replaced by len - bsz above).  On exit the state is | 
 | # written back through the pointer saved at [sp,#32] and the remaining | 
 | # length is returned in x0. | 
 | $code.=<<___; | 
 | 	ldr	$C[0],[$C[1]],#8		// *inp++ | 
 | #ifdef	__AARCH64EB__ | 
 | 	rev	$C[0],$C[0] | 
 | #endif | 
 | 	eor	$A[4][4],$A[4][4],$C[0] | 
 |  | 
 | .Lprocess_block: | 
 | 	str	$C[1],[sp,#40]			// save inp | 
 |  | 
 | 	bl	KeccakF1600_int | 
 |  | 
 | 	ldr	$C[1],[sp,#40]			// restore arguments | 
 | 	ldp	$C[2],$C[3],[sp,#48] | 
 | 	b	.Loop_absorb | 
 |  | 
 | .align	4 | 
 | .Labsorbed: | 
 | 	ldr	$C[1],[sp,#32] | 
 | 	stp	$A[0][0],$A[0][1],[$C[1],#16*0] | 
 | 	stp	$A[0][2],$A[0][3],[$C[1],#16*1] | 
 | 	stp	$A[0][4],$A[1][0],[$C[1],#16*2] | 
 | 	stp	$A[1][1],$A[1][2],[$C[1],#16*3] | 
 | 	stp	$A[1][3],$A[1][4],[$C[1],#16*4] | 
 | 	stp	$A[2][0],$A[2][1],[$C[1],#16*5] | 
 | 	stp	$A[2][2],$A[2][3],[$C[1],#16*6] | 
 | 	stp	$A[2][4],$A[3][0],[$C[1],#16*7] | 
 | 	stp	$A[3][1],$A[3][2],[$C[1],#16*8] | 
 | 	stp	$A[3][3],$A[3][4],[$C[1],#16*9] | 
 | 	stp	$A[4][0],$A[4][1],[$C[1],#16*10] | 
 | 	stp	$A[4][2],$A[4][3],[$C[1],#16*11] | 
 | 	str	$A[4][4],[$C[1],#16*12] | 
 |  | 
 | 	mov	x0,$C[2]			// return value | 
 | 	ldp	x19,x20,[x29,#16] | 
 | 	add	sp,sp,#64 | 
 | 	ldp	x21,x22,[x29,#32] | 
 | 	ldp	x23,x24,[x29,#48] | 
 | 	ldp	x25,x26,[x29,#64] | 
 | 	ldp	x27,x28,[x29,#80] | 
 | 	ldp	x29,x30,[sp],#128 | 
 | 	AARCH64_VALIDATE_LINK_REGISTER | 
 | 	ret | 
 | .size	SHA3_absorb,.-SHA3_absorb | 
 | ___ | 
 | { | 
 | # SHA3_squeeze: arguments arrive in x0..x3 and are kept in callee-saved | 
 | # registers ($A_flat = state, $out = output pointer, $len = bytes still | 
 | # to produce, $bsz = block size) because KeccakF1600 is called in the | 
 | # middle of the loop. | 
 | # | 
 | # State lanes are copied to $out eight bytes at a time (byte-reversed on | 
 | # big-endian builds).  Once $bsz bytes of the state have been consumed, | 
 | # the state is permuted with KeccakF1600 and reading restarts at lane 0. | 
 | # A final partial lane (fewer than 8 bytes left) is stored byte by byte, | 
 | # least significant byte first. | 
 | # | 
 | # A zero $len returns straight away.  Without that guard the "cmp/blo" | 
 | # at the top of the loop would enter the byte tail, whose "subs ...,#1" | 
 | # wraps below zero and never hits "beq", so 7 bytes would be written to | 
 | # $out although none were requested. | 
 | my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22)); | 
 | $code.=<<___; | 
 | .globl	SHA3_squeeze | 
 | .type	SHA3_squeeze,%function | 
 | .align	5 | 
 | SHA3_squeeze: | 
 | 	AARCH64_SIGN_LINK_REGISTER | 
 | 	stp	x29,x30,[sp,#-48]! | 
 | 	add	x29,sp,#0 | 
 | 	stp	x19,x20,[sp,#16] | 
 | 	stp	x21,x22,[sp,#32] | 
 |  | 
 | 	mov	$A_flat,x0			// put aside arguments | 
 | 	mov	$out,x1 | 
 | 	mov	$len,x2 | 
 | 	mov	$bsz,x3 | 
 |  | 
 | 	cbz	$len,.Lsqueeze_done		// len == 0: nothing to emit | 
 |  | 
 | .Loop_squeeze: | 
 | 	ldr	x4,[x0],#8 | 
 | 	cmp	$len,#8 | 
 | 	blo	.Lsqueeze_tail | 
 | #ifdef	__AARCH64EB__ | 
 | 	rev	x4,x4 | 
 | #endif | 
 | 	str	x4,[$out],#8 | 
 | 	subs	$len,$len,#8 | 
 | 	beq	.Lsqueeze_done | 
 |  | 
 | 	subs	x3,x3,#8 | 
 | 	bhi	.Loop_squeeze | 
 |  | 
 | 	mov	x0,$A_flat | 
 | 	bl	KeccakF1600 | 
 | 	mov	x0,$A_flat | 
 | 	mov	x3,$bsz | 
 | 	b	.Loop_squeeze | 
 |  | 
 | .align	4 | 
 | .Lsqueeze_tail: | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done | 
 | 	strb	w4,[$out],#1 | 
 |  | 
 | .Lsqueeze_done: | 
 | 	ldp	x19,x20,[sp,#16] | 
 | 	ldp	x21,x22,[sp,#32] | 
 | 	ldp	x29,x30,[sp],#48 | 
 | 	AARCH64_VALIDATE_LINK_REGISTER | 
 | 	ret | 
 | .size	SHA3_squeeze,.-SHA3_squeeze | 
 | ___ | 
 | }								}}} | 
 | 								{{{ | 
 | # ARMv8.2 SHA3-extension code path. | 
 | # | 
 | # @A maps state lane A[i][j] onto vector register v(5*i+j), i.e. | 
 | # v0..v24, written with the .16b arrangement.  @C is the scratch set | 
 | # v25..v31, and @D re-labels five of those registers (C[4], C[5], C[6], | 
 | # C[2], C[3]) as D[0]..D[4], matching the "// D[n]" notes on the rax1 | 
 | # lines below. | 
 | # | 
 | # KeccakF1600_ce uses x9 as a 24-round down-counter and x10 as the | 
 | # pointer into iotas.  The eor3/rax1/xar/bcax lines are not printed as | 
 | # mnemonics: the output loop at the end of this script rewrites them to | 
 | # ".inst" words through unsha3(), and rewrites ".16b" to ".2d" on the | 
 | # ld1r line. | 
 | my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b", | 
 |                              "v".($_+3).".16b", "v".($_+4).".16b" ], | 
 |             (0, 5, 10, 15, 20)); | 
 |  | 
 | my @C = map("v$_.16b", (25..31)); | 
 | my @D = @C[4,5,6,2,3]; | 
 |  | 
 | $code.=<<___; | 
 | .type	KeccakF1600_ce,%function | 
 | .align	5 | 
 | KeccakF1600_ce: | 
 | 	mov	x9,#24 | 
 | 	adr	x10,iotas | 
 | 	b	.Loop_ce | 
 | .align	4 | 
 | .Loop_ce: | 
 | 	////////////////////////////////////////////////// Theta | 
 | 	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0] | 
 | 	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1] | 
 | 	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2] | 
 | 	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3] | 
 | 	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4] | 
 | 	eor3	$C[0],$C[0],   $A[1][0],$A[0][0] | 
 | 	eor3	$C[1],$C[1],   $A[1][1],$A[0][1] | 
 | 	eor3	$C[2],$C[2],   $A[1][2],$A[0][2] | 
 | 	eor3	$C[3],$C[3],   $A[1][3],$A[0][3] | 
 | 	eor3	$C[4],$C[4],   $A[1][4],$A[0][4] | 
 |  | 
 | 	rax1	$C[5],$C[0],$C[2]			// D[1] | 
 | 	rax1	$C[6],$C[1],$C[3]			// D[2] | 
 | 	rax1	$C[2],$C[2],$C[4]			// D[3] | 
 | 	rax1	$C[3],$C[3],$C[0]			// D[4] | 
 | 	rax1	$C[4],$C[4],$C[1]			// D[0] | 
 |  | 
 | 	////////////////////////////////////////////////// Theta+Rho+Pi | 
 | 	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0] | 
 |  | 
 | 	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1] | 
 | 	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4] | 
 | 	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2] | 
 | 	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4] | 
 | 	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0] | 
 |  | 
 | 	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0] | 
 |  | 
 | 	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2] | 
 | 	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3] | 
 | 	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4] | 
 | 	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3] | 
 | 	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0] | 
 |  | 
 | 	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4] | 
 |  | 
 | 	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4] | 
 | 	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1] | 
 | 	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1] | 
 | 	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3] | 
 | 	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0] | 
 |  | 
 | 	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3] | 
 |  | 
 | 	eor	$A[0][0],$A[0][0],$D[0] | 
 |  | 
 | 	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3] | 
 | 	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3] | 
 | 	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2] | 
 | 	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1] | 
 | 	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2] | 
 |  | 
 | 	////////////////////////////////////////////////// Chi+Iota | 
 | 	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1] | 
 | 	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1] | 
 | 	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3] | 
 | 	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4] | 
 | 	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1] | 
 |  | 
 | 	ld1r	{$C[1]},[x10],#8 | 
 |  | 
 | 	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3] | 
 | 	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3] | 
 | 	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0] | 
 | 	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1] | 
 | 	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3] | 
 |  | 
 | 	bcax	$A[2][0],$C[0],   $A[2][2],$D[2] | 
 | 	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2] | 
 | 	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3] | 
 | 	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4] | 
 | 	bcax	$A[2][4],$A[2][4],$D[2],   $C[0] | 
 |  | 
 | 	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3] | 
 | 	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3] | 
 | 	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0] | 
 | 	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1] | 
 | 	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3] | 
 |  | 
 | 	bcax	$A[0][3],$D[3],   $A[0][0],$D[4] | 
 | 	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0] | 
 | 	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1] | 
 | 	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2] | 
 | 	bcax	$A[0][2],$A[0][2],$D[4],   $D[3] | 
 |  | 
 | 	eor	$A[0][0],$A[0][0],$C[1] | 
 |  | 
 | 	subs	x9,x9,#1 | 
 | 	bne	.Loop_ce | 
 |  | 
 | 	ret | 
 | .size	KeccakF1600_ce,.-KeccakF1600_ce | 
 |  | 
 | .type	KeccakF1600_cext,%function | 
 | .align	5 | 
 | KeccakF1600_cext: | 
 | 	AARCH64_SIGN_LINK_REGISTER | 
 | 	stp	x29,x30,[sp,#-80]! | 
 | 	add	x29,sp,#0 | 
 | 	stp	d8,d9,[sp,#16]		// per ABI requirement | 
 | 	stp	d10,d11,[sp,#32] | 
 | 	stp	d12,d13,[sp,#48] | 
 | 	stp	d14,d15,[sp,#64] | 
 | ___ | 
 | # Lanes 0..23 are loaded pairwise into d0..d23 from the state at x0. | 
 | # $i is deliberately not a "my" variable: it is 24 when the loop ends, | 
 | # and the heredoc that follows uses it for the offset of the last lane. | 
 | for($i=0; $i<24; $i+=2) {		# load A[5][5] | 
 | my $j=$i+1; | 
 | $code.=<<___; | 
 | 	ldp	d$i,d$j,[x0,#8*$i] | 
 | ___ | 
 | } | 
 | # x30 is reloaded from the frame right after the call because "bl" | 
 | # overwrote it; the final "ldr x29,[sp],#80" therefore restores x29 only. | 
 | $code.=<<___; | 
 | 	ldr	d24,[x0,#8*$i] | 
 | 	bl	KeccakF1600_ce | 
 | 	ldr	x30,[sp,#8] | 
 | ___ | 
 | # Same pairwise scheme for writing the state back; $i ends at 24 again. | 
 | for($i=0; $i<24; $i+=2) {		# store A[5][5] | 
 | my $j=$i+1; | 
 | $code.=<<___; | 
 | 	stp	d$i,d$j,[x0,#8*$i] | 
 | ___ | 
 | } | 
 | $code.=<<___; | 
 | 	str	d24,[x0,#8*$i] | 
 |  | 
 | 	ldp	d8,d9,[sp,#16] | 
 | 	ldp	d10,d11,[sp,#32] | 
 | 	ldp	d12,d13,[sp,#48] | 
 | 	ldp	d14,d15,[sp,#64] | 
 | 	ldr	x29,[sp],#80 | 
 | 	AARCH64_VALIDATE_LINK_REGISTER | 
 | 	ret | 
 | .size	KeccakF1600_cext,.-KeccakF1600_cext | 
 | ___ | 
 |  | 
 | { | 
 | # SHA3_absorb_cext: the four arguments stay in x0..x3 for the whole | 
 | # function (KeccakF1600_ce only touches x9, x10 and vector registers). | 
 | # v31 is the scratch register for each input word.  The loop subtracts | 
 | # $bsz from $len up front and leaves once that borrows; "add x0,$len,$bsz" | 
 | # then undoes the last subtraction to return the unabsorbed byte count. | 
 | my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3)); | 
 |  | 
 | $code.=<<___; | 
 | .globl	SHA3_absorb_cext | 
 | .type	SHA3_absorb_cext,%function | 
 | .align	5 | 
 | SHA3_absorb_cext: | 
 | 	AARCH64_SIGN_LINK_REGISTER | 
 | 	stp	x29,x30,[sp,#-80]! | 
 | 	add	x29,sp,#0 | 
 | 	stp	d8,d9,[sp,#16]		// per ABI requirement | 
 | 	stp	d10,d11,[sp,#32] | 
 | 	stp	d12,d13,[sp,#48] | 
 | 	stp	d14,d15,[sp,#64] | 
 | ___ | 
 | # Non-lexical $i on purpose: its final value (24) addresses the last | 
 | # lane in the heredoc after the loop. | 
 | for($i=0; $i<24; $i+=2) {		# load A[5][5] | 
 | my $j=$i+1; | 
 | $code.=<<___; | 
 | 	ldp	d$i,d$j,[x0,#8*$i] | 
 | ___ | 
 | } | 
 | $code.=<<___; | 
 | 	ldr	d24,[x0,#8*$i] | 
 | 	b	.Loop_absorb_ce | 
 |  | 
 | .align	4 | 
 | .Loop_absorb_ce: | 
 | 	subs	$len,$len,$bsz		// len - bsz | 
 | 	blo	.Labsorbed_ce | 
 | ___ | 
 | # Unrolled absorption, two lanes per generated chunk: word $i is XOR-ed | 
 | # into A[$i/5][$i%5]; "blo" exits after word $i when $bsz is below | 
 | # 8*($i+2), "beq" exits after word $i+1 when $bsz equals 8*($i+2). | 
 | for (my $i=0; $i<24; $i+=2) { | 
 | my $j = $i+1; | 
 | $code.=<<___; | 
 | 	ldr	d31,[$inp],#8		// *inp++ | 
 | #ifdef	__AARCH64EB__ | 
 | 	rev64	v31.16b,v31.16b | 
 | #endif | 
 | 	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b | 
 | 	cmp	$bsz,#8*($i+2) | 
 | 	blo	.Lprocess_block_ce | 
 | 	ldr	d31,[$inp],#8		// *inp++ | 
 | #ifdef	__AARCH64EB__ | 
 | 	rev64	v31.16b,v31.16b | 
 | #endif | 
 | 	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b | 
 | 	beq	.Lprocess_block_ce | 
 | ___ | 
 | } | 
 | $code.=<<___; | 
 | 	ldr	d31,[$inp],#8		// *inp++ | 
 | #ifdef	__AARCH64EB__ | 
 | 	rev64	v31.16b,v31.16b | 
 | #endif | 
 | 	eor	$A[4][4],$A[4][4],v31.16b | 
 |  | 
 | .Lprocess_block_ce: | 
 |  | 
 | 	bl	KeccakF1600_ce | 
 |  | 
 | 	b	.Loop_absorb_ce | 
 |  | 
 | .align	4 | 
 | .Labsorbed_ce: | 
 | ___ | 
 | # Write the state back; $i is non-lexical again and ends at 24. | 
 | for($i=0; $i<24; $i+=2) {		# store A[5][5] | 
 | my $j=$i+1; | 
 | $code.=<<___; | 
 | 	stp	d$i,d$j,[x0,#8*$i] | 
 | ___ | 
 | } | 
 | $code.=<<___; | 
 | 	str	d24,[x0,#8*$i] | 
 | 	add	x0,$len,$bsz		// return value | 
 |  | 
 | 	ldp	d8,d9,[sp,#16] | 
 | 	ldp	d10,d11,[sp,#32] | 
 | 	ldp	d12,d13,[sp,#48] | 
 | 	ldp	d14,d15,[sp,#64] | 
 | 	ldp	x29,x30,[sp],#80 | 
 | 	AARCH64_VALIDATE_LINK_REGISTER | 
 | 	ret | 
 | .size	SHA3_absorb_cext,.-SHA3_absorb_cext | 
 | ___ | 
 | } | 
 | { | 
 | # SHA3_squeeze_cext: arguments stay in x0..x3 ($ctx = state, $out = | 
 | # output pointer, $len = bytes still to produce, $bsz = block size). | 
 | # x9 walks over the state lanes and x10 counts down the bytes left in | 
 | # the current block; both are re-initialised after every call to | 
 | # KeccakF1600_cext.  Full lanes are stored eight bytes at a time | 
 | # (byte-reversed on big-endian builds); a final partial lane is stored | 
 | # byte by byte, least significant byte first. | 
 | # | 
 | # A zero $len returns straight away.  Without that guard the "cmp/blo" | 
 | # at the top of the loop would enter the byte tail, whose "subs ...,#1" | 
 | # wraps below zero and never hits "beq", so 7 bytes would be written to | 
 | # $out although none were requested. | 
 | my ($ctx,$out,$len,$bsz) = map("x$_",(0..3)); | 
 | $code.=<<___; | 
 | .globl	SHA3_squeeze_cext | 
 | .type	SHA3_squeeze_cext,%function | 
 | .align	5 | 
 | SHA3_squeeze_cext: | 
 | 	AARCH64_SIGN_LINK_REGISTER | 
 | 	stp	x29,x30,[sp,#-16]! | 
 | 	add	x29,sp,#0 | 
 | 	cbz	$len,.Lsqueeze_done_ce	// len == 0: nothing to emit | 
 | 	mov	x9,$ctx | 
 | 	mov	x10,$bsz | 
 |  | 
 | .Loop_squeeze_ce: | 
 | 	ldr	x4,[x9],#8 | 
 | 	cmp	$len,#8 | 
 | 	blo	.Lsqueeze_tail_ce | 
 | #ifdef	__AARCH64EB__ | 
 | 	rev	x4,x4 | 
 | #endif | 
 | 	str	x4,[$out],#8 | 
 | 	beq	.Lsqueeze_done_ce | 
 |  | 
 | 	sub	$len,$len,#8 | 
 | 	subs	x10,x10,#8 | 
 | 	bhi	.Loop_squeeze_ce | 
 |  | 
 | 	bl	KeccakF1600_cext | 
 | 	ldr	x30,[sp,#8] | 
 | 	mov	x9,$ctx | 
 | 	mov	x10,$bsz | 
 | 	b	.Loop_squeeze_ce | 
 |  | 
 | .align	4 | 
 | .Lsqueeze_tail_ce: | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done_ce | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done_ce | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done_ce | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done_ce | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done_ce | 
 | 	strb	w4,[$out],#1 | 
 | 	lsr	x4,x4,#8 | 
 | 	subs	$len,$len,#1 | 
 | 	beq	.Lsqueeze_done_ce | 
 | 	strb	w4,[$out],#1 | 
 |  | 
 | .Lsqueeze_done_ce: | 
 | 	ldr	x29,[sp],#16 | 
 | 	AARCH64_VALIDATE_LINK_REGISTER | 
 | 	ret | 
 | .size	SHA3_squeeze_cext,.-SHA3_squeeze_cext | 
 | ___ | 
 | }								}}} | 
 | # Identification string embedded in the generated assembly.  The "@" is | 
 | # escaped so that Perl does not try to interpolate an array inside the | 
 | # heredoc. | 
 | $code.=<<___; | 
 | .asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" | 
 | ___ | 
 |  | 
 | {   # Base instruction words of the four SHA3-extension mnemonics used in | 
 |     # this file; unsha3() ORs the operand fields into them. | 
 |     my  %opcode = ( | 
 | 	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000, | 
 | 	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	); | 
 |  | 
 |     # unsha3(MNEMONIC, OPERANDS) turns one eor3/rax1/xar/bcax line into a | 
 |     # ".inst 0x........" directive followed by the original text as an | 
 |     # assembler comment.  OPERANDS must start with at least two q/v | 
 |     # registers; a third register and a fourth field (a register number | 
 |     # or a "#expr" immediate, which is eval-ed) are optional.  The fields | 
 |     # are placed at bit 0, 5, 16 and 10 respectively.  When OPERANDS | 
 |     # cannot be parsed an empty string is returned. | 
 |     sub unsha3 { | 
 | 	my ($mnemonic,$arg)=@_; | 
 |  | 
 | 	return "" unless | 
 | 	    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/; | 
 |  | 
 | 	# Copy the captures before anything else can clobber them. | 
 | 	my ($rd,$rn,$rm,$last) = ($1,$2,$3,$4); | 
 | 	my $word = $opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|(eval($last)<<10); | 
 |  | 
 | 	return sprintf(".inst\t0x%08x\t//%s %s", $word, $mnemonic, $arg); | 
 |     } | 
 | } | 
 |  | 
 | # Post-process the accumulated assembly line by line and print it (STDOUT | 
 | # is the pipe into arm-xlate.pl): | 
 | #  - `...` spans are replaced by the result of eval-ing their contents; | 
 | #  - on an ld1r line ".16b" is rewritten to ".2d"; | 
 | #  - otherwise (or if the ld1r line had no ".16b") eor3/rax1/xar/bcax | 
 | #    instructions with vector operands are replaced by unsha3() output. | 
 | for (split("\n",$code)) { | 
 | 	s/\`([^\`]*)\`/eval($1)/ge; | 
 |  | 
 | 	unless (m/\bld1r\b/ && s/\.16b/.2d/g) { | 
 | 	    s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge; | 
 | 	} | 
 |  | 
 | 	print "$_\n"; | 
 | } | 
 |  | 
 | # Closing the pipe also reports a failure of the arm-xlate.pl child. | 
 | close(STDOUT) || die "error closing STDOUT: $!"; | 