| #! /usr/bin/env perl |
| # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License 2.0 (the "License"). You may not use |
| # this file except in compliance with the License. You can obtain a copy |
| # in the file LICENSE in the source distribution or at |
| # https://www.openssl.org/source/license.html |
| |
| # |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| # ==================================================================== |
| # |
| # January 2013 |
| # |
# This is an AESNI-CBC+SHA256 "stitch" implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and thus higher performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte (lower is better)
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
| # |
| # AES-128/-192/-256+SHA256 this(**) gain |
| # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% |
| # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% |
| # Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% |
| # Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40% |
| # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% |
| # Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54% |
| # Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60% |
| # |
# (*)   there are XOP, AVX1 and AVX2 code paths; Westmere is left out
#       because the estimated gain was not high enough to justify the
#       effort;
# (**)  these are EVP-free results; results obtained with 'speed
#       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
| # (***) these are SHAEXT results; |
| |
| # $output is the last argument if it looks like a file (it has an extension) |
| # $flavour is the first argument if it doesn't look like a file |
| $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; |
| $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; |
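
# A typical invocation, for illustration only (actual arguments come
# from the OpenSSL build system):
#
#	perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.s
#	perl aesni-sha256-x86_64.pl nasm aesni-sha256-x86_64.asm	# Win64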
| |
| $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| |
| $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| die "can't locate x86_64-xlate.pl"; |
| |
| if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
| =~ /GNU assembler version ([2-9]\.[0-9]+)/) { |
| $avx = ($1>=2.19) + ($1>=2.22); |
| } |
| |
| if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && |
| `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { |
| $avx = ($1>=2.09) + ($1>=2.10); |
| } |
| |
| if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && |
| `ml64 2>&1` =~ /Version ([0-9]+)\./) { |
| $avx = ($1>=10) + ($1>=12); |
| } |
| |
| if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { |
| $avx = ($2>=3.0) + ($2>3.0); |
| } |
| |
| $shaext=$avx; ### set to zero if compiling for 1.0.1 |
| $avx=1 if (!$shaext && $avx); |
| |
| open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" |
| or die "can't call $xlate: $!"; |
| *STDOUT=*OUT; |
| |
| $func="aesni_cbc_sha256_enc"; |
| $TABLE="K256"; |
| $SZ=4; |
| @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", |
| "%r8d","%r9d","%r10d","%r11d"); |
| ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi"); |
| @Sigma0=( 2,13,22); |
| @Sigma1=( 6,11,25); |
| @sigma0=( 7,18, 3); |
| @sigma1=(17,19,10); |
| $rounds=64; |
| |
| ######################################################################## |
| # void aesni_cbc_sha256_enc(const void *inp, |
| # void *out, |
| # size_t length, |
| # const AES_KEY *key, |
| # unsigned char *iv, |
| # SHA256_CTX *ctx, |
| # const void *in0); |
| ($inp, $out, $len, $key, $ivp, $ctx, $in0) = |
| ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); |
| |
| $Tbl="%rbp"; |
| |
| $_inp="16*$SZ+0*8(%rsp)"; |
| $_out="16*$SZ+1*8(%rsp)"; |
| $_end="16*$SZ+2*8(%rsp)"; |
| $_key="16*$SZ+3*8(%rsp)"; |
| $_ivp="16*$SZ+4*8(%rsp)"; |
| $_ctx="16*$SZ+5*8(%rsp)"; |
| $_in0="16*$SZ+6*8(%rsp)"; |
| $_rsp="`16*$SZ+7*8`(%rsp)"; |
| $framesz=16*$SZ+8*8; |
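
# Resulting stack frame layout (relative to the aligned %rsp), as
# implied by the offsets above:
#
#	0		16*$SZ bytes of X[] message schedule scratch
#	16*$SZ+0*8	saved $inp
#	16*$SZ+1*8	saved $out
#	16*$SZ+2*8	end-of-input pointer
#	16*$SZ+3*8	$key slot (unused, key stays in a register)
#	16*$SZ+4*8	saved $ivp
#	16*$SZ+5*8	saved $ctx
#	16*$SZ+6*8	saved $in0
#	16*$SZ+7*8	saved original %rsp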
| |
| $code=<<___; |
| .text |
| |
| .extern OPENSSL_ia32cap_P |
| .globl $func |
| .type $func,\@abi-omnipotent |
| .align 16 |
| $func: |
| .cfi_startproc |
| ___ |
| if ($avx) { |
| $code.=<<___; |
| lea OPENSSL_ia32cap_P(%rip),%r11 |
| mov \$1,%eax |
| cmp \$0,`$win64?"%rcx":"%rdi"` |
| je .Lprobe |
| mov 0(%r11),%eax |
| mov 4(%r11),%r10 |
| ___ |
| $code.=<<___ if ($shaext); |
| bt \$61,%r10 # check for SHA |
| jc ${func}_shaext |
| ___ |
| $code.=<<___; |
| mov %r10,%r11 |
| shr \$32,%r11 |
| |
| test \$`1<<11`,%r10d # check for XOP |
| jnz ${func}_xop |
| ___ |
| $code.=<<___ if ($avx>1); |
| and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 |
| cmp \$`1<<8|1<<5|1<<3`,%r11d |
| je ${func}_avx2 |
| ___ |
| $code.=<<___; |
| and \$`1<<28`,%r10d # check for AVX |
| jnz ${func}_avx |
| ud2 |
| ___ |
| } |
| $code.=<<___; |
| xor %eax,%eax |
| cmp \$0,`$win64?"%rcx":"%rdi"` |
| je .Lprobe |
| ud2 |
| .Lprobe: |
| ret |
| .cfi_endproc |
| .size $func,.-$func |
| |
| .align 64 |
| .type $TABLE,\@object |
| $TABLE: |
| .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
| .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
| .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
| .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
| .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 |
| .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 |
| .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 |
| .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 |
| .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc |
| .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc |
| .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da |
| .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da |
| .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 |
| .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 |
| .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 |
| .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 |
| .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 |
| .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 |
| .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 |
| .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 |
| .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 |
| .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 |
| .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 |
| .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 |
| .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 |
| .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 |
| .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 |
| .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 |
| .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 |
| .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 |
| .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
| .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
| |
| .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f |
| .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f |
| .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 |
| .long 0,0,0,0, 0,0,0,0 |
| .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| .align 64 |
| ___ |
| |
| ###################################################################### |
| # SIMD code paths |
| # |
| {{{ |
| ($iv,$inout,$roundkey,$temp, |
| $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); |
| |
| $aesni_cbc_idx=0; |
| @aesni_cbc_block = ( |
| ## &vmovdqu ($roundkey,"0x00-0x80($inp)");' |
| ## &vmovdqu ($inout,($inp)); |
| ## &mov ($_inp,$inp); |
| |
| '&vpxor ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x10-0x80($inp)");', |
| |
| '&vpxor ($inout,$inout,$iv);', |
| |
| '&vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x20-0x80($inp)");', |
| |
| '&vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x30-0x80($inp)");', |
| |
| '&vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x40-0x80($inp)");', |
| |
| '&vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x50-0x80($inp)");', |
| |
| '&vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x60-0x80($inp)");', |
| |
| '&vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x70-0x80($inp)");', |
| |
| '&vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x80-0x80($inp)");', |
| |
| '&vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x90-0x80($inp)");', |
| |
| '&vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");', |
| |
| '&vaesenclast ($temp,$inout,$roundkey);'. |
| ' &vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");', |
| |
| '&vpand ($iv,$temp,$mask10);'. |
| ' &vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");', |
| |
| '&vaesenclast ($temp,$inout,$roundkey);'. |
| ' &vaesenc ($inout,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");', |
| |
| '&vpand ($temp,$temp,$mask12);'. |
| ' &vaesenc ($inout,$inout,$roundkey);'. |
| '&vmovdqu ($roundkey,"0xe0-0x80($inp)");', |
| |
| '&vpor ($iv,$iv,$temp);'. |
| ' &vaesenclast ($temp,$inout,$roundkey);'. |
| ' &vmovdqu ($roundkey,"0x00-0x80($inp)");' |
| |
| ## &mov ($inp,$_inp); |
| ## &mov ($out,$_out); |
| ## &vpand ($temp,$temp,$mask14); |
| ## &vpor ($iv,$iv,$temp); |
| ## &vmovdqu ($iv,($out,$inp); |
| ## &lea (inp,16($inp)); |
| ); |
| |
| my $a4=$T1; |
| my ($a,$b,$c,$d,$e,$f,$g,$h); |
| |
| sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm |
| { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; |
| my $arg = pop; |
| $arg = "\$$arg" if ($arg*1 eq $arg); |
| $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; |
| } |
| |
| sub body_00_15 () { |
| ( |
| '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. |
| |
| '&ror ($a0,$Sigma1[2]-$Sigma1[1])', |
| '&mov ($a,$a1)', |
| '&mov ($a4,$f)', |
| |
| '&xor ($a0,$e)', |
| '&ror ($a1,$Sigma0[2]-$Sigma0[1])', |
| '&xor ($a4,$g)', # f^g |
| |
| '&ror ($a0,$Sigma1[1]-$Sigma1[0])', |
| '&xor ($a1,$a)', |
| '&and ($a4,$e)', # (f^g)&e |
| |
| @aesni_cbc_block[$aesni_cbc_idx++]. |
| '&xor ($a0,$e)', |
| '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] |
| '&mov ($a2,$a)', |
| |
| '&ror ($a1,$Sigma0[1]-$Sigma0[0])', |
| '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g |
| '&xor ($a2,$b)', # a^b, b^c in next round |
| |
| '&ror ($a0,$Sigma1[0])', # Sigma1(e) |
| '&add ($h,$a4)', # h+=Ch(e,f,g) |
| '&and ($a3,$a2)', # (b^c)&(a^b) |
| |
| '&xor ($a1,$a)', |
| '&add ($h,$a0)', # h+=Sigma1(e) |
| '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) |
| |
| '&add ($d,$h)', # d+=h |
| '&ror ($a1,$Sigma0[0])', # Sigma0(a) |
| '&add ($h,$a3)', # h+=Maj(a,b,c) |
| |
| '&mov ($a0,$d)', |
| '&add ($a1,$h);'. # h+=Sigma0(a) |
| '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' |
| ); |
| } |
| |
| if ($avx) {{ |
| ###################################################################### |
| # XOP code path |
| # |
| $code.=<<___; |
| .type ${func}_xop,\@function,6 |
| .align 64 |
| ${func}_xop: |
| .cfi_startproc |
| .Lxop_shortcut: |
| mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter |
| mov %rsp,%rax # copy %rsp |
| .cfi_def_cfa_register %rax |
| push %rbx |
| .cfi_push %rbx |
| push %rbp |
| .cfi_push %rbp |
| push %r12 |
| .cfi_push %r12 |
| push %r13 |
| .cfi_push %r13 |
| push %r14 |
| .cfi_push %r14 |
| push %r15 |
| .cfi_push %r15 |
| sub \$`$framesz+$win64*16*10`,%rsp |
| and \$-64,%rsp # align stack frame |
| |
| shl \$6,$len |
| sub $inp,$out # re-bias |
| sub $inp,$in0 |
| add $inp,$len # end of input |
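
# $out and $in0 now hold deltas against $inp, so a single moving
# pointer (%r12 below) can index all three streams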
| |
| #mov $inp,$_inp # saved later |
| mov $out,$_out |
| mov $len,$_end |
| #mov $key,$_key # remains resident in $inp register |
| mov $ivp,$_ivp |
| mov $ctx,$_ctx |
| mov $in0,$_in0 |
| mov %rax,$_rsp |
| .cfi_cfa_expression $_rsp,deref,+8 |
| ___ |
| $code.=<<___ if ($win64); |
| movaps %xmm6,`$framesz+16*0`(%rsp) |
| movaps %xmm7,`$framesz+16*1`(%rsp) |
| movaps %xmm8,`$framesz+16*2`(%rsp) |
| movaps %xmm9,`$framesz+16*3`(%rsp) |
| movaps %xmm10,`$framesz+16*4`(%rsp) |
| movaps %xmm11,`$framesz+16*5`(%rsp) |
| movaps %xmm12,`$framesz+16*6`(%rsp) |
| movaps %xmm13,`$framesz+16*7`(%rsp) |
| movaps %xmm14,`$framesz+16*8`(%rsp) |
| movaps %xmm15,`$framesz+16*9`(%rsp) |
| ___ |
| $code.=<<___; |
| .Lprologue_xop: |
| vzeroall |
| |
| mov $inp,%r12 # borrow $a4 |
| lea 0x80($key),$inp # size optimization, reassign |
| lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 |
| mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 |
| mov $ctx,%r15 # borrow $a2 |
| mov $in0,%rsi # borrow $a3 |
| vmovdqu ($ivp),$iv # load IV |
| sub \$9,%r14 |
| |
| mov $SZ*0(%r15),$A |
| mov $SZ*1(%r15),$B |
| mov $SZ*2(%r15),$C |
| mov $SZ*3(%r15),$D |
| mov $SZ*4(%r15),$E |
| mov $SZ*5(%r15),$F |
| mov $SZ*6(%r15),$G |
| mov $SZ*7(%r15),$H |
| |
| vmovdqa 0x00(%r13,%r14,8),$mask14 |
| vmovdqa 0x10(%r13,%r14,8),$mask12 |
| vmovdqa 0x20(%r13,%r14,8),$mask10 |
| vmovdqu 0x00-0x80($inp),$roundkey |
| jmp .Lloop_xop |
| ___ |
| if ($SZ==4) { # SHA256 |
| my @X = map("%xmm$_",(0..3)); |
| my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); |
| |
| $code.=<<___; |
| .align 16 |
| .Lloop_xop: |
| vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 |
| vmovdqu 0x00(%rsi,%r12),@X[0] |
| vmovdqu 0x10(%rsi,%r12),@X[1] |
| vmovdqu 0x20(%rsi,%r12),@X[2] |
| vmovdqu 0x30(%rsi,%r12),@X[3] |
| vpshufb $t3,@X[0],@X[0] |
| lea $TABLE(%rip),$Tbl |
| vpshufb $t3,@X[1],@X[1] |
| vpshufb $t3,@X[2],@X[2] |
| vpaddd 0x00($Tbl),@X[0],$t0 |
| vpshufb $t3,@X[3],@X[3] |
| vpaddd 0x20($Tbl),@X[1],$t1 |
| vpaddd 0x40($Tbl),@X[2],$t2 |
| vpaddd 0x60($Tbl),@X[3],$t3 |
| vmovdqa $t0,0x00(%rsp) |
| mov $A,$a1 |
| vmovdqa $t1,0x10(%rsp) |
| mov $B,$a3 |
| vmovdqa $t2,0x20(%rsp) |
| xor $C,$a3 # magic |
| vmovdqa $t3,0x30(%rsp) |
| mov $E,$a0 |
| jmp .Lxop_00_47 |
| |
| .align 16 |
| .Lxop_00_47: |
| sub \$-16*2*$SZ,$Tbl # size optimization |
| vmovdqu (%r12),$inout # $a4 |
| mov %r12,$_inp # $a4 |
| ___ |
| sub XOP_256_00_47 () { |
| my $j = shift; |
| my $body = shift; |
| my @X = @_; |
| my @insns = (&$body,&$body,&$body,&$body); # 104 instructions |
| |
| &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpsrld ($t0,$t0,$sigma0[2]); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpxor ($t0,$t0,$t1); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpsrld ($t2,@X[3],$sigma1[2]); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpxor ($t3,$t3,$t2); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpsrldq ($t3,$t3,8); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpsrld ($t2,@X[0],$sigma1[2]); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpxor ($t3,$t3,$t2); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpslldq ($t3,$t3,8); # 22 instructions |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); |
| foreach (@insns) { eval; } # remaining instructions |
| &vmovdqa (16*$j."(%rsp)",$t2); |
| } |
| |
| $aesni_cbc_idx=0; |
| for ($i=0,$j=0; $j<4; $j++) { |
| &XOP_256_00_47($j,\&body_00_15,@X); |
| push(@X,shift(@X)); # rotate(@X) |
| } |
| &mov ("%r12",$_inp); # borrow $a4 |
| &vpand ($temp,$temp,$mask14); |
| &mov ("%r15",$_out); # borrow $a2 |
| &vpor ($iv,$iv,$temp); |
| &vmovdqu ("(%r15,%r12)",$iv); # write output |
| &lea ("%r12","16(%r12)"); # inp++ |
| |
| &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); |
| &jne (".Lxop_00_47"); |
| |
| &vmovdqu ($inout,"(%r12)"); |
| &mov ($_inp,"%r12"); |
| |
| $aesni_cbc_idx=0; |
| for ($i=0; $i<16; ) { |
| foreach(body_00_15()) { eval; } |
| } |
| } |
| $code.=<<___; |
| mov $_inp,%r12 # borrow $a4 |
| mov $_out,%r13 # borrow $a0 |
| mov $_ctx,%r15 # borrow $a2 |
| mov $_in0,%rsi # borrow $a3 |
| |
| vpand $mask14,$temp,$temp |
| mov $a1,$A |
| vpor $temp,$iv,$iv |
| vmovdqu $iv,(%r13,%r12) # write output |
| lea 16(%r12),%r12 # inp++ |
| |
| add $SZ*0(%r15),$A |
| add $SZ*1(%r15),$B |
| add $SZ*2(%r15),$C |
| add $SZ*3(%r15),$D |
| add $SZ*4(%r15),$E |
| add $SZ*5(%r15),$F |
| add $SZ*6(%r15),$G |
| add $SZ*7(%r15),$H |
| |
| cmp $_end,%r12 |
| |
| mov $A,$SZ*0(%r15) |
| mov $B,$SZ*1(%r15) |
| mov $C,$SZ*2(%r15) |
| mov $D,$SZ*3(%r15) |
| mov $E,$SZ*4(%r15) |
| mov $F,$SZ*5(%r15) |
| mov $G,$SZ*6(%r15) |
| mov $H,$SZ*7(%r15) |
| |
| jb .Lloop_xop |
| |
| mov $_ivp,$ivp |
| mov $_rsp,%rsi |
| .cfi_def_cfa %rsi,8 |
| vmovdqu $iv,($ivp) # output IV |
| vzeroall |
| ___ |
| $code.=<<___ if ($win64); |
| movaps `$framesz+16*0`(%rsp),%xmm6 |
| movaps `$framesz+16*1`(%rsp),%xmm7 |
| movaps `$framesz+16*2`(%rsp),%xmm8 |
| movaps `$framesz+16*3`(%rsp),%xmm9 |
| movaps `$framesz+16*4`(%rsp),%xmm10 |
| movaps `$framesz+16*5`(%rsp),%xmm11 |
| movaps `$framesz+16*6`(%rsp),%xmm12 |
| movaps `$framesz+16*7`(%rsp),%xmm13 |
| movaps `$framesz+16*8`(%rsp),%xmm14 |
| movaps `$framesz+16*9`(%rsp),%xmm15 |
| ___ |
| $code.=<<___; |
| mov -48(%rsi),%r15 |
| .cfi_restore %r15 |
| mov -40(%rsi),%r14 |
| .cfi_restore %r14 |
| mov -32(%rsi),%r13 |
| .cfi_restore %r13 |
| mov -24(%rsi),%r12 |
| .cfi_restore %r12 |
| mov -16(%rsi),%rbp |
| .cfi_restore %rbp |
| mov -8(%rsi),%rbx |
| .cfi_restore %rbx |
| lea (%rsi),%rsp |
| .cfi_def_cfa_register %rsp |
| .Lepilogue_xop: |
| ret |
| .cfi_endproc |
| .size ${func}_xop,.-${func}_xop |
| ___ |
| ###################################################################### |
| # AVX+shrd code path |
| # |
| local *ror = sub { &shrd(@_[0],@_) }; |
| |
| $code.=<<___; |
| .type ${func}_avx,\@function,6 |
| .align 64 |
| ${func}_avx: |
| .cfi_startproc |
| .Lavx_shortcut: |
| mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter |
| mov %rsp,%rax # copy %rsp |
| .cfi_def_cfa_register %rax |
| push %rbx |
| .cfi_push %rbx |
| push %rbp |
| .cfi_push %rbp |
| push %r12 |
| .cfi_push %r12 |
| push %r13 |
| .cfi_push %r13 |
| push %r14 |
| .cfi_push %r14 |
| push %r15 |
| .cfi_push %r15 |
| sub \$`$framesz+$win64*16*10`,%rsp |
| and \$-64,%rsp # align stack frame |
| |
| shl \$6,$len |
| sub $inp,$out # re-bias |
| sub $inp,$in0 |
| add $inp,$len # end of input |
| |
| #mov $inp,$_inp # saved later |
| mov $out,$_out |
| mov $len,$_end |
| #mov $key,$_key # remains resident in $inp register |
| mov $ivp,$_ivp |
| mov $ctx,$_ctx |
| mov $in0,$_in0 |
| mov %rax,$_rsp |
| .cfi_cfa_expression $_rsp,deref,+8 |
| ___ |
| $code.=<<___ if ($win64); |
| movaps %xmm6,`$framesz+16*0`(%rsp) |
| movaps %xmm7,`$framesz+16*1`(%rsp) |
| movaps %xmm8,`$framesz+16*2`(%rsp) |
| movaps %xmm9,`$framesz+16*3`(%rsp) |
| movaps %xmm10,`$framesz+16*4`(%rsp) |
| movaps %xmm11,`$framesz+16*5`(%rsp) |
| movaps %xmm12,`$framesz+16*6`(%rsp) |
| movaps %xmm13,`$framesz+16*7`(%rsp) |
| movaps %xmm14,`$framesz+16*8`(%rsp) |
| movaps %xmm15,`$framesz+16*9`(%rsp) |
| ___ |
| $code.=<<___; |
| .Lprologue_avx: |
| vzeroall |
| |
| mov $inp,%r12 # borrow $a4 |
| lea 0x80($key),$inp # size optimization, reassign |
| lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 |
| mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 |
| mov $ctx,%r15 # borrow $a2 |
| mov $in0,%rsi # borrow $a3 |
| vmovdqu ($ivp),$iv # load IV |
| sub \$9,%r14 |
| |
| mov $SZ*0(%r15),$A |
| mov $SZ*1(%r15),$B |
| mov $SZ*2(%r15),$C |
| mov $SZ*3(%r15),$D |
| mov $SZ*4(%r15),$E |
| mov $SZ*5(%r15),$F |
| mov $SZ*6(%r15),$G |
| mov $SZ*7(%r15),$H |
| |
| vmovdqa 0x00(%r13,%r14,8),$mask14 |
| vmovdqa 0x10(%r13,%r14,8),$mask12 |
| vmovdqa 0x20(%r13,%r14,8),$mask10 |
| vmovdqu 0x00-0x80($inp),$roundkey |
| ___ |
| if ($SZ==4) { # SHA256 |
| my @X = map("%xmm$_",(0..3)); |
| my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); |
| |
| $code.=<<___; |
| jmp .Lloop_avx |
| .align 16 |
| .Lloop_avx: |
| vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 |
| vmovdqu 0x00(%rsi,%r12),@X[0] |
| vmovdqu 0x10(%rsi,%r12),@X[1] |
| vmovdqu 0x20(%rsi,%r12),@X[2] |
| vmovdqu 0x30(%rsi,%r12),@X[3] |
| vpshufb $t3,@X[0],@X[0] |
| lea $TABLE(%rip),$Tbl |
| vpshufb $t3,@X[1],@X[1] |
| vpshufb $t3,@X[2],@X[2] |
| vpaddd 0x00($Tbl),@X[0],$t0 |
| vpshufb $t3,@X[3],@X[3] |
| vpaddd 0x20($Tbl),@X[1],$t1 |
| vpaddd 0x40($Tbl),@X[2],$t2 |
| vpaddd 0x60($Tbl),@X[3],$t3 |
| vmovdqa $t0,0x00(%rsp) |
| mov $A,$a1 |
| vmovdqa $t1,0x10(%rsp) |
| mov $B,$a3 |
| vmovdqa $t2,0x20(%rsp) |
| xor $C,$a3 # magic |
| vmovdqa $t3,0x30(%rsp) |
| mov $E,$a0 |
| jmp .Lavx_00_47 |
| |
| .align 16 |
| .Lavx_00_47: |
| sub \$-16*2*$SZ,$Tbl # size optimization |
| vmovdqu (%r12),$inout # $a4 |
| mov %r12,$_inp # $a4 |
| ___ |
| sub Xupdate_256_AVX () { |
| ( |
| '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] |
| '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] |
| '&vpsrld ($t2,$t0,$sigma0[0]);', |
| '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] |
| '&vpsrld ($t3,$t0,$sigma0[2])', |
| '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', |
| '&vpxor ($t0,$t3,$t2)', |
| '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] |
| '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', |
| '&vpxor ($t0,$t0,$t1)', |
| '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', |
| '&vpxor ($t0,$t0,$t2)', |
| '&vpsrld ($t2,$t3,$sigma1[2]);', |
| '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) |
| '&vpsrlq ($t3,$t3,$sigma1[0]);', |
| '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) |
| '&vpxor ($t2,$t2,$t3);', |
| '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', |
| '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) |
| '&vpshufd ($t2,$t2,0b10000100)', |
| '&vpsrldq ($t2,$t2,8)', |
| '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) |
| '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] |
| '&vpsrld ($t2,$t3,$sigma1[2])', |
| '&vpsrlq ($t3,$t3,$sigma1[0])', |
| '&vpxor ($t2,$t2,$t3);', |
| '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', |
| '&vpxor ($t2,$t2,$t3)', |
| '&vpshufd ($t2,$t2,0b11101000)', |
| '&vpslldq ($t2,$t2,8)', |
| '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) |
| ); |
| } |
| |
| sub AVX_256_00_47 () { |
| my $j = shift; |
| my $body = shift; |
| my @X = @_; |
| my @insns = (&$body,&$body,&$body,&$body); # 104 instructions |
| |
| foreach (Xupdate_256_AVX()) { # 29 instructions |
| eval; |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| } |
| &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); |
| foreach (@insns) { eval; } # remaining instructions |
| &vmovdqa (16*$j."(%rsp)",$t2); |
| } |
| |
| $aesni_cbc_idx=0; |
| for ($i=0,$j=0; $j<4; $j++) { |
| &AVX_256_00_47($j,\&body_00_15,@X); |
| push(@X,shift(@X)); # rotate(@X) |
| } |
| &mov ("%r12",$_inp); # borrow $a4 |
| &vpand ($temp,$temp,$mask14); |
| &mov ("%r15",$_out); # borrow $a2 |
| &vpor ($iv,$iv,$temp); |
| &vmovdqu ("(%r15,%r12)",$iv); # write output |
| &lea ("%r12","16(%r12)"); # inp++ |
| |
| &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); |
| &jne (".Lavx_00_47"); |
| |
| &vmovdqu ($inout,"(%r12)"); |
| &mov ($_inp,"%r12"); |
| |
| $aesni_cbc_idx=0; |
| for ($i=0; $i<16; ) { |
| foreach(body_00_15()) { eval; } |
| } |
| |
| } |
| $code.=<<___; |
| mov $_inp,%r12 # borrow $a4 |
| mov $_out,%r13 # borrow $a0 |
| mov $_ctx,%r15 # borrow $a2 |
| mov $_in0,%rsi # borrow $a3 |
| |
| vpand $mask14,$temp,$temp |
| mov $a1,$A |
| vpor $temp,$iv,$iv |
| vmovdqu $iv,(%r13,%r12) # write output |
| lea 16(%r12),%r12 # inp++ |
| |
| add $SZ*0(%r15),$A |
| add $SZ*1(%r15),$B |
| add $SZ*2(%r15),$C |
| add $SZ*3(%r15),$D |
| add $SZ*4(%r15),$E |
| add $SZ*5(%r15),$F |
| add $SZ*6(%r15),$G |
| add $SZ*7(%r15),$H |
| |
| cmp $_end,%r12 |
| |
| mov $A,$SZ*0(%r15) |
| mov $B,$SZ*1(%r15) |
| mov $C,$SZ*2(%r15) |
| mov $D,$SZ*3(%r15) |
| mov $E,$SZ*4(%r15) |
| mov $F,$SZ*5(%r15) |
| mov $G,$SZ*6(%r15) |
| mov $H,$SZ*7(%r15) |
| jb .Lloop_avx |
| |
| mov $_ivp,$ivp |
| mov $_rsp,%rsi |
| .cfi_def_cfa %rsi,8 |
| vmovdqu $iv,($ivp) # output IV |
| vzeroall |
| ___ |
| $code.=<<___ if ($win64); |
| movaps `$framesz+16*0`(%rsp),%xmm6 |
| movaps `$framesz+16*1`(%rsp),%xmm7 |
| movaps `$framesz+16*2`(%rsp),%xmm8 |
| movaps `$framesz+16*3`(%rsp),%xmm9 |
| movaps `$framesz+16*4`(%rsp),%xmm10 |
| movaps `$framesz+16*5`(%rsp),%xmm11 |
| movaps `$framesz+16*6`(%rsp),%xmm12 |
| movaps `$framesz+16*7`(%rsp),%xmm13 |
| movaps `$framesz+16*8`(%rsp),%xmm14 |
| movaps `$framesz+16*9`(%rsp),%xmm15 |
| ___ |
| $code.=<<___; |
| mov -48(%rsi),%r15 |
| .cfi_restore %r15 |
| mov -40(%rsi),%r14 |
| .cfi_restore %r14 |
| mov -32(%rsi),%r13 |
| .cfi_restore %r13 |
| mov -24(%rsi),%r12 |
| .cfi_restore %r12 |
| mov -16(%rsi),%rbp |
| .cfi_restore %rbp |
| mov -8(%rsi),%rbx |
| .cfi_restore %rbx |
| lea (%rsi),%rsp |
| .cfi_def_cfa_register %rsp |
| .Lepilogue_avx: |
| ret |
| .cfi_endproc |
| .size ${func}_avx,.-${func}_avx |
| ___ |
| |
| if ($avx>1) {{ |
| ###################################################################### |
| # AVX2+BMI code path |
| # |
| my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp |
| my $PUSH8=8*2*$SZ; |
| use integer; |
| |
| sub bodyx_00_15 () { |
	# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
| ( |
| '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. |
| |
| '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] |
| '&and ($a4,$e)', # f&e |
| '&rorx ($a0,$e,$Sigma1[2])', |
| '&rorx ($a2,$e,$Sigma1[1])', |
| |
| '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past |
| '&lea ($h,"($h,$a4)")', |
| '&andn ($a4,$e,$g)', # ~e&g |
| '&xor ($a0,$a2)', |
| |
| '&rorx ($a1,$e,$Sigma1[0])', |
| '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) |
| '&xor ($a0,$a1)', # Sigma1(e) |
| '&mov ($a2,$a)', |
| |
| '&rorx ($a4,$a,$Sigma0[2])', |
| '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) |
| '&xor ($a2,$b)', # a^b, b^c in next round |
| '&rorx ($a1,$a,$Sigma0[1])', |
| |
| '&rorx ($a0,$a,$Sigma0[0])', |
| '&lea ($d,"($d,$h)")', # d+=h |
| '&and ($a3,$a2)', # (b^c)&(a^b) |
| @aesni_cbc_block[$aesni_cbc_idx++]. |
| '&xor ($a1,$a4)', |
| |
| '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) |
| '&xor ($a1,$a0)', # Sigma0(a) |
| '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) |
| '&mov ($a4,$e)', # copy of f in future |
| |
| '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' |
| ); |
	# and at the finish one has to add $a1 to $a ($a += $a1)
| } |
| |
| $code.=<<___; |
| .type ${func}_avx2,\@function,6 |
| .align 64 |
| ${func}_avx2: |
| .cfi_startproc |
| .Lavx2_shortcut: |
| mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter |
| mov %rsp,%rax # copy %rsp |
| .cfi_def_cfa_register %rax |
| push %rbx |
| .cfi_push %rbx |
| push %rbp |
| .cfi_push %rbp |
| push %r12 |
| .cfi_push %r12 |
| push %r13 |
| .cfi_push %r13 |
| push %r14 |
| .cfi_push %r14 |
| push %r15 |
| .cfi_push %r15 |
| sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp |
| and \$-256*$SZ,%rsp # align stack frame |
| add \$`2*$SZ*($rounds-8)`,%rsp |
| |
| shl \$6,$len |
| sub $inp,$out # re-bias |
| sub $inp,$in0 |
| add $inp,$len # end of input |
| |
| #mov $inp,$_inp # saved later |
| #mov $out,$_out # kept in $offload |
| mov $len,$_end |
| #mov $key,$_key # remains resident in $inp register |
| mov $ivp,$_ivp |
| mov $ctx,$_ctx |
| mov $in0,$_in0 |
| mov %rax,$_rsp |
| .cfi_cfa_expression $_rsp,deref,+8 |
| ___ |
| $code.=<<___ if ($win64); |
| movaps %xmm6,`$framesz+16*0`(%rsp) |
| movaps %xmm7,`$framesz+16*1`(%rsp) |
| movaps %xmm8,`$framesz+16*2`(%rsp) |
| movaps %xmm9,`$framesz+16*3`(%rsp) |
| movaps %xmm10,`$framesz+16*4`(%rsp) |
| movaps %xmm11,`$framesz+16*5`(%rsp) |
| movaps %xmm12,`$framesz+16*6`(%rsp) |
| movaps %xmm13,`$framesz+16*7`(%rsp) |
| movaps %xmm14,`$framesz+16*8`(%rsp) |
| movaps %xmm15,`$framesz+16*9`(%rsp) |
| ___ |
| $code.=<<___; |
| .Lprologue_avx2: |
| vzeroall |
| |
| mov $inp,%r13 # borrow $a0 |
| vpinsrq \$1,$out,$offload,$offload |
| lea 0x80($key),$inp # size optimization, reassign |
| lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 |
| mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 |
| mov $ctx,%r15 # borrow $a2 |
| mov $in0,%rsi # borrow $a3 |
| vmovdqu ($ivp),$iv # load IV |
| lea -9(%r14),%r14 |
| |
| vmovdqa 0x00(%r12,%r14,8),$mask14 |
| vmovdqa 0x10(%r12,%r14,8),$mask12 |
| vmovdqa 0x20(%r12,%r14,8),$mask10 |
| |
| sub \$-16*$SZ,%r13 # inp++, size optimization |
| mov $SZ*0(%r15),$A |
| lea (%rsi,%r13),%r12 # borrow $a0 |
| mov $SZ*1(%r15),$B |
| cmp $len,%r13 # $_end |
| mov $SZ*2(%r15),$C |
| cmove %rsp,%r12 # next block or random data |
| mov $SZ*3(%r15),$D |
| mov $SZ*4(%r15),$E |
| mov $SZ*5(%r15),$F |
| mov $SZ*6(%r15),$G |
| mov $SZ*7(%r15),$H |
| vmovdqu 0x00-0x80($inp),$roundkey |
| ___ |
| if ($SZ==4) { # SHA256 |
| my @X = map("%ymm$_",(0..3)); |
| my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); |
| |
| $code.=<<___; |
| jmp .Loop_avx2 |
| .align 16 |
| .Loop_avx2: |
| vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 |
| vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 |
| vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 |
| vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 |
| vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 |
| |
| vinserti128 \$1,(%r12),@X[0],@X[0] |
| vinserti128 \$1,16(%r12),@X[1],@X[1] |
| vpshufb $t3,@X[0],@X[0] |
| vinserti128 \$1,32(%r12),@X[2],@X[2] |
| vpshufb $t3,@X[1],@X[1] |
| vinserti128 \$1,48(%r12),@X[3],@X[3] |
| |
| lea $TABLE(%rip),$Tbl |
| vpshufb $t3,@X[2],@X[2] |
| lea -16*$SZ(%r13),%r13 |
| vpaddd 0x00($Tbl),@X[0],$t0 |
| vpshufb $t3,@X[3],@X[3] |
| vpaddd 0x20($Tbl),@X[1],$t1 |
| vpaddd 0x40($Tbl),@X[2],$t2 |
| vpaddd 0x60($Tbl),@X[3],$t3 |
| vmovdqa $t0,0x00(%rsp) |
| xor $a1,$a1 |
| vmovdqa $t1,0x20(%rsp) |
| ___ |
| $code.=<<___ if (!$win64); |
| # temporarily use %rsi as frame pointer |
| mov $_rsp,%rsi |
| .cfi_def_cfa %rsi,8 |
| ___ |
| $code.=<<___; |
| lea -$PUSH8(%rsp),%rsp |
| ___ |
| $code.=<<___ if (!$win64); |
| # the frame info is at $_rsp, but the stack is moving... |
| # so a second frame pointer is saved at -8(%rsp) |
| # that is in the red zone |
| mov %rsi,-8(%rsp) |
| .cfi_cfa_expression %rsp-8,deref,+8 |
| ___ |
| $code.=<<___; |
| mov $B,$a3 |
| vmovdqa $t2,0x00(%rsp) |
| xor $C,$a3 # magic |
| vmovdqa $t3,0x20(%rsp) |
| mov $F,$a4 |
| sub \$-16*2*$SZ,$Tbl # size optimization |
| jmp .Lavx2_00_47 |
| |
| .align 16 |
| .Lavx2_00_47: |
| vmovdqu (%r13),$inout |
| vpinsrq \$0,%r13,$offload,$offload |
| ___ |
| |
| sub AVX2_256_00_47 () { |
| my $j = shift; |
| my $body = shift; |
| my @X = @_; |
| my @insns = (&$body,&$body,&$body,&$body); # 96 instructions |
| my $base = "+2*$PUSH8(%rsp)"; |
| |
| if (($j%2)==0) { |
| &lea ("%rsp","-$PUSH8(%rsp)"); |
| $code.=<<___ if (!$win64); |
| .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 |
| # copy secondary frame pointer to new location again at -8(%rsp) |
| pushq $PUSH8-8(%rsp) |
| .cfi_cfa_expression %rsp,deref,+8 |
| lea 8(%rsp),%rsp |
| .cfi_cfa_expression %rsp-8,deref,+8 |
| ___ |
| } |
| foreach (Xupdate_256_AVX()) { # 29 instructions |
| eval; |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| eval(shift(@insns)); |
| } |
| &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); |
| foreach (@insns) { eval; } # remaining instructions |
| &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); |
| } |
| $aesni_cbc_idx=0; |
| for ($i=0,$j=0; $j<4; $j++) { |
| &AVX2_256_00_47($j,\&bodyx_00_15,@X); |
| push(@X,shift(@X)); # rotate(@X) |
| } |
| &vmovq ("%r13",$offload); # borrow $a0 |
| &vpextrq ("%r15",$offload,1); # borrow $a2 |
| &vpand ($temp,$temp,$mask14); |
| &vpor ($iv,$iv,$temp); |
| &vmovdqu ("(%r15,%r13)",$iv); # write output |
| &lea ("%r13","16(%r13)"); # inp++ |
| |
| &lea ($Tbl,16*2*$SZ."($Tbl)"); |
| &cmpb (($SZ-1)."($Tbl)",0); |
| &jne (".Lavx2_00_47"); |
| |
| &vmovdqu ($inout,"(%r13)"); |
| &vpinsrq ($offload,$offload,"%r13",0); |
| |
| $aesni_cbc_idx=0; |
| for ($i=0; $i<16; ) { |
| my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; |
| foreach(bodyx_00_15()) { eval; } |
| } |
| } |
| $code.=<<___; |
| vpextrq \$1,$offload,%r12 # $_out, borrow $a4 |
| vmovq $offload,%r13 # $_inp, borrow $a0 |
| mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 |
| add $a1,$A |
| lea `2*$SZ*($rounds-8)`(%rsp),$Tbl |
| |
| vpand $mask14,$temp,$temp |
| vpor $temp,$iv,$iv |
| vmovdqu $iv,(%r12,%r13) # write output |
| lea 16(%r13),%r13 |
| |
| add $SZ*0(%r15),$A |
| add $SZ*1(%r15),$B |
| add $SZ*2(%r15),$C |
| add $SZ*3(%r15),$D |
| add $SZ*4(%r15),$E |
| add $SZ*5(%r15),$F |
| add $SZ*6(%r15),$G |
| add $SZ*7(%r15),$H |
| |
| mov $A,$SZ*0(%r15) |
| mov $B,$SZ*1(%r15) |
| mov $C,$SZ*2(%r15) |
| mov $D,$SZ*3(%r15) |
| mov $E,$SZ*4(%r15) |
| mov $F,$SZ*5(%r15) |
| mov $G,$SZ*6(%r15) |
| mov $H,$SZ*7(%r15) |
| |
| cmp `$PUSH8+2*8`($Tbl),%r13 # $_end |
| je .Ldone_avx2 |
| |
| xor $a1,$a1 |
| mov $B,$a3 |
| mov $F,$a4 |
| xor $C,$a3 # magic |
| jmp .Lower_avx2 |
| .align 16 |
| .Lower_avx2: |
| vmovdqu (%r13),$inout |
| vpinsrq \$0,%r13,$offload,$offload |
| ___ |
| $aesni_cbc_idx=0; |
| for ($i=0; $i<16; ) { |
| my $base="+16($Tbl)"; |
| foreach(bodyx_00_15()) { eval; } |
| &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); |
| } |
| $code.=<<___; |
| vmovq $offload,%r13 # borrow $a0 |
| vpextrq \$1,$offload,%r15 # borrow $a2 |
| vpand $mask14,$temp,$temp |
| vpor $temp,$iv,$iv |
| lea -$PUSH8($Tbl),$Tbl |
| vmovdqu $iv,(%r15,%r13) # write output |
| lea 16(%r13),%r13 # inp++ |
| cmp %rsp,$Tbl |
| jae .Lower_avx2 |
| |
| mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 |
| lea 16*$SZ(%r13),%r13 |
| mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 |
| add $a1,$A |
| lea `2*$SZ*($rounds-8)`(%rsp),%rsp |
| |
| add $SZ*0(%r15),$A |
| add $SZ*1(%r15),$B |
| add $SZ*2(%r15),$C |
| add $SZ*3(%r15),$D |
| add $SZ*4(%r15),$E |
| add $SZ*5(%r15),$F |
| add $SZ*6(%r15),$G |
| lea (%rsi,%r13),%r12 |
| add $SZ*7(%r15),$H |
| |
| cmp $_end,%r13 |
| |
| mov $A,$SZ*0(%r15) |
| cmove %rsp,%r12 # next block or stale data |
| mov $B,$SZ*1(%r15) |
| mov $C,$SZ*2(%r15) |
| mov $D,$SZ*3(%r15) |
| mov $E,$SZ*4(%r15) |
| mov $F,$SZ*5(%r15) |
| mov $G,$SZ*6(%r15) |
| mov $H,$SZ*7(%r15) |
| |
| jbe .Loop_avx2 |
| lea (%rsp),$Tbl |
| # temporarily use $Tbl as index to $_rsp |
| # this avoids the need to save a secondary frame pointer at -8(%rsp) |
| .cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8 |
| |
| .Ldone_avx2: |
| mov 16*$SZ+4*8($Tbl),$ivp |
| mov 16*$SZ+7*8($Tbl),%rsi |
| .cfi_def_cfa %rsi,8 |
| vmovdqu $iv,($ivp) # output IV |
| vzeroall |
| ___ |
| $code.=<<___ if ($win64); |
| movaps `$framesz+16*0`($Tbl),%xmm6 |
| movaps `$framesz+16*1`($Tbl),%xmm7 |
| movaps `$framesz+16*2`($Tbl),%xmm8 |
| movaps `$framesz+16*3`($Tbl),%xmm9 |
| movaps `$framesz+16*4`($Tbl),%xmm10 |
| movaps `$framesz+16*5`($Tbl),%xmm11 |
| movaps `$framesz+16*6`($Tbl),%xmm12 |
| movaps `$framesz+16*7`($Tbl),%xmm13 |
| movaps `$framesz+16*8`($Tbl),%xmm14 |
| movaps `$framesz+16*9`($Tbl),%xmm15 |
| ___ |
| $code.=<<___; |
| mov -48(%rsi),%r15 |
| .cfi_restore %r15 |
| mov -40(%rsi),%r14 |
| .cfi_restore %r14 |
| mov -32(%rsi),%r13 |
| .cfi_restore %r13 |
| mov -24(%rsi),%r12 |
| .cfi_restore %r12 |
| mov -16(%rsi),%rbp |
| .cfi_restore %rbp |
| mov -8(%rsi),%rbx |
| .cfi_restore %rbx |
| lea (%rsi),%rsp |
| .cfi_def_cfa_register %rsp |
| .Lepilogue_avx2: |
| ret |
| .cfi_endproc |
| .size ${func}_avx2,.-${func}_avx2 |
| ___ |
| }} |
| }} |
| {{ |
| my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); |
| |
| my ($rounds,$Tbl)=("%r11d","%rbx"); |
| |
| my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); |
| my @rndkey=("%xmm4","%xmm5"); |
| my $r=0; |
| my $sn=0; |
| |
| my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); |
| my @MSG=map("%xmm$_",(10..13)); |
| |
| my $aesenc=sub { |
| use integer; |
| my ($n,$k)=($r/10,$r%10); |
| if ($k==0) { |
| $code.=<<___; |
| movups `16*$n`($in0),$in # load input |
| xorps $rndkey0,$in |
| ___ |
| $code.=<<___ if ($n); |
| movups $iv,`16*($n-1)`($out,$in0) # write output |
| ___ |
| $code.=<<___; |
| xorps $in,$iv |
| movups `32+16*$k-112`($key),$rndkey[1] |
| aesenc $rndkey[0],$iv |
| ___ |
| } elsif ($k==9) { |
| $sn++; |
| $code.=<<___; |
| cmp \$11,$rounds |
| jb .Laesenclast$sn |
| movups `32+16*($k+0)-112`($key),$rndkey[1] |
| aesenc $rndkey[0],$iv |
| movups `32+16*($k+1)-112`($key),$rndkey[0] |
| aesenc $rndkey[1],$iv |
| je .Laesenclast$sn |
| movups `32+16*($k+2)-112`($key),$rndkey[1] |
| aesenc $rndkey[0],$iv |
| movups `32+16*($k+3)-112`($key),$rndkey[0] |
| aesenc $rndkey[1],$iv |
| .Laesenclast$sn: |
| aesenclast $rndkey[0],$iv |
| movups 16-112($key),$rndkey[1] # forward reference |
| nop |
| ___ |
| } else { |
| $code.=<<___; |
| movups `32+16*$k-112`($key),$rndkey[1] |
| aesenc $rndkey[0],$iv |
| ___ |
| } |
| $r++; unshift(@rndkey,pop(@rndkey)); |
| }; |
| |
| if ($shaext) { |
| my $Tbl="%rax"; |
| |
| $code.=<<___; |
| .type ${func}_shaext,\@function,6 |
| .align 32 |
| ${func}_shaext: |
| .cfi_startproc |
| mov `($win64?56:8)`(%rsp),$inp # load 7th argument |
| ___ |
| $code.=<<___ if ($win64); |
| lea `-8-10*16`(%rsp),%rsp |
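# (%rax is expected to hold the caller's %rsp here, left by the Win64
# prologue that x86_64-xlate.pl emits for \@function-typed symbols)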
| movaps %xmm6,-8-10*16(%rax) |
| movaps %xmm7,-8-9*16(%rax) |
| movaps %xmm8,-8-8*16(%rax) |
| movaps %xmm9,-8-7*16(%rax) |
| movaps %xmm10,-8-6*16(%rax) |
| movaps %xmm11,-8-5*16(%rax) |
| movaps %xmm12,-8-4*16(%rax) |
| movaps %xmm13,-8-3*16(%rax) |
| movaps %xmm14,-8-2*16(%rax) |
| movaps %xmm15,-8-1*16(%rax) |
| .Lprologue_shaext: |
| ___ |
| $code.=<<___; |
| lea K256+0x80(%rip),$Tbl |
| movdqu ($ctx),$ABEF # DCBA |
| movdqu 16($ctx),$CDGH # HGFE |
| movdqa 0x200-0x80($Tbl),$TMP # byte swap mask |
| |
| mov 240($key),$rounds |
| sub $in0,$out |
| movups ($key),$rndkey0 # $key[0] |
| movups ($ivp),$iv # load IV |
| movups 16($key),$rndkey[0] # forward reference |
| lea 112($key),$key # size optimization |
| |
| pshufd \$0x1b,$ABEF,$Wi # ABCD |
| pshufd \$0xb1,$ABEF,$ABEF # CDAB |
| pshufd \$0x1b,$CDGH,$CDGH # EFGH |
| movdqa $TMP,$BSWAP # offload |
| palignr \$8,$CDGH,$ABEF # ABEF |
| punpcklqdq $Wi,$CDGH # CDGH |
| |
| jmp .Loop_shaext |
| |
| .align 16 |
| .Loop_shaext: |
| movdqu ($inp),@MSG[0] |
| movdqu 0x10($inp),@MSG[1] |
| movdqu 0x20($inp),@MSG[2] |
| pshufb $TMP,@MSG[0] |
| movdqu 0x30($inp),@MSG[3] |
| |
| movdqa 0*32-0x80($Tbl),$Wi |
| paddd @MSG[0],$Wi |
| pshufb $TMP,@MSG[1] |
| movdqa $CDGH,$CDGH_SAVE # offload |
| movdqa $ABEF,$ABEF_SAVE # offload |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $ABEF,$CDGH # 0-3 |
| pshufd \$0x0e,$Wi,$Wi |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $CDGH,$ABEF |
| |
| movdqa 1*32-0x80($Tbl),$Wi |
| paddd @MSG[1],$Wi |
| pshufb $TMP,@MSG[2] |
| lea 0x40($inp),$inp |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $ABEF,$CDGH # 4-7 |
| pshufd \$0x0e,$Wi,$Wi |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $CDGH,$ABEF |
| |
| movdqa 2*32-0x80($Tbl),$Wi |
| paddd @MSG[2],$Wi |
| pshufb $TMP,@MSG[3] |
| sha256msg1 @MSG[1],@MSG[0] |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $ABEF,$CDGH # 8-11 |
| pshufd \$0x0e,$Wi,$Wi |
| movdqa @MSG[3],$TMP |
| palignr \$4,@MSG[2],$TMP |
| paddd $TMP,@MSG[0] |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $CDGH,$ABEF |
| |
| movdqa 3*32-0x80($Tbl),$Wi |
| paddd @MSG[3],$Wi |
| sha256msg2 @MSG[3],@MSG[0] |
| sha256msg1 @MSG[2],@MSG[1] |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $ABEF,$CDGH # 12-15 |
| pshufd \$0x0e,$Wi,$Wi |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| movdqa @MSG[0],$TMP |
| palignr \$4,@MSG[3],$TMP |
| paddd $TMP,@MSG[1] |
| sha256rnds2 $CDGH,$ABEF |
| ___ |
| for($i=4;$i<16-3;$i++) { |
| &$aesenc() if (($r%10)==0); |
| $code.=<<___; |
| movdqa $i*32-0x80($Tbl),$Wi |
| paddd @MSG[0],$Wi |
| sha256msg2 @MSG[0],@MSG[1] |
| sha256msg1 @MSG[3],@MSG[2] |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $ABEF,$CDGH # 16-19... |
| pshufd \$0x0e,$Wi,$Wi |
| movdqa @MSG[1],$TMP |
| palignr \$4,@MSG[0],$TMP |
| paddd $TMP,@MSG[2] |
| ___ |
| &$aesenc(); |
| &$aesenc() if ($r==19); |
| $code.=<<___; |
| sha256rnds2 $CDGH,$ABEF |
| ___ |
| push(@MSG,shift(@MSG)); |
| } |
| $code.=<<___; |
| movdqa 13*32-0x80($Tbl),$Wi |
| paddd @MSG[0],$Wi |
| sha256msg2 @MSG[0],@MSG[1] |
| sha256msg1 @MSG[3],@MSG[2] |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $ABEF,$CDGH # 52-55 |
| pshufd \$0x0e,$Wi,$Wi |
| movdqa @MSG[1],$TMP |
| palignr \$4,@MSG[0],$TMP |
| paddd $TMP,@MSG[2] |
| ___ |
| &$aesenc(); |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $CDGH,$ABEF |
| |
| movdqa 14*32-0x80($Tbl),$Wi |
| paddd @MSG[1],$Wi |
| sha256msg2 @MSG[1],@MSG[2] |
| movdqa $BSWAP,$TMP |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $ABEF,$CDGH # 56-59 |
| pshufd \$0x0e,$Wi,$Wi |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $CDGH,$ABEF |
| |
| movdqa 15*32-0x80($Tbl),$Wi |
| paddd @MSG[2],$Wi |
| ___ |
| &$aesenc(); |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $ABEF,$CDGH # 60-63 |
| pshufd \$0x0e,$Wi,$Wi |
| ___ |
| &$aesenc(); |
| $code.=<<___; |
| sha256rnds2 $CDGH,$ABEF |
| #pxor $CDGH,$rndkey0 # black magic |
| ___ |
| while ($r<40) { &$aesenc(); } # remaining aesenc's |
| $code.=<<___; |
| #xorps $CDGH,$rndkey0 # black magic |
| paddd $CDGH_SAVE,$CDGH |
| paddd $ABEF_SAVE,$ABEF |
| |
| dec $len |
| movups $iv,48($out,$in0) # write output |
| lea 64($in0),$in0 |
| jnz .Loop_shaext |
| |
| pshufd \$0xb1,$CDGH,$CDGH # DCHG |
| pshufd \$0x1b,$ABEF,$TMP # FEBA |
| pshufd \$0xb1,$ABEF,$ABEF # BAFE |
| punpckhqdq $CDGH,$ABEF # DCBA |
| palignr \$8,$TMP,$CDGH # HGFE |
| |
| movups $iv,($ivp) # write IV |
| movdqu $ABEF,($ctx) |
| movdqu $CDGH,16($ctx) |
| ___ |
| $code.=<<___ if ($win64); |
| movaps 0*16(%rsp),%xmm6 |
| movaps 1*16(%rsp),%xmm7 |
| movaps 2*16(%rsp),%xmm8 |
| movaps 3*16(%rsp),%xmm9 |
| movaps 4*16(%rsp),%xmm10 |
| movaps 5*16(%rsp),%xmm11 |
| movaps 6*16(%rsp),%xmm12 |
| movaps 7*16(%rsp),%xmm13 |
| movaps 8*16(%rsp),%xmm14 |
| movaps 9*16(%rsp),%xmm15 |
| lea 8+10*16(%rsp),%rsp |
| .Lepilogue_shaext: |
| ___ |
| $code.=<<___; |
| ret |
| .cfi_endproc |
| .size ${func}_shaext,.-${func}_shaext |
| ___ |
| } |
| }}}}} |
| |
| # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
| # CONTEXT *context,DISPATCHER_CONTEXT *disp) |
| if ($win64 && $avx) { |
| $rec="%rcx"; |
| $frame="%rdx"; |
| $context="%r8"; |
| $disp="%r9"; |
| |
| $code.=<<___; |
| .extern __imp_RtlVirtualUnwind |
| .type se_handler,\@abi-omnipotent |
| .align 16 |
| se_handler: |
| push %rsi |
| push %rdi |
| push %rbx |
| push %rbp |
| push %r12 |
| push %r13 |
| push %r14 |
| push %r15 |
| pushfq |
| sub \$64,%rsp |
| |
| mov 120($context),%rax # pull context->Rax |
| mov 248($context),%rbx # pull context->Rip |
| |
| mov 8($disp),%rsi # disp->ImageBase |
	mov 56($disp),%r11 # disp->HandlerData
| |
| mov 0(%r11),%r10d # HandlerData[0] |
| lea (%rsi,%r10),%r10 # prologue label |
| cmp %r10,%rbx # context->Rip<prologue label |
| jb .Lin_prologue |
| |
| mov 152($context),%rax # pull context->Rsp |
| |
| mov 4(%r11),%r10d # HandlerData[1] |
| lea (%rsi,%r10),%r10 # epilogue label |
| cmp %r10,%rbx # context->Rip>=epilogue label |
| jae .Lin_prologue |
| ___ |
| $code.=<<___ if ($shaext); |
| lea aesni_cbc_sha256_enc_shaext(%rip),%r10 |
| cmp %r10,%rbx |
| jb .Lnot_in_shaext |
| |
| lea (%rax),%rsi |
| lea 512($context),%rdi # &context.Xmm6 |
| mov \$20,%ecx |
| .long 0xa548f3fc # cld; rep movsq |
| lea 168(%rax),%rax # adjust stack pointer |
| jmp .Lin_prologue |
| .Lnot_in_shaext: |
| ___ |
| $code.=<<___ if ($avx>1); |
| lea .Lavx2_shortcut(%rip),%r10 |
| cmp %r10,%rbx # context->Rip<avx2_shortcut |
| jb .Lnot_in_avx2 |
| |
| and \$-256*$SZ,%rax |
| add \$`2*$SZ*($rounds-8)`,%rax |
| .Lnot_in_avx2: |
| ___ |
| $code.=<<___; |
| mov %rax,%rsi # put aside Rsp |
| mov 16*$SZ+7*8(%rax),%rax # pull $_rsp |
| |
| mov -8(%rax),%rbx |
| mov -16(%rax),%rbp |
| mov -24(%rax),%r12 |
| mov -32(%rax),%r13 |
| mov -40(%rax),%r14 |
| mov -48(%rax),%r15 |
| mov %rbx,144($context) # restore context->Rbx |
| mov %rbp,160($context) # restore context->Rbp |
| mov %r12,216($context) # restore context->R12 |
| mov %r13,224($context) # restore context->R13 |
| mov %r14,232($context) # restore context->R14 |
| mov %r15,240($context) # restore context->R15 |
| |
	lea 16*$SZ+8*8(%rsi),%rsi # %xmm6..15 save area
| lea 512($context),%rdi # &context.Xmm6 |
| mov \$20,%ecx |
| .long 0xa548f3fc # cld; rep movsq |
| |
| .Lin_prologue: |
| mov 8(%rax),%rdi |
| mov 16(%rax),%rsi |
| mov %rax,152($context) # restore context->Rsp |
| mov %rsi,168($context) # restore context->Rsi |
| mov %rdi,176($context) # restore context->Rdi |
| |
| mov 40($disp),%rdi # disp->ContextRecord |
| mov $context,%rsi # context |
	mov \$154,%ecx # sizeof(CONTEXT)/8, quadwords for rep movsq
| .long 0xa548f3fc # cld; rep movsq |
| |
| mov $disp,%rsi |
| xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER |
| mov 8(%rsi),%rdx # arg2, disp->ImageBase |
| mov 0(%rsi),%r8 # arg3, disp->ControlPc |
| mov 16(%rsi),%r9 # arg4, disp->FunctionEntry |
| mov 40(%rsi),%r10 # disp->ContextRecord |
| lea 56(%rsi),%r11 # &disp->HandlerData |
| lea 24(%rsi),%r12 # &disp->EstablisherFrame |
| mov %r10,32(%rsp) # arg5 |
| mov %r11,40(%rsp) # arg6 |
| mov %r12,48(%rsp) # arg7 |
| mov %rcx,56(%rsp) # arg8, (NULL) |
| call *__imp_RtlVirtualUnwind(%rip) |
| |
| mov \$1,%eax # ExceptionContinueSearch |
| add \$64,%rsp |
| popfq |
| pop %r15 |
| pop %r14 |
| pop %r13 |
| pop %r12 |
| pop %rbp |
| pop %rbx |
| pop %rdi |
| pop %rsi |
| ret |
| .size se_handler,.-se_handler |
| |
| .section .pdata |
| .rva .LSEH_begin_${func}_xop |
| .rva .LSEH_end_${func}_xop |
| .rva .LSEH_info_${func}_xop |
| |
| .rva .LSEH_begin_${func}_avx |
| .rva .LSEH_end_${func}_avx |
| .rva .LSEH_info_${func}_avx |
| ___ |
| $code.=<<___ if ($avx>1); |
| .rva .LSEH_begin_${func}_avx2 |
| .rva .LSEH_end_${func}_avx2 |
| .rva .LSEH_info_${func}_avx2 |
| ___ |
| $code.=<<___ if ($shaext); |
| .rva .LSEH_begin_${func}_shaext |
| .rva .LSEH_end_${func}_shaext |
| .rva .LSEH_info_${func}_shaext |
| ___ |
| $code.=<<___; |
| .section .xdata |
| .align 8 |
| .LSEH_info_${func}_xop: |
| .byte 9,0,0,0 |
| .rva se_handler |
| .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] |
| |
| .LSEH_info_${func}_avx: |
| .byte 9,0,0,0 |
| .rva se_handler |
| .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] |
| ___ |
| $code.=<<___ if ($avx>1); |
| .LSEH_info_${func}_avx2: |
| .byte 9,0,0,0 |
| .rva se_handler |
| .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] |
| ___ |
| $code.=<<___ if ($shaext); |
| .LSEH_info_${func}_shaext: |
| .byte 9,0,0,0 |
| .rva se_handler |
| .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] |
| ___ |
| } |
| |
| #################################################################### |
| sub rex { |
| local *opcode=shift; |
| my ($dst,$src)=@_; |
| my $rex=0; |
| |
| $rex|=0x04 if($dst>=8); |
| $rex|=0x01 if($src>=8); |
| unshift @opcode,$rex|0x40 if($rex); |
| } |
| |
| { |
| my %opcodelet = ( |
| "sha256rnds2" => 0xcb, |
| "sha256msg1" => 0xcc, |
| "sha256msg2" => 0xcd ); |
| |
| sub sha256op38 { |
| my $instr = shift; |
| |
| if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { |
| my @opcode=(0x0f,0x38); |
| rex(\@opcode,$2,$1); |
| push @opcode,$opcodelet{$instr}; |
| push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M |
| return ".byte\t".join(',',@opcode); |
| } else { |
| return $instr."\t".@_[0]; |
| } |
| } |
| } |
| |
| $code =~ s/\`([^\`]*)\`/eval $1/gem; |
| $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; |
| print $code; |
| close STDOUT or die "error closing STDOUT: $!"; |