Various minor updates to AES assembler modules: aes-586.pl picks up the dual OpenSSL/CRYPTOGAMS licence note, Core 2 benchmark figures and a shorter byte-parallel xtime sequence; aes-ppc.pl gets its commentary corrected; aes-s390x.pl switches to lmg/stmg for 128-bit key copies and round-key swapping.
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index 8b27e4c..b09bf02 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -2,8 +2,9 @@
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # Version 4.3.
@@ -105,6 +106,7 @@
 # P4		56[60]		84[100]		23
 # AMD K8	48[44]		70[79]		18
 # PIII		41[50]		61[91]		24
+# Core 2	32[38]		45[70]		18.5
 # Pentium	120		160		77
 #
 # Version 4.1 switches to compact S-box even in key schedule setup.
@@ -184,7 +186,8 @@
 # Current implementation accesses *all* cache-lines within ~50 cycles
 # window, which is actually *less* than RDTSC latency on Intel P4!
 
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
@@ -474,11 +477,10 @@
 	&mov	($acc,$s[$i]);
 	&and	($acc,0x80808080);
 	&mov	($tmp,$acc);
-	&mov	($r2,$s[$i]);
 	&shr	($tmp,7);
-	&and	($r2,0x7f7f7f7f);
+	&lea	($r2,&DWP(0,$s[$i],$s[$i]));
 	&sub	($acc,$tmp);
-	&lea	($r2,&DWP(0,$r2,$r2));
+	&and	($r2,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
 	&mov	($tmp,$s[$i]);
 	&xor	($acc,$r2);	# r2
@@ -1273,54 +1275,51 @@
 	&mov	($acc,$s[$i]);
 	&and	($acc,0x80808080);
 	&mov	($tmp,$acc);
-	&mov	($tp2,$s[$i]);
 	&shr	($tmp,7);
-	&and	($tp2,0x7f7f7f7f);
+	&lea	($tp2,&DWP(0,$s[$i],$s[$i]));
 	&sub	($acc,$tmp);
-	&add	($tp2,$tp2);
+	&and	($tp2,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
 	&xor	($acc,$tp2);
 	&mov	($tp2,$acc);
 
 	&and	($acc,0x80808080);
 	&mov	($tmp,$acc);
-	&mov	($tp4,$tp2);
-	 &xor	($tp2,$s[$i]);	# tp2^tp1
 	&shr	($tmp,7);
-	&and	($tp4,0x7f7f7f7f);
+	&lea	($tp4,&DWP(0,$tp2,$tp2));
 	&sub	($acc,$tmp);
-	&add	($tp4,$tp4);
+	&and	($tp4,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
+	 &xor	($tp2,$s[$i]);	# tp2^tp1
 	&xor	($acc,$tp4);
 	&mov	($tp4,$acc);
 
 	&and	($acc,0x80808080);
 	&mov	($tmp,$acc);
-	&mov	($tp8,$tp4);
-	 &xor	($tp4,$s[$i]);	# tp4^tp1
 	&shr	($tmp,7);
-	&and	($tp8,0x7f7f7f7f);
+	&lea	($tp8,&DWP(0,$tp4,$tp4));
 	&sub	($acc,$tmp);
-	&add	($tp8,$tp8);
+	&and	($tp8,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
+	 &xor	($tp4,$s[$i]);	# tp4^tp1
 	 &rotl	($s[$i],8);	# = ROTATE(tp1,8)
 	&xor	($tp8,$acc);
 
 	&xor	($s[$i],$tp2);
 	&xor	($tp2,$tp8);
-	&xor	($s[$i],$tp4);
 	&rotl	($tp2,24);
+	&xor	($s[$i],$tp4);
 	&xor	($tp4,$tp8);
-	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
 	&rotl	($tp4,16);
-	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
+	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
 	&rotl	($tp8,8);
+	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
 	&xor	($s[$i],$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
+	 &mov	($s[0],$__s0)			if($i==2); #prefetch $s0
+	 &mov	($s[1],$__s1)			if($i==3); #prefetch $s1
+	 &mov	($s[2],$__s2)			if($i==1);
 	&xor	($s[$i],$tp8);	# ^= ROTATE(tp8,8)
 
-	&mov	($s[0],$__s0)			if($i==2); #prefetch $s0
-	&mov	($s[1],$__s1)			if($i==3); #prefetch $s1
-	&mov	($s[2],$__s2)			if($i==1);
 	&mov	($s[3],$__s3)			if($i==1);
 	&mov	(&DWP(4+4*$i,"esp"),$s[$i])	if($i>=2);
 }
@@ -2872,35 +2871,32 @@
 	&mov	($acc,$tp1);
 	&and	($acc,0x80808080);
 	&mov	($tmp,$acc);
-	&mov	($tp2,$tp1);
 	&shr	($tmp,7);
-	&and	($tp2,0x7f7f7f7f);
+	&lea	($tp2,&DWP(0,$tp1,$tp1));
 	&sub	($acc,$tmp);
-	&add	($tp2,$tp2);
+	&and	($tp2,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
 	&xor	($acc,$tp2);
 	&mov	($tp2,$acc);
 
 	&and	($acc,0x80808080);
 	&mov	($tmp,$acc);
-	&mov	($tp4,$tp2);
-	 &xor	($tp2,$tp1);	# tp2^tp1
 	&shr	($tmp,7);
-	&and	($tp4,0x7f7f7f7f);
+	&lea	($tp4,&DWP(0,$tp2,$tp2));
 	&sub	($acc,$tmp);
-	&add	($tp4,$tp4);
+	&and	($tp4,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
+	 &xor	($tp2,$tp1);	# tp2^tp1
 	&xor	($acc,$tp4);
 	&mov	($tp4,$acc);
 
 	&and	($acc,0x80808080);
 	&mov	($tmp,$acc);
-	&mov	($tp8,$tp4);
-	 &xor	($tp4,$tp1);	# tp4^tp1
 	&shr	($tmp,7);
-	&and	($tp8,0x7f7f7f7f);
+	&lea	($tp8,&DWP(0,$tp4,$tp4));
+	 &xor	($tp4,$tp1);	# tp4^tp1
 	&sub	($acc,$tmp);
-	&add	($tp8,$tp8);
+	&and	($tp8,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
 	 &rotl	($tp1,8);	# = ROTATE(tp1,8)
 	&xor	($tp8,$acc);
@@ -2992,5 +2988,6 @@
 
 	&xor	("eax","eax");			# return success
 &function_end("AES_set_decrypt_key");
+&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
 
 &asm_finish();
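
The aes-586.pl hunks above trim a register copy from each byte-parallel xtime (GF(2^8) doubling) step: instead of copying the word, masking with 0x7f7f7f7f and then doubling, the new sequence doubles first with lea and clears the stray carry bits with 0xfefefefe. The surrounding xor/rotl and prefetch reshuffling looks like instruction scheduling only. As a minimal C sketch of what either instruction sequence computes on a packed 32-bit word (the xtime4 name is illustrative, not part of the module):

    #include <stdint.h>

    /* Double four GF(2^8) bytes packed in one 32-bit word (AES xtime),
     * mirroring the rewritten perlasm sequence.  Illustrative only. */
    static uint32_t xtime4(uint32_t x)
    {
        uint32_t hi  = x & 0x80808080u;                 /* and $acc,0x80808080          */
        uint32_t red = (hi - (hi >> 7)) & 0x1b1b1b1bu;  /* shr 7; sub; and 0x1b1b1b1b   */
        uint32_t dbl = (x + x) & 0xfefefefeu;           /* lea (x,x); and 0xfefefefe    */
        return dbl ^ red;                               /* reduce bytes that overflowed */
    }

For instance, xtime4(0x01010180) gives 0x0202021b: the three 0x01 bytes double to 0x02 and the 0x80 byte wraps to 0x1b.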
diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl
index 685fccf..7219b30 100644
--- a/crypto/aes/asm/aes-ppc.pl
+++ b/crypto/aes/asm/aes-ppc.pl
@@ -12,9 +12,9 @@
 # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
 # 128-bit key, which is ~40% better than 64-bit code generated by gcc
 # 4.0. But these are not the ones currently used! Their "compact"
-# counterparts are, for security reason. ppc_AES_crypt_compact runs at
-# 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - at 1/3
-# of ppc_AES_decrypt.
+# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
+# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact runs
+# at 1/3 of ppc_AES_decrypt speed.
 
 $output = shift;
 
diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl
index 54a9fe5..b96f736 100644
--- a/crypto/aes/asm/aes-s390x.pl
+++ b/crypto/aes/asm/aes-s390x.pl
@@ -738,14 +738,8 @@
 	tmhl	%r0,`0x8000>>2`
 	jz	.Lekey_internal
 
-	l	$t1,0($inp)	# just copy 128 bits...
-	l	$t2,4($inp)
-	l	$bits,8($inp)
-	l	$inp,12($inp)
-	st	$t1,0($key)
-	st	$t2,4($key)
-	st	$bits,8($key)
-	st	$inp,12($key)
+	lmg	$t1,$t2,0($inp)	# just copy 128 bits...
+	stmg	$t1,$t2,0($key)
 	lghi	$t1,10
 	st	$t1,236($key)	# ... postpone key setup
 	st	$t1,240($key)
@@ -754,7 +748,7 @@
 
 .align	16
 .Lekey_internal:
-	stmg	%r6,%r13,48($sp)	# all volatile regs, but $ra!
+	stmg	%r6,%r13,48($sp)	# all non-volatile regs
 
 	bras	$tbl,1f
 1:	aghi	$tbl,AES_Te+2048-.
@@ -949,7 +943,7 @@
 .align	16
 AES_set_decrypt_key:
 	stg	$key,32($sp)		# I rely on AES_set_encrypt_key to
-	stg	$ra,112($sp)		# save [other] volatile registers!
+	stg	$ra,112($sp)		# save non-volatile registers!
 	bras	$ra,AES_set_encrypt_key
 	lg	$key,32($sp)
 	lg	$ra,112($sp)
@@ -963,14 +957,8 @@
 	c	$t1,236($key)
 	je	.Lgo
 
-	l	$t1,0($key)		# just copy 128 bits otherwise
-	l	$t2,4($key)
-	l	$t3,8($key)
-	l	$bits,12($key)
-	st	$t1,160($key)
-	st	$t2,164($key)
-	st	$t3,168($key)
-	st	$bits,172($key)
+	lmg	$t1,$t2,0($key)		# just copy 128 bits otherwise
+	stmg	$t1,$t2,160($key)
 	lghi	%r2,0
 	br	$ra
 
@@ -983,27 +971,16 @@
 	lg	$ra,40($sp)
 
 .Lgo:	llgf	$rounds,240($key)
-	lghi	$i1,0
+	la	$i1,0($key)
 	sllg	$i2,$rounds,4
+	la	$i2,0($i2,$key)
 	srl	$rounds,1
 
 .align	8
-.Linv:	l	$s0,0($i1,$key)
-	l	$s1,4($i1,$key)
-	l	$s2,8($i1,$key)
-	l	$s3,12($i1,$key)
-	l	$t1,0($i2,$key)
-	l	$t2,4($i2,$key)
-	l	$t3,8($i2,$key)
-	l	$i3,12($i2,$key)
-	st	$s0,0($i2,$key)
-	st	$s1,4($i2,$key)
-	st	$s2,8($i2,$key)
-	st	$s3,12($i2,$key)
-	st	$t1,0($i1,$key)
-	st	$t2,4($i1,$key)
-	st	$t3,8($i1,$key)
-	st	$i3,12($i1,$key)
+.Linv:	lmg	$s0,$s1,0($i1)
+	lmg	$s2,$s3,0($i2)
+	stmg	$s0,$s1,0($i2)
+	stmg	$s2,$s3,0($i1)
 	aghi	$i1,16
 	aghi	$i2,-16
 	brct	$rounds,.Linv
@@ -1070,7 +1047,7 @@
 	la	$key,4($key)
 	brct	$rounds,.Lmix
 
-	lmg	%r6,%r13,48($sp)# this was saved by AES_set_encrypt_key!
+	lmg	%r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
 	lghi	%r2,0
 	br	$ra
 .size	AES_set_decrypt_key,.-AES_set_decrypt_key
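
For reference, the rewritten .Linv loop in aes-s390x.pl swaps the 16-byte round keys from both ends of the schedule with one lmg/stmg pair per side instead of eight 32-bit l/st pairs. A rough C sketch of what the loop does, assuming rk points at the expanded key and rounds is the value read from offset 240 (names are illustrative, not the module's):

    #include <stdint.h>
    #include <string.h>

    /* Reverse the order of the 16-byte round keys in place,
     * as the .Linv loop does.  Illustrative only. */
    static void reverse_round_keys(uint32_t *rk, int rounds)
    {
        uint32_t *lo = rk;                      /* la   $i1,0($key)          */
        uint32_t *hi = rk + 4 * rounds;         /* sllg $i2,$rounds,4; la    */
        uint32_t tmp[4];

        for (int n = rounds / 2; n > 0; n--) {  /* srl $rounds,1; ... brct   */
            memcpy(tmp, lo, 16);                /* lmg  $s0,$s1,0($i1)       */
            memcpy(lo, hi, 16);                 /* lmg  $s2,$s3,0($i2)       */
            memcpy(hi, tmp, 16);                /* stmg to the opposite ends */
            lo += 4;                            /* aghi $i1,16               */
            hi -= 4;                            /* aghi $i2,-16              */
        }
    }

The 128-bit "just copy" paths in the same file are the same idea: four 32-bit load/store pairs collapse into one lmg/stmg pair over two 64-bit registers.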