Profiling revealed that OPENSSL_cleanse consumes *more* CPU time than
sha1_block_data_order when hashing short messages. Move OPENSSL_cleanse
to "cpuid" assembler module and gain 2x.
diff --git a/Configure b/Configure
index 9b21095..b072bfb 100755
--- a/Configure
+++ b/Configure
@@ -1209,6 +1209,7 @@
 
 $cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/);
 
+$cpuid_obj="mem_clr.o"	unless ($cpuid_obj =~ /\.o$/);
 $des_obj=$des_enc	unless ($des_obj =~ /\.o$/);
 $bf_obj=$bf_enc		unless ($bf_obj =~ /\.o$/);
 $cast_obj=$cast_enc	unless ($cast_obj =~ /\.o$/);
@@ -1481,7 +1482,7 @@
 print OUT $openssl_algorithm_defines_trans;
 print OUT "#endif\n\n";
 
-print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj);
+print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj ne "mem_clr.o");
 
 while (<IN>)
 	{
diff --git a/crypto/Makefile b/crypto/Makefile
index efe6a79..1b8c7c2 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -34,7 +34,7 @@
 LIB= $(TOP)/libcrypto.a
 SHARED_LIB= libcrypto$(SHLIB_EXT)
 LIBSRC=	cryptlib.c mem.c mem_clr.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c uid.c o_time.c o_str.c o_dir.c
-LIBOBJ= cryptlib.o mem.o mem_clr.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
+LIBOBJ= cryptlib.o mem.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
 
 SRC= $(LIBSRC)
 
diff --git a/crypto/ia64cpuid.S b/crypto/ia64cpuid.S
index 5836565..818e2d1 100644
--- a/crypto/ia64cpuid.S
+++ b/crypto/ia64cpuid.S
@@ -1,11 +1,13 @@
 // Works on all IA-64 platforms: Linux, HP-UX, Win64i...
 // On Win64i compile with ias.exe.
 .text
+
 .global	OPENSSL_cpuid_setup#
 .proc	OPENSSL_cpuid_setup#
 OPENSSL_cpuid_setup:
 { .mib;	br.ret.sptk.many	b0		};;
 .endp	OPENSSL_cpuid_setup#
+
 .global	OPENSSL_rdtsc#
 .proc	OPENSSL_rdtsc#
 OPENSSL_rdtsc:
@@ -124,3 +126,37 @@
 	mov		ar.lc=r3
 	br.ret.sptk	b0		};;
 .endp	OPENSSL_wipe_cpu#
+
+.global	OPENSSL_cleanse#
+.proc	OPENSSL_cleanse#
+OPENSSL_cleanse:
+{ .mib;	and		r2=7,r32
+	cmp.leu		p6,p0=15,r33	    // len>=15
+(p6)	br.cond.dptk	.Lot		};;
+
+.Little:
+{ .mib;	st1		[r32]=r0,1
+	cmp.ltu		p6,p7=1,r33	}  // len>1
+{ .mbb;	add		r33=-1,r33	   // len--
+(p6)	br.cond.dptk	.Little
+(p7)	br.ret.sptk.many	b0	};;
+
+.Lot:
+{ .mib;	cmp.eq		p6,p0=0,r2
+(p6)	br.cond.dptk	.Laligned	};;
+{ .mmi;	st1		[r32]=r0,1;;
+	and		r2=7,r32	}
+{ .mib;	add		r33=-1,r33
+	br		.Lot		};;
+
+.Laligned:
+{ .mmi;	st8		[r32]=r0,8
+	and		r2=-8,r33	    // len&~7
+	add		r33=-8,r33	};; // len-=8
+{ .mib;	cmp.ltu		p6,p0=8,r2	    // ((len+8)&~7)>8
+(p6)	br.cond.dptk	.Laligned	};;
+
+{ .mbb;	cmp.eq		p6,p7=r0,r33
+(p7)	br.cond.dpnt	.Little
+(p6)	br.ret.sptk.many	b0	};;
+.endp	OPENSSL_cleanse#
diff --git a/crypto/mem.c b/crypto/mem.c
index 6635167..43d48ab 100644
--- a/crypto/mem.c
+++ b/crypto/mem.c
@@ -250,7 +250,6 @@
 void *CRYPTO_malloc_locked(int num, const char *file, int line)
 	{
 	void *ret = NULL;
-	extern unsigned char cleanse_ctr;
 
 	if (num <= 0) return NULL;
 
@@ -267,11 +266,15 @@
 	if (malloc_debug_func != NULL)
 		malloc_debug_func(ret, num, file, line, 1);
 
+#ifndef OPENSSL_CPUID_OBJ
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
+	{	extern unsigned char cleanse_ctr;
 		((unsigned char *)ret)[0] = cleanse_ctr;
+	}
+#endif
 
 	return ret;
 	}
@@ -291,7 +294,6 @@
 void *CRYPTO_malloc(int num, const char *file, int line)
 	{
 	void *ret = NULL;
-	extern unsigned char cleanse_ctr;
 
 	if (num <= 0) return NULL;
 
@@ -308,11 +310,15 @@
 	if (malloc_debug_func != NULL)
 		malloc_debug_func(ret, num, file, line, 1);
 
+#ifndef OPENSSL_CPUID_OBJ
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
+	{	extern unsigned char cleanse_ctr;
                 ((unsigned char *)ret)[0] = cleanse_ctr;
+	}
+#endif
 
 	return ret;
 	}
diff --git a/crypto/sparccpuid.S b/crypto/sparccpuid.S
index 52308ab..f691abc 100644
--- a/crypto/sparccpuid.S
+++ b/crypto/sparccpuid.S
@@ -232,6 +232,54 @@
 .type	_sparcv9_rdtick,#function
 .size	_sparcv9_rdtick,.-_sparcv9_rdtick
 
+.global	OPENSSL_cleanse
+.align	32
+OPENSSL_cleanse:
+	cmp	%o1,6
+	nop
+#ifdef ABI64
+	bgu	%xcc,.Lot
+#else
+	bgu	.Lot
+#endif
+	nop
+
+.Little:
+	stb	%g0,[%o0]
+	subcc	%o1,1,%o1
+	bnz	.Little
+	add	%o0,1,%o0
+	retl
+	nop
+.align	32
+.Lot:
+	andcc	%o0,3,%g0
+	bz	.Laligned
+	nop
+	stb	%g0,[%o0]
+	sub	%o1,1,%o1
+	ba	.Lot
+	add	%o0,1,%o0
+	nop
+.Laligned:
+	st	%g0,[%o0]
+	sub	%o1,4,%o1
+	andcc	%o1,-4,%g0
+#ifdef ABI64
+	bnz	%xcc,.Laligned
+#else
+	bnz	.Laligned
+#endif
+	add	%o0,4,%o0
+
+	cmp	%o1,0
+	bne	.Little
+	nop
+	retl
+	nop
+.type	OPENSSL_cleanse,#function
+.size	OPENSSL_cleanse,.-OPENSSL_cleanse
+
 .section	".init",#alloc,#execinstr
 	call	OPENSSL_cpuid_setup
 	nop
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index bc06e99..2f657ca 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -155,4 +155,36 @@
 	or	%rcx,%rax
 	ret
 .size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+
+.globl  OPENSSL_cleanse
+.type   OPENSSL_cleanse,\@function,2
+.align  16
+OPENSSL_cleanse:
+	xor	%rax,%rax
+	cmp	\$15,%rsi
+	jae	.Lot
+.Little:
+	mov	%al,(%rdi)
+	sub	\$1,%rsi
+	lea	1(%rdi),%rdi
+	jnz	.Little
+	ret
+.align	16
+.Lot:
+	test	\$7,%rdi
+	jz	.Laligned
+	mov	%al,(%rdi)
+	lea	-1(%rsi),%rsi
+	lea	1(%rdi),%rdi
+	jmp	.Lot
+.Laligned:
+	mov	%rax,(%rdi)
+	lea	-8(%rsi),%rsi
+	test	\$-8,%rsi
+	lea	8(%rdi),%rdi
+	jnz	.Laligned
+	cmp	\$0,%rsi
+	jne	.Little
+	ret
+.size	OPENSSL_cleanse,.-OPENSSL_cleanse
 ___
diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl
index 7d924a6..13828d5 100644
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -216,6 +216,37 @@
 	}
 &function_end_B("OPENSSL_indirect_call");
 
+&function_begin_B("OPENSSL_cleanse");
+	&mov	("edx",&wparam(0));
+	&mov	("ecx",&wparam(1));
+	&xor	("eax","eax");
+	&cmp	("ecx",7);
+	&jae	(&label("lot"));
+&set_label("little");
+	&mov	(&BP(0,"edx"),"al");
+	&sub	("ecx",1);
+	&lea	("edx",&DWP(1,"edx"));
+	&jnz	(&label("little"));
+	&ret	();
+
+&set_label("lot",16);
+	&test	("edx",3);
+	&jz	(&label("aligned"));
+	&mov	(&BP(0,"edx"),"al");
+	&lea	("ecx",&DWP(-1,"ecx"));
+	&lea	("edx",&DWP(1,"edx"));
+	&jmp	(&label("lot"));
+&set_label("aligned");
+	&mov	(&DWP(0,"edx"),"eax");
+	&lea	("ecx",&DWP(-4,"ecx"));
+	&test	("ecx",-4);
+	&lea	("edx",&DWP(4,"edx"));
+	&jnz	(&label("aligned"));
+	&cmp	("ecx",0);
+	&jne	(&label("little"));
+	&ret	();
+&function_end_B("OPENSSL_cleanse");
+
 &initseg("OPENSSL_cpuid_setup");
 
 &asm_finish();