Profiling revealed that OPENSSL_cleanse consumes *more* CPU time than
sha1_block_data_order when hashing short messages. Move OPENSSL_cleanse
to the "cpuid" assembler module and gain 2x.
diff --git a/crypto/mem.c b/crypto/mem.c
index 6635167..43d48ab 100644
--- a/crypto/mem.c
+++ b/crypto/mem.c
@@ -250,7 +250,6 @@
 void *CRYPTO_malloc_locked(int num, const char *file, int line)
 	{
 	void *ret = NULL;
-	extern unsigned char cleanse_ctr;
 
 	if (num <= 0) return NULL;
 
@@ -267,11 +266,15 @@
 	if (malloc_debug_func != NULL)
 		malloc_debug_func(ret, num, file, line, 1);
 
+#ifndef OPENSSL_CPUID_OBJ
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
+	{	extern unsigned char cleanse_ctr;
 		((unsigned char *)ret)[0] = cleanse_ctr;
+	}
+#endif
 
 	return ret;
 	}
@@ -291,7 +294,6 @@
 void *CRYPTO_malloc(int num, const char *file, int line)
 	{
 	void *ret = NULL;
-	extern unsigned char cleanse_ctr;
 
 	if (num <= 0) return NULL;
 
@@ -308,11 +310,15 @@
 	if (malloc_debug_func != NULL)
 		malloc_debug_func(ret, num, file, line, 1);
 
+#ifndef OPENSSL_CPUID_OBJ
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
+	{	extern unsigned char cleanse_ctr;
                 ((unsigned char *)ret)[0] = cleanse_ctr;
+	}
+#endif
 
 	return ret;
 	}