{arm64|x86_64}cpuid.pl: add special 16-byte case to OPENSSL_memcmp.

OPENSSL_memcmp is a must in GCM decrypt and general-purpose loop takes
quite a portion of execution time for short inputs, more than GHASH for
few-byte inputs according to profiler. Special 16-byte case takes it off
top five list in profiler output.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6312)
diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
index daa2b17..06c8add 100755
--- a/crypto/arm64cpuid.pl
+++ b/crypto/arm64cpuid.pl
@@ -115,6 +115,19 @@
 CRYPTO_memcmp:
 	eor	w3,w3,w3
 	cbz	x2,.Lno_data	// len==0?
+	cmp	x2,#16
+	b.ne	.Loop_cmp
+	ldp	x8,x9,[x0]
+	ldp	x10,x11,[x1]
+	eor	x8,x8,x10
+	eor	x9,x9,x11
+	orr	x8,x8,x9
+	mov	x0,#1
+	cmp	x8,#0
+	csel	x0,xzr,x0,eq
+	ret
+
+.align	4
 .Loop_cmp:
 	ldrb	w4,[x0],#1
 	ldrb	w5,[x1],#1
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index 513d005..6423e80 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -271,6 +271,18 @@
 	xor	%r10,%r10
 	cmp	\$0,$arg3
 	je	.Lno_data
+	cmp	\$16,$arg3
+	jne	.Loop_cmp
+	mov	($arg1),%r10
+	mov	8($arg1),%r11
+	mov	\$1,$arg3
+	xor	($arg2),%r10
+	xor	8($arg2),%r11
+	or	%r11,%r10
+	cmovnz	$arg3,%rax
+	ret
+
+.align	16
 .Loop_cmp:
 	mov	($arg1),%r10b
 	lea	1($arg1),$arg1