x86[_64]cpuid.pl: further refine shared cache detection.
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index a8ee099..862118f 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -50,6 +50,8 @@
 
 	xor	%eax,%eax
 	cpuid
+	mov	%eax,%r11d		# max value for standard query level
+
 	xor	%eax,%eax
 	cmp	\$0x756e6547,%ebx	# "Genu"
 	setne	%al
@@ -60,7 +62,6 @@
 	cmp	\$0x6c65746e,%ecx	# "ntel"
 	setne	%al
 	or	%eax,%r9d		# 0 indicates Intel CPU
-	mov	\$1,%r10d		# "number of [AMD] cores"
 	jz	.Lintel
 
 	cmp	\$0x68747541,%ebx	# "Auth"
@@ -74,10 +75,10 @@
 	or	%eax,%r10d		# 0 indicates AMD CPU
 	jnz	.Lintel
 
+	# AMD specific
 	mov	\$0x80000000,%eax
 	cpuid
 	cmp	\$0x80000008,%eax
-	mov	\$1,%r10d		# "number of [AMD] cores"
 	jb	.Lintel
 
 	mov	\$0x80000008,%eax
@@ -85,7 +86,29 @@
 	movzb	%cl,%r10		# number of cores - 1
 	inc	%r10			# number of cores
 
+	mov	\$1,%eax		# standard query level 1
+	cpuid
+	bt	\$28,%edx		# test hyper-threading bit
+	jnc	.Ldone			# bit 28 already clear, nothing to adjust
+	shr	\$16,%ebx		# number of logical processors
+	cmp	%r10b,%bl		# compare with number of cores in %r10
+	ja	.Ldone			# more logical processors than cores, keep bit 28
+	and	\$0xefffffff,%edx	# ~(1<<28)
+	jmp	.Ldone			# bypass the .Lintel code that follows
+
 .Lintel:
+	cmp	\$4,%r11d		# %r11d is max standard query level
+	mov	\$-1,%r10d		# non-zero default, mov leaves the cmp flags intact
+	jb	.Lnocacheinfo		# level 4 not available, skip the cache query
+
+	mov	\$4,%eax		# standard query level 4
+	mov	\$0,%ecx		# query L1D
+	cpuid
+	mov	%eax,%r10d
+	shr	\$14,%r10d		# move bits 25:14 of %eax down to bit 0
+	and	\$0xfff,%r10d		# number of cores -1 per L1D
+
+.Lnocacheinfo:
 	mov	\$1,%eax
 	cpuid
 	cmp	\$0,%r9d
@@ -98,8 +121,13 @@
 .Lnotintel:
 	bt	\$28,%edx		# test hyper-threading bit
 	jnc	.Ldone
+	and	\$0xefffffff,%edx	# ~(1<<28), clear the bit up front
+	cmp	\$0,%r10d		# "number of cores -1 per L1D", or -1 if not queried
+	je	.Ldone			# zero, so bit 28 stays cleared
+
+	or	\$0x10000000,%edx	# 1<<28, set again before the count check below
 	shr	\$16,%ebx
-	cmp	%r10b,%bl		# see if cache is shared
+	cmp	\$1,%bl			# see if cache is shared
 	ja	.Ldone
 	and	\$0xefffffff,%edx	# ~(1<<28)
 .Ldone:
diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl
index 36c79ca..e5dcc58 100644
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -23,6 +23,8 @@
 	&jnc	(&label("done"));
 	&xor	("eax","eax");
 	&cpuid	();
+	&mov	("edi","eax");		# max value for standard query level
+
 	&xor	("eax","eax");
 	&cmp	("ebx",0x756e6547);	# "Genu"
 	&setne	(&LB("eax"));
@@ -33,7 +35,6 @@
 	&cmp	("ecx",0x6c65746e);	# "ntel"
 	&setne	(&LB("eax"));
 	&or	("ebp","eax");		# 0 indicates Intel CPU
-	&mov	("esi",1);		# "number of [AMD] cores"
 	&jz	(&label("intel"));
 
 	&cmp	("ebx",0x68747541);	# "Auth"
@@ -47,10 +48,10 @@
 	&or	("esi","eax");		# 0 indicates AMD CPU
 	&jnz	(&label("intel"));
 
+	# AMD specific
 	&mov	("eax",0x80000000);
 	&cpuid	();
 	&cmp	("eax",0x80000008);
-	&mov	("esi",1);		# "number of [AMD] cores"
 	&jb	(&label("intel"));
 
 	&mov	("eax",0x80000008);
@@ -58,7 +59,30 @@
 	&movz	("esi",&LB("ecx"));	# number of cores - 1
 	&inc	("esi");		# number of cores
 
+	&mov	("eax",1);		# standard query level 1
+	&cpuid	();
+	&bt	("edx",28);		# test hyper-threading bit
+	&jnc	(&label("done"));	# bit 28 already clear, nothing to adjust
+	&shr	("ebx",16);
+	&and	("ebx",0xff);		# keep only the low byte of the shifted value
+	&cmp	("ebx","esi");		# compare with number of cores in esi
+	&ja	(&label("done"));	# ebx above esi, keep bit 28
+	&and	("edx",0xefffffff);	# clear hyper-threading bit
+	&jmp	(&label("done"));	# bypass the "intel" code that follows
+	
 &set_label("intel");
+	&cmp	("edi",4);		# edi is max standard query level
+	&mov	("edi",-1);		# non-zero default, mov leaves the cmp flags intact
+	&jb	(&label("nocacheinfo"));	# level 4 not available, skip the cache query
+
+	&mov	("eax",4);		# standard query level 4
+	&mov	("ecx",0);		# query L1D
+	&cpuid	();
+	&mov	("edi","eax");
+	&shr	("edi",14);		# move bits 25:14 of eax down to bit 0
+	&and	("edi",0xfff);		# number of cores -1 per L1D
+
+&set_label("nocacheinfo");
 	&mov	("eax",1);
 	&cpuid	();
 	&cmp	("ebp",0);
@@ -70,17 +94,19 @@
 &set_label("notP4");
 	&bt	("edx",28);		# test hyper-threading bit
 	&jnc	(&label("done"));
+	&and	("edx",0xefffffff);	# ~(1<<28), clear the bit up front
+	&cmp	("edi",0);		# "number of cores -1 per L1D", or -1 if not queried
+	&je	(&label("done"));	# zero, so bit 28 stays cleared
+
+	&or	("edx",0x10000000);	# 1<<28, set again before the count check below
 	&shr	("ebx",16);
-	&and	("ebx",0xff);
-	&cmp	("ebx","esi");		# see if cache is shared(*)
+	&cmp	(&LB("ebx"),1);
 	&ja	(&label("done"));
 	&and	("edx",0xefffffff);	# clear hyper-threading bit if not
 &set_label("done");
 	&mov	("eax","edx");
 	&mov	("edx","ecx");
 &function_end("OPENSSL_ia32_cpuid");
-# (*)	on Core2 this value is set to 2 denoting the fact that L2
-#	cache is shared between cores.
 
 &external_label("OPENSSL_ia32cap_P");