Multiple assembler packs: add experimental memory bus instrumentation.
diff --git a/crypto/alphacpuid.pl b/crypto/alphacpuid.pl
index c9474ff..11f2e30 100644
--- a/crypto/alphacpuid.pl
+++ b/crypto/alphacpuid.pl
@@ -126,3 +126,93 @@
 .Ldone: ret	($26)
 .end	OPENSSL_cleanse
 ___
+{
+my ($out,$cnt,$max)=("\$16","\$17","\$18");
+my ($tick,$lasttick)=("\$19","\$20");
+my ($diff,$lastdiff)=("\$21","\$22");
+my ($v0,$ra,$sp,$zero)=("\$0","\$26","\$30","\$31");
+
+print <<___;
+.globl	OPENSSL_instrument_bus
+.ent	OPENSSL_instrument_bus
+OPENSSL_instrument_bus:
+	.frame	$sp,0,$ra
+	.prologue 0
+	mov	$cnt,$v0
+
+	rpcc	$lasttick
+	mov	0,$diff
+
+	ecb	($out)
+	ldl_l	$tick,0($out)
+	addl	$diff,$tick,$tick
+	mov	$tick,$diff
+	stl_c	$tick,0($out)
+	stl	$diff,0($out)
+
+.Loop:	rpcc	$tick
+	subq	$tick,$lasttick,$diff
+	mov	$tick,$lasttick
+
+	ecb	($out)
+	ldl_l	$tick,0($out)
+	addl	$diff,$tick,$tick
+	mov	$tick,$diff
+	stl_c	$tick,0($out)
+	stl	$diff,0($out)
+
+	subl	$cnt,1,$cnt
+	lda	$out,4($out)
+	bne	$cnt,.Loop
+
+	ret	($ra)
+.end	OPENSSL_instrument_bus
+
+.globl	OPENSSL_instrument_bus2
+.ent	OPENSSL_instrument_bus2
+OPENSSL_instrument_bus2:
+	.frame	$sp,0,$ra
+	.prologue 0
+	mov	$cnt,$v0
+
+	rpcc	$lasttick
+	mov	0,$diff
+
+	ecb	($out)
+	ldl_l	$tick,0($out)
+	addl	$diff,$tick,$tick
+	mov	$tick,$diff
+	stl_c	$tick,0($out)
+	stl	$diff,0($out)
+
+	rpcc	$tick
+	subq	$tick,$lasttick,$diff
+	mov	$tick,$lasttick
+	mov	$diff,$lastdiff
+.Loop2:
+	ecb	($out)
+	ldl_l	$tick,0($out)
+	addl	$diff,$tick,$tick
+	mov	$tick,$diff
+	stl_c	$tick,0($out)
+	stl	$diff,0($out)
+
+	subl	$max,1,$max
+	beq	$max,.Ldone2
+
+	rpcc	$tick
+	subq	$tick,$lasttick,$diff
+	mov	$tick,$lasttick
+	subq	$lastdiff,$diff,$tick
+	mov	$diff,$lastdiff
+	cmovne	$tick,1,$tick
+	subl	$cnt,$tick,$cnt
+	s4addq	$tick,$out,$out
+	bne	$cnt,.Loop2
+
+.Ldone2:
+	subl	$v0,$cnt,$v0
+	ret	($ra)
+.end	OPENSSL_instrument_bus2
+___
+}
diff --git a/crypto/ia64cpuid.S b/crypto/ia64cpuid.S
index d705fff..dd27e16 100644
--- a/crypto/ia64cpuid.S
+++ b/crypto/ia64cpuid.S
@@ -26,7 +26,7 @@
 { .mii;	mov		ar.ccv=r2
 	add		r8=r2,r33
 	mov		r3=r2		};;
-{ .mmi;	mf
+{ .mmi;	mf;;
 	cmpxchg4.acq	r2=[r32],r8,ar.ccv
 	nop.i		0		};;
 { .mib;	cmp.ne		p6,p0=r2,r3
@@ -165,3 +165,89 @@
 (p7)	br.cond.dpnt	.Little
 (p6)	br.ret.sptk.many	b0	};;
 .endp	OPENSSL_cleanse#
+
+.global	OPENSSL_instrument_bus#
+.proc	OPENSSL_instrument_bus#
+OPENSSL_instrument_bus:
+{ .mmi;	mov		r2=r33			// put aside cnt (r33); out is r32
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+	addp4		r32=0,r32
+#endif
+					}
+{ .mmi;	mov		r8=ar.itc;;
+	mov		r10=r0			// diff=0
+	mov		r9=r8		};;	// lasttick=tick
+
+{ .mmi;	fc		r32;;			// flush cache line
+	ld4		r8=[r32]	};;	// load *out
+{ .mmi;	mf
+	mov		ar.ccv=r8
+	add		r8=r8,r10	};;	// *out+diff
+{ .mmi;	cmpxchg4.acq	r3=[r32],r8,ar.ccv	// store sum if *out is unchanged
+					};;
+.Loop:
+{ .mmi;	mov		r8=ar.itc;;
+	sub		r10=r8,r9		// diff=tick-lasttick
+	mov		r9=r8		};;	// lasttick=tick
+{ .mmi;	fc		r32;;			// flush cache line
+	ld4		r8=[r32]	};;	// load *out
+{ .mmi;	mf
+	mov		ar.ccv=r8
+	add		r8=r8,r10	};;	// *out+diff
+{ .mmi;	cmpxchg4.acq	r3=[r32],r8,ar.ccv	// store sum if *out is unchanged
+	add		r33=-1,r33		// --cnt
+	add		r32=4,r32	};;	// ++out
+{ .mib;	cmp4.ne		p6,p0=0,r33
+(p6)	br.cond.dptk	.Loop		};;
+
+{ .mib;	sub		r8=r2,r33		// return saved cnt minus what is left
+	br.ret.sptk.many	b0	};;
+.endp	OPENSSL_instrument_bus#
+
+.global	OPENSSL_instrument_bus2#
+.proc	OPENSSL_instrument_bus2#
+OPENSSL_instrument_bus2:
+{ .mmi;	mov		r2=r33			// put aside cnt
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+	addp4		r32=0,r32
+#endif
+					}
+{ .mmi;	mov		r8=ar.itc;;
+	mov		r10=r0			// diff=0
+	mov		r9=r8		};;	// lasttick=tick
+
+{ .mmi;	fc		r32;;			// flush cache line
+	ld4		r8=[r32]	};;	// load *out
+{ .mmi;	mf
+	mov		ar.ccv=r8
+	add		r8=r8,r10	};;	// *out+diff
+{ .mmi;	cmpxchg4.acq	r3=[r32],r8,ar.ccv	// store sum if *out is unchanged
+					};;
+
+{ .mmi;	mov		r8=ar.itc;;
+	sub		r10=r8,r9		// diff=tick-lasttick
+	mov		r9=r8		};;	// lasttick=tick
+.Loop2:
+{ .mmi;	mov		r11=r10			// lastdiff=diff
+	add		r34=-1,r34	};;	// --max
+{ .mmi;	fc		r32;;
+	ld4		r8=[r32]
+	cmp4.eq		p6,p0=0,r34	};;	// p6 = max exhausted
+{ .mmi;	mf
+	mov		ar.ccv=r8
+	add		r8=r8,r10	};;
+{ .mmb;	cmpxchg4.acq	r3=[r32],r8,ar.ccv
+(p6)	br.cond.spnt	.Ldone2		};;
+
+{ .mmi;	mov		r8=ar.itc;;
+	sub		r10=r8,r9		// diff=tick-lasttick
+	mov		r9=r8		};;	// lasttick=tick
+{ .mmi;	cmp.ne		p6,p0=r10,r11;;		// diff!=lastdiff
+(p6)	add		r33=-1,r33	};;	// conditional --cnt
+{ .mib;	cmp4.ne		p7,p0=0,r33
+(p6)	add		r32=4,r32		// conditional ++out
+(p7)	br.cond.dptk	.Loop2		};;
+.Ldone2:
+{ .mib;	sub		r8=r2,r33		// return saved cnt minus what is left
+	br.ret.sptk.many	b0	};;
+.endp	OPENSSL_instrument_bus2#
diff --git a/crypto/pariscid.pl b/crypto/pariscid.pl
index 1ed5381..477ec9b 100644
--- a/crypto/pariscid.pl
+++ b/crypto/pariscid.pl
@@ -87,8 +87,8 @@
 	.PROCEND
 ___
 {
-$inp="%r26";
-$len="%r25";
+my $inp="%r26";
+my $len="%r25";
 
 $code.=<<___;
 	.EXPORT	OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
@@ -112,9 +112,9 @@
 
 Laligned
 	andcm		$len,%r1,%r28
-Loop
+Lot
 	$ST		%r0,0($inp)
-	addib,*<>	-$SIZE_T,%r28,Loop
+	addib,*<>	-$SIZE_T,%r28,Lot
 	ldo		$SIZE_T($inp),$inp
 
 	and,*<>		$len,%r1,$len
@@ -130,7 +130,93 @@
 	.PROCEND
 ___
 }
+{
+my ($out,$cnt,$max)=("%r26","%r25","%r24");
+my ($tick,$lasttick)=("%r23","%r22");
+my ($diff,$lastdiff)=("%r21","%r20");
 
+$code.=<<___;
+	.EXPORT	OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	8
+OPENSSL_instrument_bus
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	copy		$cnt,$rv
+	mfctl		%cr16,$tick
+	copy		$tick,$lasttick
+	ldi		0,$diff
+
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+Loop
+	mfctl		%cr16,$tick
+	sub		$tick,$lasttick,$diff
+	copy		$tick,$lasttick
+
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+
+	addib,<>	-1,$cnt,Loop
+	addi		4,$out,$out
+
+	bv		($rp)
+	.EXIT
+	sub		$rv,$cnt,$rv
+	.PROCEND
+
+	.EXPORT	OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	8
+OPENSSL_instrument_bus2
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	copy		$cnt,$rv
+	sub		%r0,$cnt,$cnt
+
+	mfctl		%cr16,$tick
+	copy		$tick,$lasttick
+	ldi		0,$diff
+
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+
+	mfctl		%cr16,$tick
+	sub		$tick,$lasttick,$diff
+	copy		$tick,$lasttick
+Loop2
+	copy		$diff,$lastdiff
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+
+	addib,=		-1,$max,Ldone2
+	nop
+
+	mfctl		%cr16,$tick
+	sub		$tick,$lasttick,$diff
+	copy		$tick,$lasttick
+	cmpclr,<>	$lastdiff,$diff,$tick
+	ldi		1,$tick
+
+	ldi		1,%r1
+	xor		%r1,$tick,$tick
+	addb,<>		$tick,$cnt,Loop2
+	shladd,l	$tick,2,$out,$out
+Ldone2
+	bv		($rp)
+	.EXIT
+	add		$rv,$cnt,$rv
+	.PROCEND
+___
+}
 $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
 $code =~ s/,\*/,/gm if ($SIZE_T==4);
 print $code;
diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl
index 2131d30..d6220e7 100755
--- a/crypto/ppccpuid.pl
+++ b/crypto/ppccpuid.pl
@@ -69,10 +69,10 @@
 .globl	.OPENSSL_atomic_add
 .align	4
 .OPENSSL_atomic_add:
-Loop:	lwarx	r5,0,r3
+Ladd:	lwarx	r5,0,r3
 	add	r0,r4,r5
 	stwcx.	r0,0,r3
-	bne-	Loop
+	bne-	Ladd
 	$SIGNX	r3,r0
 	blr
 
@@ -112,6 +112,89 @@
 	bne	Little
 	blr
 ___
+{
+my ($out,$cnt,$max)=("r3","r4","r5");
+my ($tick,$lasttick)=("r6","r7");
+my ($diff,$lastdiff)=("r8","r9");
+
+$code.=<<___;
+.globl	.OPENSSL_instrument_bus
+.align	4
+.OPENSSL_instrument_bus:
+	mtctr	$cnt
+
+	mftb	$lasttick		# collect 1st tick
+	li	$diff,0
+
+	dcbf	0,$out			# flush cache line
+	lwarx	$tick,0,$out		# load and lock
+	add	$tick,$tick,$diff
+	stwcx.	$tick,0,$out
+	stwx	$tick,0,$out
+
+Loop:	mftb	$tick
+	sub	$diff,$tick,$lasttick
+	mr	$lasttick,$tick
+	dcbf	0,$out			# flush cache line
+	lwarx	$tick,0,$out		# load and lock
+	add	$tick,$tick,$diff
+	stwcx.	$tick,0,$out
+	stwx	$tick,0,$out
+	addi	$out,$out,4		# ++$out
+	bdnz	Loop
+
+	mr	r3,$cnt
+	blr
+
+.globl	.OPENSSL_instrument_bus2
+.align	4
+.OPENSSL_instrument_bus2:
+	mr	r0,$cnt
+	slwi	$cnt,$cnt,2
+
+	mftb	$lasttick		# collect 1st tick
+	li	$diff,0
+
+	dcbf	0,$out			# flush cache line
+	lwarx	$tick,0,$out		# load and lock
+	add	$tick,$tick,$diff
+	stwcx.	$tick,0,$out
+	stwx	$tick,0,$out
+
+	mftb	$tick			# collect 1st diff
+	sub	$diff,$tick,$lasttick
+	mr	$lasttick,$tick
+	mr	$lastdiff,$diff
+Loop2:
+	dcbf	0,$out			# flush cache line
+	lwarx	$tick,0,$out		# load and lock
+	add	$tick,$tick,$diff
+	stwcx.	$tick,0,$out
+	stwx	$tick,0,$out
+
+	addic.	$max,$max,-1
+	beq	Ldone2
+
+	mftb	$tick
+	sub	$diff,$tick,$lasttick
+	mr	$lasttick,$tick
+	cmplw	7,$diff,$lastdiff
+	mr	$lastdiff,$diff
+
+	mfcr	$tick			# pull cr
+	not	$tick,$tick		# flip bits
+	rlwinm	$tick,$tick,1,29,29	# isolate flipped eq bit and scale
+
+	sub.	$cnt,$cnt,$tick		# conditional --$cnt
+	add	$out,$out,$tick		# conditional ++$out
+	bne	Loop2
+
+Ldone2:
+	srwi	$cnt,$cnt,2
+	sub	r3,r0,$cnt
+	blr
+___
+}
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
diff --git a/crypto/s390xcpuid.S b/crypto/s390xcpuid.S
index 0681534..3402a24 100644
--- a/crypto/s390xcpuid.S
+++ b/crypto/s390xcpuid.S
@@ -93,6 +93,22 @@
 	br	%r14
 .size	OPENSSL_cleanse,.-OPENSSL_cleanse
 
+.globl	OPENSSL_instrument_bus
+.type	OPENSSL_instrument_bus,@function
+.align	16
+OPENSSL_instrument_bus:
+	lghi	%r2,0	/* stub: performs no probes, loads 0 into %r2 */
+	br	%r14	/* return to caller */
+.size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.globl	OPENSSL_instrument_bus2
+.type	OPENSSL_instrument_bus2,@function
+.align	16
+OPENSSL_instrument_bus2:
+	lghi	%r2,0	/* stub: performs no probes, loads 0 into %r2 */
+	br	%r14	/* return to caller */
+.size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
 .section	.init
 	brasl	%r14,OPENSSL_cpuid_setup
 
diff --git a/crypto/sparccpuid.S b/crypto/sparccpuid.S
index ae61f7f..329efcd 100644
--- a/crypto/sparccpuid.S
+++ b/crypto/sparccpuid.S
@@ -397,6 +397,102 @@
 .type	OPENSSL_cleanse,#function
 .size	OPENSSL_cleanse,.-OPENSSL_cleanse
 
+.global	_sparcv9_vis1_instrument_bus
+.align	8
+_sparcv9_vis1_instrument_bus:
+	mov	%o1,%o3					! save cnt
+	.word	0x99410000	!rd	%tick,%o4	! tick
+	mov	%o4,%o5					! lasttick = tick
+	set	0,%g4					! diff
+
+	andn	%o0,63,%g1
+	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
+	.word	0x8143e040	!membar	#Sync
+	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
+	.word	0x8143e040	!membar	#Sync
+	ld	[%o0],%o4
+	add	%o4,%g4,%g4
+	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
+
+.Loop:	.word	0x99410000	!rd	%tick,%o4
+	sub	%o4,%o5,%g4				! diff=tick-lasttick
+	mov	%o4,%o5					! lasttick=tick
+
+	andn	%o0,63,%g1
+	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
+	.word	0x8143e040	!membar	#Sync
+	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
+	.word	0x8143e040	!membar	#Sync
+	ld	[%o0],%o4
+	add	%o4,%g4,%g4
+	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
+	subcc	%o1,1,%o1				! --$cnt
+	bnz	.Loop
+	add	%o0,4,%o0				! ++$out
+
+	retl
+	mov	%o3,%o0
+.type	_sparcv9_vis1_instrument_bus,#function
+.size	_sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus
+
+.global	_sparcv9_vis1_instrument_bus2
+.align	8
+_sparcv9_vis1_instrument_bus2:
+	mov	%o1,%o3					! save cnt
+	sll	%o1,2,%o1				! cnt*=4
+
+	.word	0x99410000	!rd	%tick,%o4	! tick
+	mov	%o4,%o5					! lasttick = tick
+	set	0,%g4					! diff
+
+	andn	%o0,63,%g1
+	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
+	.word	0x8143e040	!membar	#Sync
+	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
+	.word	0x8143e040	!membar	#Sync
+	ld	[%o0],%o4
+	add	%o4,%g4,%g4
+	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
+
+	.word	0x99410000	!rd	%tick,%o4	! tick
+	sub	%o4,%o5,%g4				! diff=tick-lasttick
+	mov	%o4,%o5					! lasttick=tick
+	mov	%g4,%g5					! lastdiff=diff
+.Loop2:
+	andn	%o0,63,%g1
+	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
+	.word	0x8143e040	!membar	#Sync
+	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
+	.word	0x8143e040	!membar	#Sync
+	ld	[%o0],%o4
+	add	%o4,%g4,%g4
+	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
+
+	subcc	%o2,1,%o2				! --max
+	bz	.Ldone2
+	nop
+
+	.word	0x99410000	!rd	%tick,%o4	! tick
+	sub	%o4,%o5,%g4				! diff=tick-lasttick
+	mov	%o4,%o5					! lasttick=tick
+	cmp	%g4,%g5
+	mov	%g4,%g5					! lastdiff=diff
+
+	.word	0x83408000	!rd	%ccr,%g1
+	and	%g1,4,%g1				! isolate zero flag
+	xor	%g1,4,%g1				! flip zero flag
+
+	subcc	%o1,%g1,%o1				! conditional --$cnt
+	bnz	.Loop2
+	add	%o0,%g1,%o0				! conditional ++$out
+
+.Ldone2:
+	srl	%o1,2,%o1
+	retl
+	sub	%o3,%o1,%o0
+.type	_sparcv9_vis1_instrument_bus2,#function
+.size	_sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2
+
 .section	".init",#alloc,#execinstr
 	call	OPENSSL_cpuid_setup
 	nop
diff --git a/crypto/sparcv9cap.c b/crypto/sparcv9cap.c
index ed195ab..ad4b3be 100644
--- a/crypto/sparcv9cap.c
+++ b/crypto/sparcv9cap.c
@@ -11,6 +11,7 @@
 #define SPARCV9_VIS1		(1<<2)
 #define SPARCV9_VIS2		(1<<3)	/* reserved */
 #define SPARCV9_FMADD		(1<<4)	/* reserved for SPARC64 V */
+#define SPARCV9_BLK		(1<<5)	/* VIS1 block copy */
 
 static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED;
 
@@ -31,6 +32,8 @@
 unsigned long	_sparcv9_vis1_instrument(void);
 void		_sparcv9_vis2_probe(void);
 void		_sparcv9_fmadd_probe(void);
+size_t		_sparcv9_vis1_instrument_bus(unsigned int *,size_t);	/* out, cnt */
+size_t		_sparcv9_vis1_instrument_bus2(unsigned int *,size_t,size_t);	/* out, cnt, max */
 
 unsigned long OPENSSL_rdtsc(void)
 	{
@@ -44,6 +47,24 @@
 		return _sparcv9_rdtick();
 	}
 
+size_t OPENSSL_instrument_bus(unsigned int *out,size_t cnt)
+	{	/* run the probe only when BLK is set and TICK_PRIVILEGED is clear */
+	if ((OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==
+			SPARCV9_BLK)	/* inner parens required: == binds tighter than & */
+		return _sparcv9_vis1_instrument_bus(out,cnt);
+	else
+		return 0;	/* capability missing: nothing recorded */
+	}
+
+size_t OPENSSL_instrument_bus2(unsigned int *out,size_t cnt,size_t max)
+	{	/* run the probe only when BLK is set and TICK_PRIVILEGED is clear */
+	if ((OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==
+			SPARCV9_BLK)	/* inner parens required: == binds tighter than & */
+		return _sparcv9_vis1_instrument_bus2(out,cnt,max);
+	else
+		return 0;	/* capability missing: nothing recorded */
+	}
+
 #if 0 && defined(__sun) && defined(__SVR4)
 /* This code path is disabled, because of incompatibility of
  * libdevinfo.so.1 and libmalloc.so.1 (see below for details)
@@ -112,7 +133,7 @@
 	if (sysinfo(SI_ISALIST,si,sizeof(si))>0)
 		{
 		if (strstr(si,"+vis"))
-			OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
+			OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
 		if (strstr(si,"+vis2"))
 			{
 			OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
@@ -169,7 +190,6 @@
 	char *e;
 	struct sigaction	common_act,ill_oact,bus_oact;
 	sigset_t		all_masked,oset;
-	int			sig;
 	static int trigger=0;
 
 	if (trigger) return;
@@ -211,7 +231,7 @@
 	if (sigsetjmp(common_jmp,1) == 0)
 		{
 		_sparcv9_vis1_probe();
-		OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
+		OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
 		/* detect UltraSPARC-Tx, see sparccpud.S for details... */
 		if (_sparcv9_vis1_instrument() >= 12)
 			OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1|SPARCV9_PREFER_FPU);
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index c96821a..ecfcfc7 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -9,8 +9,9 @@
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
 
-if ($win64)	{ $arg1="%rcx"; $arg2="%rdx"; }
-else		{ $arg1="%rdi"; $arg2="%rsi"; }
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
+				 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
+
 print<<___;
 .extern		OPENSSL_cpuid_setup
 .section	.init
@@ -228,5 +229,95 @@
 	ret
 .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 ___
+{
+my $out="%r10";		# working copy of the output pointer
+my $cnt="%rcx";		# remaining element count
+my $max="%r11";		# saved cnt in bus, probe budget in bus2
+my $lasttick="%r8d";	# previous rdtsc low word
+my $lastdiff="%r9d";	# previous tick delta
+my $redzone=$win64?8:-8;	# %rsp offset where bus2 parks cnt; must test \$win64, not a bareword
+
+print<<___;
+.globl	OPENSSL_instrument_bus
+.type	OPENSSL_instrument_bus,\@abi-omnipotent
+.align	16
+OPENSSL_instrument_bus:
+	mov	$arg1,$out	# tribute to Win64
+	mov	$arg2,$cnt
+	mov	$arg2,$max
+
+	rdtsc			# collect 1st tick
+	mov	%eax,$lasttick	# lasttick = tick
+	mov	\$0,$lastdiff	# lastdiff = 0
+	clflush	($out)
+	lock
+	add	$lastdiff,($out)
+	jmp	.Loop
+.align	16
+.Loop:	rdtsc
+	mov	%eax,%edx
+	sub	$lasttick,%eax
+	mov	%edx,$lasttick
+	mov	%eax,$lastdiff
+	clflush	($out)
+	lock
+	add	%eax,($out)
+	lea	4($out),$out
+	sub	\$1,$cnt
+	jnz	.Loop
+
+	mov	$max,%rax
+	ret
+.size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.globl	OPENSSL_instrument_bus2
+.type	OPENSSL_instrument_bus2,\@abi-omnipotent
+.align	16
+OPENSSL_instrument_bus2:
+	mov	$arg1,$out	# tribute to Win64
+	mov	$arg2,$cnt
+	mov	$arg3,$max
+	mov	$cnt,$redzone(%rsp)
+
+	rdtsc			# collect 1st tick
+	mov	%eax,$lasttick	# lasttick = tick
+	mov	\$0,$lastdiff	# lastdiff = 0
+
+	clflush	($out)
+	lock
+	add	$lastdiff,($out)
+
+	rdtsc			# collect 1st diff
+	mov	%eax,%edx
+	sub	$lasttick,%eax	# diff
+	mov	%edx,$lasttick	# lasttick = tick
+	mov	%eax,$lastdiff	# lastdiff = diff
+.Loop2:
+	clflush	($out)
+	lock
+	add	%eax,($out)	# accumulate diff
+
+	sub	\$1,$max
+	jz	.Ldone2
+
+	rdtsc
+	mov	%eax,%edx
+	sub	$lasttick,%eax	# diff
+	mov	%edx,$lasttick	# lasttick = tick
+	cmp	$lastdiff,%eax
+	mov	%eax,$lastdiff	# lastdiff = diff
+	mov	\$0,%edx
+	setne	%dl
+	sub	%rdx,$cnt	# conditional --$cnt
+	lea	($out,%rdx,4),$out	# conditional ++$out
+	jnz	.Loop2
+
+.Ldone2:
+	mov	$redzone(%rsp),%rax
+	sub	$cnt,%rax
+	ret
+.size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+___
+}
 
 close STDOUT;	# flush
diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl
index a7464af..0513398 100644
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -307,6 +307,108 @@
 	&ret	();
 &function_end_B("OPENSSL_cleanse");
 
+{
+my $lasttick = "esi";
+my $lastdiff = "ebx";
+my $out = "edi";
+my $cnt = "ecx";
+my $max = "ebp";
+
+&function_begin("OPENSSL_instrument_bus");	# (out,cnt): add one TSC delta to each of cnt dwords at out
+    &mov	("eax",0);			# default return value: 0
+    if ($sse2) {
+	&picmeup("edx","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"edx"),4);	# test bit 4 of the capability word
+	&jnc	(&label("nogo"));	# no TSC
+	&bt	(&DWP(0,"edx"),19);	# test bit 19 of the capability word
+	&jnc	(&label("nogo"));	# no CLFLUSH
+
+	&mov	($out,&wparam(0));	# load arguments
+	&mov	($cnt,&wparam(1));
+
+	# collect 1st tick
+	&rdtsc	();
+	&mov	($lasttick,"eax");	# lasttick = tick
+	&mov	($lastdiff,0);		# lastdiff = 0
+	&clflush(&DWP(0,$out));		# flush the line holding *$out
+	&lock	();
+	&add	(&DWP(0,$out),$lastdiff);	# locked add of 0 to *$out
+	&jmp	(&label("loop"));
+
+&set_label("loop",16);
+	&rdtsc	();
+	&mov	("edx","eax");		# put aside tick (yes, I neglect edx)
+	&sub	("eax",$lasttick);	# diff
+	&mov	($lasttick,"edx");	# lasttick = tick
+	&mov	($lastdiff,"eax");	# lastdiff = diff
+	&clflush(&DWP(0,$out));		# flush the line holding *$out
+	&lock	();
+	&add	(&DWP(0,$out),"eax");	# accumulate diff
+	&lea	($out,&DWP(4,$out));	# ++$out
+	&sub	($cnt,1);		# --$cnt
+	&jnz	(&label("loop"));
+
+	&mov	("eax",&wparam(1));	# return cnt as passed by the caller
+&set_label("nogo");
+    }
+&function_end("OPENSSL_instrument_bus");
+
+&function_begin("OPENSSL_instrument_bus2");	# (out,cnt,max): accumulate TSC deltas, step out only when delta changes
+    &mov	("eax",0);			# default return value: 0
+    if ($sse2) {
+	&picmeup("edx","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"edx"),4);	# test bit 4 of the capability word
+	&jnc	(&label("nogo"));	# no TSC
+	&bt	(&DWP(0,"edx"),19);	# test bit 19 of the capability word
+	&jnc	(&label("nogo"));	# no CLFLUSH
+
+	&mov	($out,&wparam(0));	# load arguments
+	&mov	($cnt,&wparam(1));
+	&mov	($max,&wparam(2));
+
+	&rdtsc	();			# collect 1st tick
+	&mov	($lasttick,"eax");	# lasttick = tick
+	&mov	($lastdiff,0);		# lastdiff = 0
+
+	&clflush(&DWP(0,$out));		# flush the line holding *$out
+	&lock	();
+	&add	(&DWP(0,$out),$lastdiff);	# locked add of 0 to *$out
+
+	&rdtsc	();			# collect 1st diff
+	&mov	("edx","eax");		# put aside tick (yes, I neglect edx)
+	&sub	("eax",$lasttick);	# diff
+	&mov	($lasttick,"edx");	# lasttick = tick
+	&mov	($lastdiff,"eax");	# lastdiff = diff
+	&jmp	(&label("loop2"));
+
+&set_label("loop2",16);
+	&clflush(&DWP(0,$out));		# flush the line holding *$out
+	&lock	();
+	&add	(&DWP(0,$out),"eax");	# accumulate diff
+
+	&sub	($max,1);		# --$max
+	&jz	(&label("done2"));	# probe budget used up
+
+	&rdtsc	();
+	&mov	("edx","eax");		# put aside tick (yes, I neglect edx)
+	&sub	("eax",$lasttick);	# diff
+	&mov	($lasttick,"edx");	# lasttick = tick
+	&cmp	("eax",$lastdiff);	# diff vs. lastdiff
+	&mov	($lastdiff,"eax");	# lastdiff = diff
+	&mov	("edx",0);		# mov keeps the flags set by cmp
+	&setne	("dl");			# edx = (diff != lastdiff)
+	&sub	($cnt,"edx");		# conditional --$cnt
+	&lea	($out,&DWP(0,$out,"edx",4));	# conditional ++$out
+	&jnz	(&label("loop2"));	# flags come from the sub above; lea leaves them alone
+
+&set_label("done2");
+	&mov	("eax",&wparam(1));	# cnt as passed by the caller
+	&sub	("eax",$cnt);		# minus what is left over
+&set_label("nogo");
+    }
+&function_end("OPENSSL_instrument_bus2");
+}
+
 &initseg("OPENSSL_cpuid_setup");
 
 &asm_finish();
diff --git a/doc/crypto/OPENSSL_instrument_bus.pod b/doc/crypto/OPENSSL_instrument_bus.pod
new file mode 100644
index 0000000..539957b
--- /dev/null
+++ b/doc/crypto/OPENSSL_instrument_bus.pod
@@ -0,0 +1,42 @@
+=pod
+
+=head1 NAME
+
+OPENSSL_instrument_bus, OPENSSL_instrument_bus2 - instrument references to memory bus
+
+=head1 SYNOPSIS
+
+ #ifdef OPENSSL_CPUID_OBJ
+ size_t OPENSSL_instrument_bus (unsigned int *vector,size_t num);
+ size_t OPENSSL_instrument_bus2(unsigned int *vector,size_t num,size_t max);
+ #endif
+
+=head1 DESCRIPTION
+
+It was empirically found that timings of references to primary memory
+are subject to irregular, apparently non-deterministic variations. The
+subroutines in question instrument these references for the purpose of
+gathering entropy for a random number generator. To make each probe
+bus-bound, a 'flush cache line' instruction is used between probes. In
+addition, probes are added to B<vector> elements in an atomic or
+interlocked manner, which should contribute additional noise on
+multi-processor systems. This also means that B<vector[num]> should be
+zeroed before invocation (if you want to retrieve actual probe values).
+
+OPENSSL_instrument_bus performs B<num> probes and records the number of
+oscillator cycles every probe took.
+
+OPENSSL_instrument_bus2, on the other hand, B<accumulates> consecutive
+probes with the same value, i.e. in a way it records the duration of
+periods when probe values appeared deterministic. The subroutine
+performs at most B<max> probes in an attempt to fill B<vector[num]>,
+with a B<max> value of 0 meaning "as many as it takes."
+
+=head1 RETURN VALUE
+
+A return value of 0 indicates that the CPU is not capable of performing
+the benchmark, because either the oscillator counter or 'flush cache
+line' is not available on the current platform. For reference, on x86
+'flush cache line' was introduced with the SSE2 extensions.
+
+Otherwise the number of recorded values is returned.