The latest bn_mont.c modification broke the ECDSA test. I had gotten the
math wrong; this is fixed now.
diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c
index 4339aab..5817538 100644
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -243,7 +243,7 @@
 
 	/* mont->ri will be a multiple of the word size and below code
 	 * is kind of BN_rshift(ret,r,mont->ri) equivalent */
-	if (r->top < ri)
+	if (r->top <= ri)
 		{
 		ret->top=0;
 		return(1);
@@ -259,32 +259,26 @@
 
 	rp=ret->d;
 	ap=&(r->d[ri]);
-	nrp=ap;
 
-	/* This 'if' denotes violation of 2*M<r^(n-1) boundary condition
-	 * formulated by C.D.Walter in "Montgomery exponentiation needs
-	 * no final subtractions." Incurred branch can disclose only
-	 * information about modulus length, which is not really secret. */
-	if ((mont->N.d[ri-1]>>(BN_BITS2-2))!=0)
-		{
-		size_t m1,m2;
+	{
+	size_t m1,m2;
 
-		v=bn_sub_words(rp,ap,mont->N.d,ri);
-		/* this -----------------------^^ works even in al<ri case
-		 * thanks to zealous zeroing of top of the vector in the
-		 * beginning. */
+	v=bn_sub_words(rp,ap,np,ri);
+	/* this ----------------^^ works even in al<ri case
+	 * thanks to zealous zeroing of top of the vector in the
+	 * beginning. */
 
-		/* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
-		/* in other words if subtraction result is real, then
-		 * trick unconditional memcpy below to perform in-place
-		 * "refresh" instead of actual copy. */
-		m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);	/* al<ri */
-		m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);	/* al>ri */
-		m1|=m2;			/* (al!=ri) */
-		m1|=(0-(size_t)v);	/* (al!=ri || v) */
-		m1&=~m2;		/* (al!=ri || v) && !al>ri */
-		nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
-		}
+	/* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
+	/* in other words if subtraction result is real, then
+	 * trick unconditional memcpy below to perform in-place
+	 * "refresh" instead of actual copy. */
+	m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);	/* al<ri */
+	m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);	/* al>ri */
+	m1|=m2;			/* (al!=ri) */
+	m1|=(0-(size_t)v);	/* (al!=ri || v) */
+	m1&=~m2;		/* (al!=ri || v) && !al>ri */
+	nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
+	}
 
 	/* 'i<ri' is chosen to eliminate dependency on input data, even
 	 * though it results in redundant copy in al<ri case. */