des_quad_cksum() byte order bug fix.
See http://www.pdc.kth.se/kth-krb/

Their solution for CRAY is somewhat awkward.
I'll assume that a "short" is 32 bits on CRAY to avoid the following construct:
#ifdef _CRAY
    typedef struct {
        unsigned int a:32;
        unsigned int b:32;
    } XXX;
#else
    typedef DES_LONG XXX;
#endif
diff --git a/CHANGES b/CHANGES
index a97d3e6..72acb7d 100644
--- a/CHANGES
+++ b/CHANGES
@@ -4,6 +4,10 @@
 
  Changes between 0.9.5 and 0.9.5a  [XX XXX 2000]
 
+  *) des_quad_cksum() byte order bug fix.
+     [Ulf Möller, using the problem description in krb4-0.9.7, where
+      the solution is attributed to Derrick J Brashear <shadow@DEMENTIA.ORG>]
+
   *) Fix so V_ASN1_APP_CHOOSE works again: however its use is strongly
      discouraged.
      [Steve Henson, pointed out by Brian Korver <briank@cs.stanford.edu>]
diff --git a/crypto/des/destest.c b/crypto/des/destest.c
index 9ad4ecb..c929cc8 100644
--- a/crypto/des/destest.c
+++ b/crypto/des/destest.c
@@ -336,8 +336,15 @@
 	unsigned char cbc_in[40];
 	unsigned char cbc_out[40];
 	DES_LONG cs;
-	unsigned char qret[4][4],cret[8];
-	DES_LONG lqret[4];
+	unsigned char cret[8];
+#ifdef _CRAY
+        struct {
+            int a:32;
+            int b:32;
+        } lqret[2];
+#else
+        DES_LONG lqret[4];
+#endif
 	int num;
 	char *str;
 
@@ -701,43 +708,40 @@
 		}
 
 	printf("Doing quad_cksum\n");
-	/* This is obviously done this way especially to puzzle me. Although
-	   quad_cksum returns up to 4 groups of 8 bytes, this test gets it to
-	   produce 2 groups then treats them as 4 groups of 4 bytes.
-	   Ben 13 Feb 1999 */
-	cs=quad_cksum(cbc_data,(des_cblock *)qret,strlen((char *)cbc_data),2,
-		      &cbc_iv);
-
-	{ /* Big-endian fix */
-	static DES_LONG l=1;
-	static unsigned char *c=(unsigned char *)&l;
-	DES_LONG ll;
-
-	j=sizeof(lqret[0])-4;
-	for (i=0; i<4; i++)
-		{
-		lqret[i]=0;
-		memcpy(&(lqret[i]),&(qret[i][0]),4);
-		if (!c[0] && (j > 0))
-			lqret[i]=lqret[i]>>(j*8); /* For Cray */
-		}
-
-	if (!c[0])
-		{
-		ll=lqret[0]^lqret[3];
-		lqret[0]^=ll;
-		lqret[3]^=ll;
-		ll=lqret[1]^lqret[2];
-		lqret[1]^=ll;
-		lqret[2]^=ll;
-		}
-	}
+	cs=quad_cksum(cbc_data,(des_cblock *)lqret,
+		(long)strlen(cbc_data),2,(des_cblock *)cbc_iv);
 	if (cs != 0x70d7a63aL)
 		{
 		printf("quad_cksum error, ret %08lx should be 70d7a63a\n",
 			(unsigned long)cs);
 		err=1;
 		}
+#ifdef _CRAY
+	if (lqret[0].a != 0x327eba8dL)
+		{
+		printf("quad_cksum error, out[0] %08lx is not %08lx\n",
+			(unsigned long)lqret[0].a,0x327eba8dUL);
+		err=1;
+		}
+	if (lqret[0].b != 0x201a49ccL)
+		{
+		printf("quad_cksum error, out[1] %08lx is not %08lx\n",
+			(unsigned long)lqret[0].b,0x201a49ccUL);
+		err=1;
+		}
+	if (lqret[1].a != 0x70d7a63aL)
+		{
+		printf("quad_cksum error, out[2] %08lx is not %08lx\n",
+			(unsigned long)lqret[1].a,0x70d7a63aUL);
+		err=1;
+		}
+	if (lqret[1].b != 0x501c2c26L)
+		{
+		printf("quad_cksum error, out[3] %08lx is not %08lx\n",
+			(unsigned long)lqret[1].b,0x501c2c26UL);
+		err=1;
+		}
+#else
 	if (lqret[0] != 0x327eba8dL)
 		{
 		printf("quad_cksum error, out[0] %08lx is not %08lx\n",
@@ -763,6 +767,7 @@
 		err=1;
 		}
 #endif
+#endif
 
 	printf("input word alignment test");
 	for (i=0; i<4; i++)
diff --git a/crypto/des/qud_cksm.c b/crypto/des/qud_cksm.c
index 6ce8c61..5f0ec53 100644
--- a/crypto/des/qud_cksm.c
+++ b/crypto/des/qud_cksm.c
@@ -80,10 +80,14 @@
 	int i;
 	long l;
 	const unsigned char *cp;
-	unsigned char *lp;
+#ifdef _CRAY
+	short *lp;
+#else
+	DES_LONG *lp;
+#endif
 
 	if (out_count < 1) out_count=1;
-	lp = &(output[0])[0];
+	lp = (DES_LONG *) &(output[0])[0];
 
 	z0=Q_B0((*seed)[0])|Q_B1((*seed)[1])|Q_B2((*seed)[2])|Q_B3((*seed)[3]);
 	z1=Q_B0((*seed)[4])|Q_B1((*seed)[5])|Q_B2((*seed)[6])|Q_B3((*seed)[7]);
@@ -114,25 +118,10 @@
 			}
 		if (lp != NULL)
 			{
-			/* I believe I finally have things worked out.
-			 * The MIT library assumes that the checksum
-			 * is one huge number and it is returned in a
-			 * host dependant byte order.
-			 */
-			static DES_LONG ltmp=1;
-			static unsigned char *c=(unsigned char *)&ltmp;
-
-			if (c[0])
-				{
-				l2c(z0,lp);
-				l2c(z1,lp);
-				}
-			else
-				{
-				lp = &(output[out_count-i-1])[0];
-				l2n(z1,lp);
-				l2n(z0,lp);
-				}
+			/* The MIT library assumes that the checksum is
+			 * composed of 2*out_count 32 bit ints */
+			*lp++ = z0;
+			*lp++ = z1;
 			}
 		}
 	return(z0);