This update gets endianness-neutrality right and adds the second required
entry point, md5_block_asm_data_order.
diff --git a/crypto/md5/asm/md5-ia64.S b/crypto/md5/asm/md5-ia64.S
index 9002632..73273fa 100644
--- a/crypto/md5/asm/md5-ia64.S
+++ b/crypto/md5/asm/md5-ia64.S
@@ -86,6 +86,9 @@
 #define	pPad2		p12
 #define	pPad3		p13
 #define	pSkip		p8
+//	The two predicates below shall remain constant throughout the whole routine
+#define	pDataOrder	p14
+#define	pHostOrder	p15
 
 #define	A_		out24
 #define	B_		out25
@@ -159,6 +162,11 @@
 #define _NOUTPUT	0
 #define	_NROTATE	24	/* this must be <= _NINPUTS */
 
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+#define	ADDP	addp4
+#else
+#define	ADDP	add
+#endif
 
 //	Macros for getting the left and right portions of little-endian words
 
@@ -225,78 +233,8 @@
 #define	LCSave		r21
 #define	PFSSave		r20
 #define	PRSave		r22
-#define	pAgain		p14
-#define	pOff		p14
-
-	.rodata
-	// Values are specified as bytes to ensure they are
-	// in little-endian byte-order.
-	.align 4
-md5_round_constants:
-	data1 0x78, 0xa4, 0x6a, 0xd7	//     0
-	data1 0x56, 0xb7, 0xc7, 0xe8	//     1
-	data1 0xdb, 0x70, 0x20, 0x24	//     2
-	data1 0xee, 0xce, 0xbd, 0xc1	//     3
-	data1 0xaf, 0x0f, 0x7c, 0xf5	//     4
-	data1 0x2a, 0xc6, 0x87, 0x47	//     5
-	data1 0x13, 0x46, 0x30, 0xa8	//     6
-	data1 0x01, 0x95, 0x46, 0xfd	//     7
-	data1 0xd8, 0x98, 0x80, 0x69	//     8
-	data1 0xaf, 0xf7, 0x44, 0x8b	//     9
-	data1 0xb1, 0x5b, 0xff, 0xff	//    10
-	data1 0xbe, 0xd7, 0x5c, 0x89	//    11
-	data1 0x22, 0x11, 0x90, 0x6b	//    12
-	data1 0x93, 0x71, 0x98, 0xfd	//    13
-	data1 0x8e, 0x43, 0x79, 0xa6	//    14
-	data1 0x21, 0x08, 0xb4, 0x49	//    15
-	data1 0x62, 0x25, 0x1e, 0xf6	//    16
-	data1 0x40, 0xb3, 0x40, 0xc0	//    17
-	data1 0x51, 0x5a, 0x5e, 0x26	//    18
-	data1 0xaa, 0xc7, 0xb6, 0xe9	//    19
-	data1 0x5d, 0x10, 0x2f, 0xd6	//    20
-	data1 0x53, 0x14, 0x44, 0x02	//    21
-	data1 0x81, 0xe6, 0xa1, 0xd8	//    22
-	data1 0xc8, 0xfb, 0xd3, 0xe7	//    23
-	data1 0xe6, 0xcd, 0xe1, 0x21	//    24
-	data1 0xd6, 0x07, 0x37, 0xc3	//    25
-	data1 0x87, 0x0d, 0xd5, 0xf4	//    26
-	data1 0xed, 0x14, 0x5a, 0x45	//    27
-	data1 0x05, 0xe9, 0xe3, 0xa9	//    28
-	data1 0xf8, 0xa3, 0xef, 0xfc	//    29
-	data1 0xd9, 0x02, 0x6f, 0x67	//    30
-	data1 0x8a, 0x4c, 0x2a, 0x8d	//    31
-	data1 0x42, 0x39, 0xfa, 0xff	//    32
-	data1 0x81, 0xf6, 0x71, 0x87	//    33
-	data1 0x22, 0x61, 0x9d, 0x6d	//    34
-	data1 0x0c, 0x38, 0xe5, 0xfd	//    35
-	data1 0x44, 0xea, 0xbe, 0xa4	//    36
-	data1 0xa9, 0xcf, 0xde, 0x4b	//    37
-	data1 0x60, 0x4b, 0xbb, 0xf6	//    38
-	data1 0x70, 0xbc, 0xbf, 0xbe	//    39
-	data1 0xc6, 0x7e, 0x9b, 0x28	//    40
-	data1 0xfa, 0x27, 0xa1, 0xea	//    41
-	data1 0x85, 0x30, 0xef, 0xd4	//    42
-	data1 0x05, 0x1d, 0x88, 0x04	//    43
-	data1 0x39, 0xd0, 0xd4, 0xd9	//    44
-	data1 0xe5, 0x99, 0xdb, 0xe6	//    45
-	data1 0xf8, 0x7c, 0xa2, 0x1f	//    46
-	data1 0x65, 0x56, 0xac, 0xc4	//    47
-	data1 0x44, 0x22, 0x29, 0xf4	//    48
-	data1 0x97, 0xff, 0x2a, 0x43	//    49
-	data1 0xa7, 0x23, 0x94, 0xab	//    50
-	data1 0x39, 0xa0, 0x93, 0xfc	//    51
-	data1 0xc3, 0x59, 0x5b, 0x65	//    52
-	data1 0x92, 0xcc, 0x0c, 0x8f	//    53
-	data1 0x7d, 0xf4, 0xef, 0xff	//    54
-	data1 0xd1, 0x5d, 0x84, 0x85	//    55
-	data1 0x4f, 0x7e, 0xa8, 0x6f	//    56
-	data1 0xe0, 0xe6, 0x2c, 0xfe	//    57
-	data1 0x14, 0x43, 0x01, 0xa3	//    58
-	data1 0xa1, 0x11, 0x08, 0x4e	//    59
-	data1 0x82, 0x7e, 0x53, 0xf7	//    60
-	data1 0x35, 0xf2, 0x3a, 0xbd	//    61
-	data1 0xbb, 0xd2, 0xd7, 0x2a	//    62
-	data1 0x91, 0xd3, 0x86, 0xeb	//    63
+#define	pAgain		p63
+#define	pOff		p63
 
 	.text
 
@@ -320,53 +258,48 @@
 
    */
 
+	.type	md5_block_asm_data_order, @function
+	.global	md5_block_asm_data_order
+	.align	32
+	.proc	md5_block_asm_data_order
+md5_block_asm_data_order:
+{	.mib
+	cmp.eq	pDataOrder,pHostOrder = r0,r0	// r0 == r0 always holds: pDataOrder = 1, pHostOrder = 0
+	br.sptk.many	.md5_block	// join the common body at .md5_block (inside md5_block_asm_host_order)
+};;
+	.endp	md5_block_asm_data_order
+
 	.type	md5_block_asm_host_order, @function
 	.global	md5_block_asm_host_order
 
-	.align	32
 	.proc	md5_block_asm_host_order
 md5_block_asm_host_order:
 	.prologue
-#ifndef __LP64__
+{	.mib
+	cmp.eq	pHostOrder,pDataOrder = r0,r0
+};;
+.md5_block:
 {	.mmi
-	.save ar.pfs, PFSSave
+	.save	ar.pfs, PFSSave
 	alloc	PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
-	addp4	DPtrIn = 0, DPtrIn
-	addp4	CtxPtr0 = 0, CtxPtr0
+	ADDP	CtxPtr1 = 8, CtxPtr0
+	mov	CTable = ip
+}
+{	.mmi
+	ADDP	DPtrIn = 0, DPtrIn
+	ADDP	CtxPtr0 = 0, CtxPtr0
+	.save	ar.lc, LCSave
+	mov	LCSave = ar.lc
 }
 ;;
+.pred.rel	"mutex",pDataOrder,pHostOrder
 {	.mmi
-	nop	0x0
+(pDataOrder)	add	CTable = .md5_tbl_data_order#-.md5_block#, CTable
+(pHostOrder)	add	CTable = .md5_tbl_host_order#-.md5_block#, CTable	
 	and	InAlign = 0x3, DPtrIn
-	.save ar.lc, LCSave
-	mov	LCSave = ar.lc
 }
-#else
-{	.mmi
-	.save ar.pfs, PFSSave
-	alloc	PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
-	and	InAlign = 0x3, DPtrIn
-	.save ar.lc, LCSave
-	mov	LCSave = ar.lc
-}
-#endif
 
 {	.mmi
-	addl	CTable = @ltoffx(md5_round_constants), gp
-	;;
-	ld8.mov	CTable = [CTable], md5_round_constants // native byte-order
-	add	CtxPtr1 = 8, CtxPtr0
-}
-#ifdef B_ENDIAN
-{
-	.mmi
-	rum	psr.be		// switch to little-endian mode
-	nop.m	0x0
-	nop.i	0x0
-}
-#endif
-;;
-{	.mmi
 	ld4	AccumA = [CtxPtr0], 4
 	ld4	AccumC = [CtxPtr1], 4
 	.save pr, PRSave
@@ -379,15 +312,12 @@
 	ld4	AccumD = [CtxPtr1]
 	dep	DPtr_ = 0, DPtrIn, 0, 2
 } ;;
-
-{	.mmi
+#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
+(pDataOrder)	rum	psr.be;;	// switch to little-endian
+#endif
+{	.mmb
 	ld4	CTable0 = [CTable], 4
 	cmp.ne	pOff, p0 = 0, InAlign
-} ;;
-
-{	.mib
-	nop.m 0x0
-	nop.i 0x0
 (pOff)	br.cond.spnt.many .md5_unaligned
 } ;;
 
@@ -431,9 +361,9 @@
 } ;;
 
 .md5_exit:
-//	Note that we switch back to the entry endianess AFTER storing so
-//	that the memory image of the hash is preserved.
-
+#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
+(pDataOrder)	sum	psr.be;;	// switch back to big-endian mode
+#endif
 {	.mmi
 	st4	[CtxPtr0] = AccumB, -4
 	st4	[CtxPtr1] = AccumD, -4
@@ -445,9 +375,6 @@
 	mov	ar.lc = LCSave
 } ;;
 {	.mib
-#ifdef B_ENDIAN
-	sum	psr.be		// switch back to big-endian mode
-#endif
 	mov	ar.pfs = PFSSave
 	br.ret.sptk.few	rp
 } ;;
@@ -1001,9 +928,99 @@
 	nop 0x0 ;							\
 	nop 0x0 ;							\
 	br.cond.sptk.many md5_digest_GHI ;				\
-} ;									\
+} ;;									\
 	.endp md5digestBlock ## offset
 
 MD5FBLOCK(1)
 MD5FBLOCK(2)
 MD5FBLOCK(3)
+
+	.align 64
+	.type md5_constants, @object
+md5_constants:
+.md5_tbl_data_order:			// To ensure little-endian data
+					// order, code as bytes.
+	data1 0x78, 0xa4, 0x6a, 0xd7	//     0
+	data1 0x56, 0xb7, 0xc7, 0xe8	//     1
+	data1 0xdb, 0x70, 0x20, 0x24	//     2
+	data1 0xee, 0xce, 0xbd, 0xc1	//     3
+	data1 0xaf, 0x0f, 0x7c, 0xf5	//     4
+	data1 0x2a, 0xc6, 0x87, 0x47	//     5
+	data1 0x13, 0x46, 0x30, 0xa8	//     6
+	data1 0x01, 0x95, 0x46, 0xfd	//     7
+	data1 0xd8, 0x98, 0x80, 0x69	//     8
+	data1 0xaf, 0xf7, 0x44, 0x8b	//     9
+	data1 0xb1, 0x5b, 0xff, 0xff	//    10
+	data1 0xbe, 0xd7, 0x5c, 0x89	//    11
+	data1 0x22, 0x11, 0x90, 0x6b	//    12
+	data1 0x93, 0x71, 0x98, 0xfd	//    13
+	data1 0x8e, 0x43, 0x79, 0xa6	//    14
+	data1 0x21, 0x08, 0xb4, 0x49	//    15
+	data1 0x62, 0x25, 0x1e, 0xf6	//    16
+	data1 0x40, 0xb3, 0x40, 0xc0	//    17
+	data1 0x51, 0x5a, 0x5e, 0x26	//    18
+	data1 0xaa, 0xc7, 0xb6, 0xe9	//    19
+	data1 0x5d, 0x10, 0x2f, 0xd6	//    20
+	data1 0x53, 0x14, 0x44, 0x02	//    21
+	data1 0x81, 0xe6, 0xa1, 0xd8	//    22
+	data1 0xc8, 0xfb, 0xd3, 0xe7	//    23
+	data1 0xe6, 0xcd, 0xe1, 0x21	//    24
+	data1 0xd6, 0x07, 0x37, 0xc3	//    25
+	data1 0x87, 0x0d, 0xd5, 0xf4	//    26
+	data1 0xed, 0x14, 0x5a, 0x45	//    27
+	data1 0x05, 0xe9, 0xe3, 0xa9	//    28
+	data1 0xf8, 0xa3, 0xef, 0xfc	//    29
+	data1 0xd9, 0x02, 0x6f, 0x67	//    30
+	data1 0x8a, 0x4c, 0x2a, 0x8d	//    31
+	data1 0x42, 0x39, 0xfa, 0xff	//    32
+	data1 0x81, 0xf6, 0x71, 0x87	//    33
+	data1 0x22, 0x61, 0x9d, 0x6d	//    34
+	data1 0x0c, 0x38, 0xe5, 0xfd	//    35
+	data1 0x44, 0xea, 0xbe, 0xa4	//    36
+	data1 0xa9, 0xcf, 0xde, 0x4b	//    37
+	data1 0x60, 0x4b, 0xbb, 0xf6	//    38
+	data1 0x70, 0xbc, 0xbf, 0xbe	//    39
+	data1 0xc6, 0x7e, 0x9b, 0x28	//    40
+	data1 0xfa, 0x27, 0xa1, 0xea	//    41
+	data1 0x85, 0x30, 0xef, 0xd4	//    42
+	data1 0x05, 0x1d, 0x88, 0x04	//    43
+	data1 0x39, 0xd0, 0xd4, 0xd9	//    44
+	data1 0xe5, 0x99, 0xdb, 0xe6	//    45
+	data1 0xf8, 0x7c, 0xa2, 0x1f	//    46
+	data1 0x65, 0x56, 0xac, 0xc4	//    47
+	data1 0x44, 0x22, 0x29, 0xf4	//    48
+	data1 0x97, 0xff, 0x2a, 0x43	//    49
+	data1 0xa7, 0x23, 0x94, 0xab	//    50
+	data1 0x39, 0xa0, 0x93, 0xfc	//    51
+	data1 0xc3, 0x59, 0x5b, 0x65	//    52
+	data1 0x92, 0xcc, 0x0c, 0x8f	//    53
+	data1 0x7d, 0xf4, 0xef, 0xff	//    54
+	data1 0xd1, 0x5d, 0x84, 0x85	//    55
+	data1 0x4f, 0x7e, 0xa8, 0x6f	//    56
+	data1 0xe0, 0xe6, 0x2c, 0xfe	//    57
+	data1 0x14, 0x43, 0x01, 0xa3	//    58
+	data1 0xa1, 0x11, 0x08, 0x4e	//    59
+	data1 0x82, 0x7e, 0x53, 0xf7	//    60
+	data1 0x35, 0xf2, 0x3a, 0xbd	//    61
+	data1 0xbb, 0xd2, 0xd7, 0x2a	//    62
+	data1 0x91, 0xd3, 0x86, 0xeb	//    63
+
+.md5_tbl_host_order:			// Host (OS-native) data order, which
+					// may itself be little-endian.
+	data4 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee	// 0
+	data4 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501	// 4
+	data4 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be	// 8
+	data4 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821	// 12
+	data4 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa	// 16
+	data4 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8	// 20
+	data4 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed	// 24
+	data4 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a	// 28
+	data4 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c	// 32
+	data4 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70	// 36
+	data4 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05	// 40
+	data4 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665	// 44
+	data4 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039	// 48
+	data4 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1	// 52
+	data4 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1	// 56
+	data4 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391	// 60
+.size	md5_constants#,64*4*2