| /* ==================================================================== |
| * Copyright (c) 2010 The OpenSSL Project. All rights reserved. |
| * |
| * Redistribution and use is governed by OpenSSL license. |
| * ==================================================================== |
| */ |
| |
| #include <openssl/modes.h> |
| |
| |
| #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) |
| typedef __int64 i64; |
| typedef unsigned __int64 u64; |
| #define U64(C) C##UI64 |
| #elif defined(__arch64__) |
| typedef long i64; |
| typedef unsigned long u64; |
| #define U64(C) C##UL |
| #else |
| typedef long long i64; |
| typedef unsigned long long u64; |
| #define U64(C) C##ULL |
| #endif |
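/*
 * Illustrative note (not in the original source): U64 appends the
 * unsigned 64-bit constant suffix appropriate for the compiler in
 * use, e.g.
 *
 *	u64 mask = U64(0xffffffff00000000);	// ...ULL under gcc
 */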
| |
| typedef unsigned int u32; |
| typedef unsigned char u8; |
| |
| #define STRICT_ALIGNMENT 1 |
| #if defined(__i386) || defined(__i386__) || \ |
| defined(__x86_64) || defined(__x86_64__) || \ |
| defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ |
| defined(__s390__) || defined(__s390x__) |
| # undef STRICT_ALIGNMENT |
| #endif |
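/*
 * STRICT_ALIGNMENT is cleared above on platforms known to tolerate
 * misaligned word-sized loads and stores; GETU32/PUTU32 below can
 * then access a whole u32 at once instead of assembling the value
 * byte by byte.
 */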
| |
#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
| #if defined(__GNUC__) && __GNUC__>=2 |
| # if defined(__x86_64) || defined(__x86_64__) |
| # define BSWAP8(x) ({ u64 ret=(x); \ |
| asm volatile ("bswapq %0" \ |
| : "+r"(ret)); ret; }) |
| # define BSWAP4(x) ({ u32 ret=(x); \ |
| asm volatile ("bswapl %0" \ |
| : "+r"(ret)); ret; }) |
| # elif (defined(__i386) || defined(__i386__)) |
   /* Note the reversed naming: "lo" is loaded with the upper half of
    * x and "hi" with the lower half, so that once both halves are
    * byte-swapped each already holds the value destined for its final
    * position in the reassembly below. */
# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x);	\
			asm volatile ("bswapl %0; bswapl %1"	\
			: "+r"(hi),"+r"(lo));	\
			(u64)hi<<32|lo; })
| # define BSWAP4(x) ({ u32 ret=(x); \ |
| asm volatile ("bswapl %0" \ |
| : "+r"(ret)); ret; }) |
| # endif |
| #elif defined(_MSC_VER) |
| # if _MSC_VER>=1300 |
| # pragma intrinsic(_byteswap_uint64,_byteswap_ulong) |
| # define BSWAP8(x) _byteswap_uint64((u64)(x)) |
| # define BSWAP4(x) _byteswap_ulong((u32)(x)) |
| # elif defined(_M_IX86) |
    /* the result is left in EAX, which MSVC treats as the return
     * value (it emits warning C4035 for the missing return statement) */
    __inline u32 _bswap4(u32 val) {
	_asm	mov	eax,val
	_asm	bswap	eax
    }
| # define BSWAP4(x) _bswap4(x) |
| # endif |
| #endif |
| #endif |
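/*
 * Usage sketch (illustrative only): when defined, BSWAP4/BSWAP8
 * reverse the byte order of a 32-/64-bit value, e.g.
 *
 *	#ifdef BSWAP4
 *	u32 v = BSWAP4(0x01020304);		// v == 0x04030201
 *	#endif
 *	#ifdef BSWAP8
 *	u64 w = BSWAP8(U64(0x0102030405060708));// w == 0x0807060504030201
 *	#endif
 */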
| |
| #if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) |
| #define GETU32(p) BSWAP4(*(const u32 *)(p)) |
| #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) |
| #else |
| #define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) |
| #define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) |
| #endif |
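/*
 * Usage sketch (illustrative only): both variants of GETU32/PUTU32
 * honour the same big-endian contract, e.g.
 *
 *	u8 buf[4] = {0x01,0x02,0x03,0x04};
 *	u32 v = GETU32(buf);	// v == 0x01020304 on any host
 *	PUTU32(buf,v+1);	// buf becomes {0x01,0x02,0x03,0x05}
 */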
| |
| /* GCM definitions */ |
| |
| typedef struct { u64 hi,lo; } u128; |
| |
| #ifdef TABLE_BITS |
| #undef TABLE_BITS |
| #endif |
| /* |
| * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should |
| * never be set to 8. 8 is effectively reserved for testing purposes. |
| * TABLE_BITS>1 are lookup-table-driven implementations referred to as |
| * "Shoup's" in GCM specification. In other words OpenSSL does not cover |
| * whole spectrum of possible table driven implementations. Why? In |
| * non-"Shoup's" case memory access pattern is segmented in such manner, |
| * that it's trivial to see that cache timing information can reveal |
| * fair portion of intermediate hash value. Given that ciphertext is |
| * always available to attacker, it's possible for him to attempt to |
| * deduce secret parameter H and if successful, tamper with messages |
| * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's |
| * not as trivial, but there is no reason to believe that it's resistant |
| * to cache-timing attack. And the thing about "8-bit" implementation is |
| * that it consumes 16 (sixteen) times more memory, 4KB per individual |
| * key + 1KB shared. Well, on pros side it should be twice as fast as |
| * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version |
| * was observed to run ~75% faster, closer to 100% for commercial |
| * compilers... Yet "4-bit" procedure is preferred, because it's |
| * believed to provide better security-performance balance and adequate |
| * all-round performance. "All-round" refers to things like: |
| * |
| * - shorter setup time effectively improves overall timing for |
| * handling short messages; |
| * - larger table allocation can become unbearable because of VM |
| * subsystem penalties (for example on Windows large enough free |
| * results in VM working set trimming, meaning that consequent |
| * malloc would immediately incur working set expansion); |
| * - larger table has larger cache footprint, which can affect |
| * performance of other code paths (not necessarily even from same |
| * thread in Hyper-Threading world); |
| */ |
| #define TABLE_BITS 4 |
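/*
 * Footprint arithmetic behind the figures above: with TABLE_BITS==4
 * Htable holds 2^4 = 16 u128 entries, i.e. 16*16 = 256 bytes per key,
 * whereas TABLE_BITS==8 would hold 2^8 = 256 entries, i.e. 256*16 =
 * 4KB per key, sixteen times as much.
 */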
| |
| struct gcm128_context { |
	/* The following 6 names follow the GCM specification */
| union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0, |
| Xi,H,len; |
| /* Pre-computed table used by gcm_gmult_* */ |
| #if TABLE_BITS==8 |
| u128 Htable[256]; |
| #else |
| u128 Htable[16]; |
| void (*gmult)(u64 Xi[2],const u128 Htable[16]); |
| void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); |
| #endif |
	unsigned int mres, ares;	/* partial-block residue counters
					 * for message and AAD input */
	block128_f block;		/* underlying block cipher */
	void *key;			/* its key schedule, passed to block */
| }; |