diff --git a/JHA/.deps/.dirstamp b/JHA/.deps/.dirstamp new file mode 100644 index 0000000..e69de29 diff --git a/JHA/.dirstamp b/JHA/.dirstamp new file mode 100644 index 0000000..e69de29 diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu new file mode 100644 index 0000000..c59e84e --- /dev/null +++ b/JHA/cuda_jha_keccak512.cu @@ -0,0 +1,572 @@ +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +__constant__ uint64_t c_State[25]; +__constant__ uint32_t c_PaddedMessage[18]; + +static __device__ uint32_t cuda_swab32(uint32_t x) +{ + return __byte_perm(x, 0, 0x0123); +} + +// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt +#if __CUDA_ARCH__ >= 350 +__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#else +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +#define U32TO64_LE(p) \ + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + +#define U64TO32_LE(p, v) \ + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint64_t c_keccak_round_constants[24]; + +static __device__ __forceinline__ void +keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ +#pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +__global__ void jackpot_keccak512_gpu_hash_88(int threads, uint32_t startNounce, uint64_t *g_hash) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + + int hashPosition = nounce - startNounce; + + // Nachricht kopieren + uint32_t message[18]; +#pragma unroll 18 + for(int i=0;i<18;i++) + message[i] = c_PaddedMessage[i]; + + // die individuelle Nounce einsetzen + message[1] = cuda_swab32(nounce); + + // State initialisieren + uint64_t keccak_gpu_state[25]; +#pragma unroll 25 + for (int i=0; i<25; i++) + keccak_gpu_state[i] = c_State[i]; + + // den Block einmal gut durchschütteln + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + // das Hash erzeugen + uint32_t hash[16]; + +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); + } + + // fertig + uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; + +#pragma unroll 16 + for(int i=0;i<16;i++) + outpHash[i] = hash[i]; + } +} + +// Setup-Funktionen +__host__ 
void jackpot_keccak512_cpu_init(int thr_id, int threads) +{ + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( c_keccak_round_constants, + host_keccak_round_constants, + sizeof(host_keccak_round_constants), + 0, cudaMemcpyHostToDevice); +} + +#define cKeccakB 1600 +#define cKeccakR 576 + +#define cKeccakR_SizeInBytes (cKeccakR / 8) +#define crypto_hash_BYTES 64 + +#if (cKeccakB == 1600) + typedef unsigned long long UINT64; + typedef UINT64 tKeccakLane; + #define cKeccakNumberOfRounds 24 +#endif + +#define cKeccakLaneSizeInBits (sizeof(tKeccakLane) * 8) + +#define ROL(a, offset) ((((tKeccakLane)a) << ((offset) % cKeccakLaneSizeInBits)) ^ (((tKeccakLane)a) >> (cKeccakLaneSizeInBits-((offset) % cKeccakLaneSizeInBits)))) +#if ((cKeccakB/25) == 8) + #define ROL_mult8(a, offset) ((tKeccakLane)a) +#else + #define ROL_mult8(a, offset) ROL(a, offset) +#endif +void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ); + +const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = +{ + (tKeccakLane)0x0000000000000001ULL, + (tKeccakLane)0x0000000000008082ULL, + (tKeccakLane)0x800000000000808aULL, + (tKeccakLane)0x8000000080008000ULL, + (tKeccakLane)0x000000000000808bULL, + (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008009ULL, + (tKeccakLane)0x000000000000008aULL, + (tKeccakLane)0x0000000000000088ULL, + (tKeccakLane)0x0000000080008009ULL, + (tKeccakLane)0x000000008000000aULL, + (tKeccakLane)0x000000008000808bULL, + (tKeccakLane)0x800000000000008bULL, + (tKeccakLane)0x8000000000008089ULL, + (tKeccakLane)0x8000000000008003ULL, + (tKeccakLane)0x8000000000008002ULL, + (tKeccakLane)0x8000000000000080ULL + #if (cKeccakB >= 400) + , (tKeccakLane)0x000000000000800aULL, + (tKeccakLane)0x800000008000000aULL + #if (cKeccakB >= 800) + , (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008080ULL + #if (cKeccakB == 1600) + , (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008008ULL + #endif + #endif + #endif +}; + +void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ) +{ + + { + while ( --laneCount >= 0 ) + { + state[laneCount] ^= in[laneCount]; + } + } + + { + tKeccakLane Aba, Abe, Abi, Abo, Abu; + tKeccakLane Aga, Age, Agi, Ago, Agu; + tKeccakLane Aka, Ake, Aki, Ako, Aku; + tKeccakLane Ama, Ame, Ami, Amo, Amu; + tKeccakLane Asa, Ase, Asi, Aso, Asu; + tKeccakLane BCa, BCe, BCi, BCo, BCu; + tKeccakLane Da, De, Di, Do, Du; + tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu; + tKeccakLane Ega, Ege, Egi, Ego, Egu; + tKeccakLane Eka, Eke, Eki, Eko, Eku; + tKeccakLane Ema, Eme, Emi, Emo, Emu; + tKeccakLane Esa, Ese, Esi, Eso, Esu; + #define round laneCount + + //copyFromState(A, state) + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( round = 0; round < cKeccakNumberOfRounds; round += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = 
BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (tKeccakLane)KeccakF_RoundConstants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL_mult8(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL_mult8(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL_mult8(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL_mult8(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= 
De;
+ BCu = ROL(Ese, 2);
+ Asa = BCa ^((~BCe)& BCi );
+ Ase = BCe ^((~BCi)& BCo );
+ Asi = BCi ^((~BCo)& BCu );
+ Aso = BCo ^((~BCu)& BCa );
+ Asu = BCu ^((~BCa)& BCe );
+ }
+
+ //copyToState(state, A)
+ state[ 0] = Aba;
+ state[ 1] = Abe;
+ state[ 2] = Abi;
+ state[ 3] = Abo;
+ state[ 4] = Abu;
+ state[ 5] = Aga;
+ state[ 6] = Age;
+ state[ 7] = Agi;
+ state[ 8] = Ago;
+ state[ 9] = Agu;
+ state[10] = Aka;
+ state[11] = Ake;
+ state[12] = Aki;
+ state[13] = Ako;
+ state[14] = Aku;
+ state[15] = Ama;
+ state[16] = Ame;
+ state[17] = Ami;
+ state[18] = Amo;
+ state[19] = Amu;
+ state[20] = Asa;
+ state[21] = Ase;
+ state[22] = Asi;
+ state[23] = Aso;
+ state[24] = Asu;
+
+ #undef round
+ }
+}
+
+__host__ void jackpot_keccak512_cpu_setBlock_88(void *pdata)
+{
+ unsigned long long inlen = 88;
+ const unsigned char *in = (const unsigned char*)pdata;
+
+ tKeccakLane state[5 * 5];
+ unsigned char temp[cKeccakR_SizeInBytes];
+
+ memset( state, 0, sizeof(state) );
+
+ for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes )
+ {
+ KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) );
+ }
+
+ // copy the state after the first round (i.e. after absorbing 72 bytes of input data)
+ // into constant memory
+ cudaMemcpyToSymbol( c_State,
+ state,
+ sizeof(state),
+ 0, cudaMemcpyHostToDevice);
+
+ // padding
+ memcpy( temp, in, (size_t)inlen );
+ temp[inlen++] = 1;
+ memset( temp+inlen, 0, cKeccakR_SizeInBytes - (size_t)inlen );
+ temp[cKeccakR_SizeInBytes-1] |= 0x80;
+
+ // copy the rest of the message and the padding into constant memory
+ cudaMemcpyToSymbol( c_PaddedMessage,
+ temp,
+ cKeccakR_SizeInBytes,
+ 0, cudaMemcpyHostToDevice);
+}
+
+__host__ void jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order)
+{
+ const int threadsperblock = 256;
+
+ // compute how many thread blocks we need
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ // size of the dynamic shared memory region
+ size_t shared_size = 0;
+
+// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+
+ jackpot_keccak512_gpu_hash_88<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
+ MyStreamSynchronize(NULL, order, thr_id);
+}
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
new file mode 100644
index 0000000..0802eb2
--- /dev/null
+++ b/JHA/jackpotcoin.cu
@@ -0,0 +1,173 @@
+
+extern "C"
+{
+#include "sph/sph_keccak.h"
+#include "sph/sph_blake.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_skein.h"
+}
+
+#include "miner.h"
+#include
+
+// from cpu-miner.c
+extern int device_map[8];
+extern bool opt_benchmark;
+
+// memory for input/output of the chained hash functions
+static uint32_t *d_hash[8];
+
+extern void jackpot_keccak512_cpu_init(int thr_id, int threads);
+extern void jackpot_keccak512_cpu_setBlock_88(void *pdata);
+extern void jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
+
+extern void quark_check_cpu_init(int thr_id, int threads);
+extern void quark_check_cpu_setTarget(const void *ptarget);
+extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
+// the original jackpothash function from a miner source file
+inline unsigned int jackpothash(void *state, const void *input)
+{
+ 
sph_blake512_context ctx_blake; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + + uint32_t hash[16]; + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, input, 88); + sph_keccak512_close(&ctx_keccak, hash); + + unsigned int round_mask = ( + (unsigned int)(((unsigned char *)input)[84]) << 0 | + (unsigned int)(((unsigned char *)input)[85]) << 8 | + (unsigned int)(((unsigned char *)input)[86]) << 16 | + (unsigned int)(((unsigned char *)input)[87]) << 24 ); + unsigned int round_max = hash[0] & round_mask; + unsigned int round; + for (round = 0; round < round_max; round++) { + switch (hash[0] & 3) { + case 0: + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, hash, 64); + sph_blake512_close(&ctx_blake, hash); + break; + case 1: + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, hash, 64); + sph_groestl512_close(&ctx_groestl, hash); + break; + case 2: + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, hash, 64); + sph_jh512_close(&ctx_jh, hash); + break; + case 3: + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash); + break; + } + } + memcpy(state, hash, 32); + + return round_max; +} + + +static int bit_population(uint32_t n){ + int c =0; + while(n){ + c += n&1; + n = n>>1; + } + return c; +} + +extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + + // TODO: entfernen für eine Release! Ist nur zum Testen! + if (opt_benchmark) { + ((uint32_t*)ptarget)[7] = 0x00000f; + ((uint32_t*)pdata)[21] = 0x07000000; // round_mask von 7 vorgeben + } + + const uint32_t Htarg = ptarget[7]; + + const int throughput = 256*4096; // 100; + + static bool init[8] = {0,0,0,0,0,0,0,0}; + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + // Konstanten kopieren, Speicher belegen + cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + jackpot_keccak512_cpu_init(thr_id, throughput); + quark_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[22]; + for (int k=0; k < 22; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + unsigned int round_mask = ( + (unsigned int)(((unsigned char *)endiandata)[84]) << 0 | + (unsigned int)(((unsigned char *)endiandata)[85]) << 8 | + (unsigned int)(((unsigned char *)endiandata)[86]) << 16 | + (unsigned int)(((unsigned char *)endiandata)[87]) << 24 ); + + // Zählen wie viele Bits in round_mask gesetzt sind + int bitcount = bit_population(round_mask); + + jackpot_keccak512_cpu_setBlock_88((void*)endiandata); + quark_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // erstes Blake512 Hash mit CUDA + jackpot_keccak512_cpu_hash_88(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + // TODO: hier fehlen jetzt natürlich noch die anderen Hashrunden. + // bei round_mask=7 haben wir eine 1:8 Chance, dass das Hash dennoch + // die Kriterien erfüllt wenn hash[0] & round_mask zufällig 0 ist. 
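+ // (Illustrative sketch, not part of this patch: once GPU kernels for
+ // blake/groestl/jh/skein exist, the per-round dispatch would mirror the
+ // CPU jackpothash() above. The kernel names below are hypothetical.)
+ //
+ // for (unsigned int round = 0; round < round_max; round++) {
+ //     switch (hash[0] & 3) {
+ //     case 0: quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); break;
+ //     case 1: quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); break;
+ //     case 2: quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); break;
+ //     case 3: quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); break;
+ //     }
+ // }
+ //
+ // Note that round_max and hash[0] differ per nonce, so a faithful GPU port
+ // needs per-thread branching inside one kernel rather than this uniform loop.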
+ + // Scan nach Gewinner Hashes auf der GPU + uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + + // diese jackpothash Funktion gibt die Zahl der zusätzlichen Runden zurück + unsigned int rounds = jackpothash(vhash64, endiandata); + + // wir akzeptieren nur solche Hashes wo ausschliesslich Keccak verwendet wurde + if (rounds == 0) { + if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { + + pdata[19] = foundNonce; + *hashes_done = (foundNonce - first_nonce + 1) / (1 << bitcount); + return 1; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); + } + } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = (pdata[19] - first_nonce + 1) / (1 << bitcount); + return 0; +} diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu new file mode 100644 index 0000000..fb85a24 --- /dev/null +++ b/cuda_myriadgroestl.cu @@ -0,0 +1,622 @@ +// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice + +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// it's unfortunate that this is a compile time constant. +#define MAXWELL_OR_FERMI 0 + +// aus cpu-miner.c +extern int device_map[8]; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; + +// diese Struktur wird in der Init Funktion angefordert +static cudaDeviceProp props; + +// globaler Speicher für alle HeftyHashes aller Threads +__constant__ uint32_t pTarget[8]; // Single GPU +extern uint32_t *d_resultNonce[8]; + +__constant__ uint32_t myriadgroestl_gpu_msg[32]; + +// muss expandiert werden +__constant__ uint32_t myr_sha256_gpu_constantTable[64]; +__constant__ uint32_t myr_sha256_gpu_hashTable[8]; + +uint32_t myr_sha256_cpu_hashTable[] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; +uint32_t myr_sha256_cpu_constantTable[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else + // Kepler (Compute 3.5) + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#endif +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 
11) ^ ROTR32(x, 25)) +#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) +#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + +__device__ void myriadgroestl_gpu_sha256(uint32_t *message) +{ + uint32_t W1[16]; + uint32_t W2[16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + + // pre +#pragma unroll 8 + for (int k=0; k < 8; k++) + { + regs[k] = myr_sha256_gpu_hashTable[k]; + hash[k] = regs[k]; + } + +#pragma unroll 16 + for(int k=0;k<16;k++) + W1[k] = SWAB32(message[k]); + +// Progress W1 +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +// Progress W2...W3 +#pragma unroll 3 + for(int k=0;k<3;k++) + { +#pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; +#pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + +#pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Rundenfunktion +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +#pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + ///// + ///// Zweite Runde (wegen Msg-Padding) + ///// +#pragma unroll 8 + for(int k=0;k<8;k++) + regs[k] = hash[k]; + + W1[0] = SWAB32(0x80); +#pragma unroll 14 + for(int k=1;k<15;k++) + W1[k] = 0; + W1[15] = 512; + +// Progress W1 +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +// Progress W2...W3 +#pragma unroll 3 + for(int k=0;k<3;k++) + { +#pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; +#pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + +#pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Rundenfunktion +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +#pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + //// FERTIG + +#pragma unroll 8 + for(int k=0;k<8;k++) + message[k] = SWAB32(hash[k]); +} + +#define SPH_C32(x) 
((uint32_t)(x ## U))
+#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+
+#define PC32up(j, r) ((uint32_t)((j) + (r)))
+#define PC32dn(j, r) 0
+#define QC32up(j, r) 0xFFFFFFFF
+#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
+
+#define B32_0(x) __byte_perm(x, 0, 0x4440)
+//((x) & 0xFF)
+#define B32_1(x) __byte_perm(x, 0, 0x4441)
+//(((x) >> 8) & 0xFF)
+#define B32_2(x) __byte_perm(x, 0, 0x4442)
+//(((x) >> 16) & 0xFF)
+#define B32_3(x) __byte_perm(x, 0, 0x4443)
+//((x) >> 24)
+
+#if MAXWELL_OR_FERMI
+#define USE_SHARED 1
+// Maxwell and Fermi cards get the best speed with SHARED access it seems.
+#if USE_SHARED
+#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
+#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
+#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
+#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
+#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
+#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
+#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
+#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
+#else
+#define T0up(x) tex1Dfetch(t0up1, x)
+#define T0dn(x) tex1Dfetch(t0dn1, x)
+#define T1up(x) tex1Dfetch(t1up1, x)
+#define T1dn(x) tex1Dfetch(t1dn1, x)
+#define T2up(x) tex1Dfetch(t2up1, x)
+#define T2dn(x) tex1Dfetch(t2dn1, x)
+#define T3up(x) tex1Dfetch(t3up1, x)
+#define T3dn(x) tex1Dfetch(t3dn1, x)
+#endif
+#else
+#define USE_SHARED 1
+// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
+#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
+#define T0dn(x) tex1Dfetch(t0dn1, x)
+#define T1up(x) tex1Dfetch(t1up1, x)
+#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
+#define T2up(x) tex1Dfetch(t2up1, x)
+#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
+#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
+#define T3dn(x) tex1Dfetch(t3dn1, x)
+#endif
+
+texture<unsigned int, 1, cudaReadModeElementType> t0up1;
+texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
+texture<unsigned int, 1, cudaReadModeElementType> t1up1;
+texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
+texture<unsigned int, 1, cudaReadModeElementType> t2up1;
+texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
+texture<unsigned int, 1, cudaReadModeElementType> t3up1;
+texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
+
+extern uint32_t T0up_cpu[];
+extern uint32_t T0dn_cpu[];
+extern uint32_t T1up_cpu[];
+extern uint32_t T1dn_cpu[];
+extern uint32_t T2up_cpu[];
+extern uint32_t T2dn_cpu[];
+extern uint32_t T3up_cpu[];
+extern uint32_t T3dn_cpu[];
+
+#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+
+__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs)
+{
+ uint32_t t[32];
+
+//#pragma unroll 14
+ for(int r=0;r<14;r++)
+ {
+ switch(r)
+ {
+ case 0:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
+ case 1:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
+ case 2:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
+ case 3:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
+ case 4:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
+ case 5:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
+ case 6:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
+ case 7:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
+ case 8:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
+ case 9:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
+ case 10:
+#pragma unroll 16
+ for(int 
k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break; + case 11: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break; + case 12: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break; + case 13: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break; + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); + uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); + uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); + uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); + + t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ + T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); + + t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ + T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs) +{ +//#pragma unroll 14 + for(int r=0;r<14;r++) + { + uint32_t t[32]; + + switch(r) + { + case 0: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; + case 1: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; + case 2: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; + case 3: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break; + case 4: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break; + case 5: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break; + case 6: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break; + case 7: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break; + case 8: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break; + case 9: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break; + case 10: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break; + case 11: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break; + case 12: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break; + case 13: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break; + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); + uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); + uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); + uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); + + t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ 
T2up( t10_2 ) ^ T3up( t22_3 ) ^ + T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); + + t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ + T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__global__ void +myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) +{ +#if USE_SHARED + extern __shared__ char mixtabs[]; + + if (threadIdx.x < 256) + { + *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); + *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); + *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); + *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); + *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); + *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); + } + + __syncthreads(); +#endif + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // GROESTL + uint32_t message[32]; + uint32_t state[32]; + +#pragma unroll 32 + for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k]; + + uint32_t nounce = startNounce + thread; + message[19] = SWAB32(nounce); + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] = message[u]; + state[31] ^= 0x20000; + + // Perm +#if USE_SHARED + myriadgroestl_perm_P(state, mixtabs); + state[31] ^= 0x20000; + myriadgroestl_perm_Q(message, mixtabs); +#else + myriadgroestl_perm_P(state, NULL); + state[31] ^= 0x20000; + myriadgroestl_perm_Q(message, NULL); +#endif +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + +#pragma unroll 32 + for(int u=0;u<32;u++) message[u] = state[u]; + +#if USE_SHARED + myriadgroestl_perm_P(message, mixtabs); +#else + myriadgroestl_perm_P(message, NULL); +#endif + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + + uint32_t out_state[16]; +#pragma unroll 16 + for(int u=0;u<16;u++) out_state[u] = state[u+16]; + myriadgroestl_gpu_sha256(out_state); + + int i, position = -1; + bool rc = true; + +#pragma unroll 8 + for (i = 7; i >= 0; i--) { + if (out_state[i] > pTarget[i]) { + if(position < i) { + position = i; + rc = false; + } + } + if (out_state[i] < pTarget[i]) { + if(position < i) { + position = i; + rc = true; + } + } + } + + if(rc == true) + if(resNounce[0] > nounce) + resNounce[0] = nounce; + } +} + +#define texDef(texname, texmem, texsource, texsize) \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + +// Setup-Funktionen +__host__ void myriadgroestl_cpu_init(int thr_id, int threads) +{ + cudaSetDevice(device_map[thr_id]); + + cudaMemcpyToSymbol( myr_sha256_gpu_hashTable, + myr_sha256_cpu_hashTable, + sizeof(uint32_t) * 8 ); + + cudaMemcpyToSymbol( myr_sha256_gpu_constantTable, + myr_sha256_cpu_constantTable, + sizeof(uint32_t) * 64 ); + + cudaGetDeviceProperties(&props, device_map[thr_id]); + + // Texturen mit obigem Makro initialisieren + texDef(t0up1, d_T0up, T0up_cpu, 
sizeof(uint32_t)*256);
+ texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
+ texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
+ texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
+ texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
+ texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
+ texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
+ texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+
+ // allocate memory for the winning nonce
+ cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
+}
+
+__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
+{
+ // expand and set the message
+ uint32_t msgBlock[32];
+
+ memset(msgBlock, 0, sizeof(uint32_t) * 32);
+ memcpy(&msgBlock[0], data, 80);
+
+ // extend the message to a full message block (padding);
+ // our message is 80 bytes long
+ msgBlock[20] = 0x80;
+ msgBlock[31] = 0x01000000;
+
+ // groestl512 needs no CPU code for this (the single round is
+ // executed on the GPU)
+
+ // set the block header (the correct nonce and the Hefty hash are still missing in there)
+ cudaMemcpyToSymbol( myriadgroestl_gpu_msg,
+ msgBlock,
+ 128);
+
+ cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+ cudaMemcpyToSymbol( pTarget,
+ pTargetIn,
+ sizeof(uint32_t) * 8 );
+}
+
+__host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
+{
+ // Compute 3.x and 5.x devices run best with 768 threads,
+ // all others with 512 threads.
+ int threadsperblock = (props.major >= 3) ? 768 : 512;
+
+ // compute how many thread blocks we need
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ // size of the dynamic shared memory region
+#if USE_SHARED
+ size_t shared_size = 8 * 256 * sizeof(uint32_t);
+#else
+ size_t shared_size = 0;
+#endif
+
+// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+ //fprintf(stderr, "ThrID: %d\n", thr_id);
+ cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+ myriadgroestl_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+
+ // strategic sleep command to lower the CPU load
+ MyStreamSynchronize(NULL, 0, thr_id);
+
+ cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+}
diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp
new file mode 100644
index 0000000..4b2a231
--- /dev/null
+++ b/myriadgroestl.cpp
@@ -0,0 +1,106 @@
+#include "uint256.h"
+#include "sph/sph_groestl.h"
+
+#include "cpuminer-config.h"
+#include "miner.h"
+
+#include
+#include
+#include <openssl/sha.h> /* assumed: SHA256_CTX below requires this header */
+
+extern bool opt_benchmark;
+
+void myriadgroestl_cpu_init(int thr_id, int threads);
+void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
+void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
+
+#define SWAP32(x) \
+ ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
+ (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+static void myriadhash(void *state, const void *input)
+{
+ sph_groestl512_context ctx_groestl;
+
+ uint32_t hashA[16], hashB[16];
+
+ sph_groestl512_init(&ctx_groestl);
+ sph_groestl512 (&ctx_groestl, input, 80);
+ sph_groestl512_close(&ctx_groestl, hashA);
+
+ SHA256_CTX sha256;
+ SHA256_Init(&sha256);
+ SHA256_Update(&sha256,(unsigned char *)hashA, 64);
+ SHA256_Final((unsigned char *)hashB, &sha256);
+ memcpy(state, hashB, 32);
+}
+
+extern "C" 
int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t start_nonce = pdata[19]++; + const uint32_t throughPut = 128 * 1024; +// const uint32_t throughPut = 1; + uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); + + // TODO: entfernen für eine Release! Ist nur zum Testen! + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + const uint32_t Htarg = ptarget[7]; + + // init + static bool init[8] = { false, false, false, false, false, false, false, false }; + if(!init[thr_id]) + { +#if BIG_DEBUG +#else + myriadgroestl_cpu_init(thr_id, throughPut); +#endif + init[thr_id] = true; + } + + uint32_t endiandata[32]; + for (int kk=0; kk < 32; kk++) + be32enc(&endiandata[kk], pdata[kk]); + + // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) + myriadgroestl_cpu_setBlock(thr_id, endiandata, (void*)ptarget); + + do { + // GPU + uint32_t foundNounce = 0xFFFFFFFF; + + myriadgroestl_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); + + if(foundNounce < 0xffffffff) + { + uint32_t tmpHash[8]; + endiandata[19] = SWAP32(foundNounce); + myriadhash(tmpHash, endiandata); + if (tmpHash[7] <= Htarg && + fulltest(tmpHash, ptarget)) { + pdata[19] = foundNounce; + *hashes_done = foundNounce - start_nonce; + free(outputHash); + return true; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); + } + + foundNounce = 0xffffffff; + } + + if (pdata[19] + throughPut < pdata[19]) + pdata[19] = max_nonce; + else pdata[19] += throughPut; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - start_nonce; + free(outputHash); + return 0; +} + diff --git a/quark/.deps/.dirstamp b/quark/.deps/.dirstamp new file mode 100644 index 0000000..e69de29 diff --git a/quark/.dirstamp b/quark/.dirstamp new file mode 100644 index 0000000..e69de29 diff --git a/quark/cuda_quark_checkhash.cu b/quark/cuda_quark_checkhash.cu new file mode 100644 index 0000000..b80da04 --- /dev/null +++ b/quark/cuda_quark_checkhash.cu @@ -0,0 +1,107 @@ +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// das Hash Target gegen das wir testen sollen +__constant__ uint32_t pTarget[8]; + +uint32_t *d_resNounce[8]; +uint32_t *h_resNounce[8]; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +__global__ void quark_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread);
+
+ int hashPosition = nounce - startNounce;
+ uint32_t *inpHash = &g_hash[16 * hashPosition];
+
+ uint32_t hash[8];
+#pragma unroll 8
+ for (int i=0; i < 8; i++)
+ hash[i] = inpHash[i];
+
+ // compare the hash word-for-word against the target
+ int i, position = -1;
+ bool rc = true;
+
+#pragma unroll 8
+ for (i = 7; i >= 0; i--) {
+ if (hash[i] > pTarget[i]) {
+ if(position < i) {
+ position = i;
+ rc = false;
+ }
+ }
+ if (hash[i] < pTarget[i]) {
+ if(position < i) {
+ position = i;
+ rc = true;
+ }
+ }
+ }
+
+ if(rc == true)
+ if(resNounce[0] > nounce)
+ resNounce[0] = nounce;
+ }
+}
+
+// setup functions
+__host__ void quark_check_cpu_init(int thr_id, int threads)
+{
+ cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
+ cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
+}
+
+// set the target difficulty
+__host__ void quark_check_cpu_setTarget(const void *ptarget)
+{
+ // the target for the computation on the GPU
+ cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+}
+
+__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+{
+ uint32_t result = 0xffffffff;
+ cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
+
+ const int threadsperblock = 256;
+
+ // compute how many thread blocks we need
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ // size of the dynamic shared memory region
+ size_t shared_size = 0;
+
+// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+
+ quark_check_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+
+ // strategic sleep command to lower the CPU load
+ MyStreamSynchronize(NULL, order, thr_id);
+
+ // copy the result to the host (into page-locked memory, which is faster)
+ cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+ // cudaMemcpy() is asynchronous!
+ cudaThreadSynchronize();
+ result = *h_resNounce[thr_id];
+
+ return result;
+}
diff --git a/sph/aes_helper.c b/sph/aes_helper.c
new file mode 100644
index 0000000..75b7cc6
--- /dev/null
+++ b/sph/aes_helper.c
@@ -0,0 +1,392 @@
+/* $Id: aes_helper.c 220 2010-06-09 09:21:50Z tp $ */
+/*
+ * AES tables. This file is not meant to be compiled by itself; it
+ * is included by some hash function implementations. It contains
+ * the precomputed tables and helper macros for evaluating an AES
+ * round, optionally with a final XOR with a subkey.
+ *
+ * By default, this file defines the tables and macros for little-endian
+ * processing (i.e. it is assumed that the input bytes have been read
+ * from memory and assembled with the little-endian convention). If
+ * the 'AES_BIG_ENDIAN' macro is defined (to a non-zero integer value)
+ * when this file is included, then the tables and macros for big-endian
+ * processing are defined instead. The big-endian tables and macros have
+ * names distinct from the little-endian tables and macros, hence it is
+ * possible to have both simultaneously, by including this file twice
+ * (with and without the AES_BIG_ENDIAN macro). 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include "sph_types.h" +#ifdef __cplusplus +extern "C"{ +#endif +#if AES_BIG_ENDIAN + +#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) + +#define AES0 AES0_BE +#define AES1 AES1_BE +#define AES2 AES2_BE +#define AES3 AES3_BE + +#define AES_ROUND_BE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \ + (Y0) = AES0[((X0) >> 24) & 0xFF] \ + ^ AES1[((X1) >> 16) & 0xFF] \ + ^ AES2[((X2) >> 8) & 0xFF] \ + ^ AES3[(X3) & 0xFF] ^ (K0); \ + (Y1) = AES0[((X1) >> 24) & 0xFF] \ + ^ AES1[((X2) >> 16) & 0xFF] \ + ^ AES2[((X3) >> 8) & 0xFF] \ + ^ AES3[(X0) & 0xFF] ^ (K1); \ + (Y2) = AES0[((X2) >> 24) & 0xFF] \ + ^ AES1[((X3) >> 16) & 0xFF] \ + ^ AES2[((X0) >> 8) & 0xFF] \ + ^ AES3[(X1) & 0xFF] ^ (K2); \ + (Y3) = AES0[((X3) >> 24) & 0xFF] \ + ^ AES1[((X0) >> 16) & 0xFF] \ + ^ AES2[((X1) >> 8) & 0xFF] \ + ^ AES3[(X2) & 0xFF] ^ (K3); \ + } while (0) + +#define AES_ROUND_NOKEY_BE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + AES_ROUND_BE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) + +#else + +#define AESx(x) SPH_C32(x) +#define AES0 AES0_LE +#define AES1 AES1_LE +#define AES2 AES2_LE +#define AES3 AES3_LE + +#define AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \ + (Y0) = AES0[(X0) & 0xFF] \ + ^ AES1[((X1) >> 8) & 0xFF] \ + ^ AES2[((X2) >> 16) & 0xFF] \ + ^ AES3[((X3) >> 24) & 0xFF] ^ (K0); \ + (Y1) = AES0[(X1) & 0xFF] \ + ^ AES1[((X2) >> 8) & 0xFF] \ + ^ AES2[((X3) >> 16) & 0xFF] \ + ^ AES3[((X0) >> 24) & 0xFF] ^ (K1); \ + (Y2) = AES0[(X2) & 0xFF] \ + ^ AES1[((X3) >> 8) & 0xFF] \ + ^ AES2[((X0) >> 16) & 0xFF] \ + ^ AES3[((X1) >> 24) & 0xFF] ^ (K2); \ + (Y3) = AES0[(X3) & 0xFF] \ + ^ AES1[((X0) >> 8) & 0xFF] \ + ^ AES2[((X1) >> 16) & 0xFF] \ + ^ AES3[((X2) >> 24) & 0xFF] ^ (K3); \ + } while (0) + +#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) + +#endif + +/* + * The AES*[] tables allow us to perform a fast evaluation of an AES + * round; table AESi[] combines SubBytes for a byte at row i, and + * MixColumns for the column where that byte goes after 
ShiftRows. + */ + +static const sph_u32 AES0[256] = { + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), + AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), 
AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +static const sph_u32 AES1[256] = { + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), 
AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +static const sph_u32 AES2[256] = { + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + 
AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + 
AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +static const sph_u32 AES3[256] = { + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), + AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), 
AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + +#ifdef __cplusplus +} +#endif diff --git a/blake.c b/sph/blake.c similarity index 100% rename from blake.c rename to sph/blake.c diff --git a/sph/bmw.c b/sph/bmw.c new file mode 100644 index 0000000..718191d --- /dev/null +++ b/sph/bmw.c @@ -0,0 +1,957 @@ +/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * BMW implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> +#include <limits.h> + +#include "sph_bmw.h" + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW +#define SPH_SMALL_FOOTPRINT_BMW 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 IV224[] = { + SPH_C32(0x00010203), SPH_C32(0x04050607), + SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F), + SPH_C32(0x10111213), SPH_C32(0x14151617), + SPH_C32(0x18191A1B), SPH_C32(0x1C1D1E1F), + SPH_C32(0x20212223), SPH_C32(0x24252627), + SPH_C32(0x28292A2B), SPH_C32(0x2C2D2E2F), + SPH_C32(0x30313233), SPH_C32(0x34353637), + SPH_C32(0x38393A3B), SPH_C32(0x3C3D3E3F) +}; + +static const sph_u32 IV256[] = { + SPH_C32(0x40414243), SPH_C32(0x44454647), + SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F), + SPH_C32(0x50515253), SPH_C32(0x54555657), + SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F), + SPH_C32(0x60616263), SPH_C32(0x64656667), + SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F), + SPH_C32(0x70717273), SPH_C32(0x74757677), + SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F) +}; + +#if SPH_64 + +static const sph_u64 IV384[] = { + SPH_C64(0x0001020304050607), SPH_C64(0x08090A0B0C0D0E0F), + SPH_C64(0x1011121314151617), SPH_C64(0x18191A1B1C1D1E1F), + SPH_C64(0x2021222324252627), SPH_C64(0x28292A2B2C2D2E2F), + SPH_C64(0x3031323334353637), SPH_C64(0x38393A3B3C3D3E3F), + SPH_C64(0x4041424344454647), SPH_C64(0x48494A4B4C4D4E4F), + SPH_C64(0x5051525354555657), SPH_C64(0x58595A5B5C5D5E5F), + SPH_C64(0x6061626364656667), SPH_C64(0x68696A6B6C6D6E6F), + SPH_C64(0x7071727374757677), SPH_C64(0x78797A7B7C7D7E7F) +}; + +static const sph_u64 IV512[] = { + SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), + SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), + SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF), + SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF), + SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF), + SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF), + SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF), + SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) +}; + +#endif + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#define LPAR ( + +#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 +#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 +#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 +#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 +#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 +#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 +#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 +#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 +#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 +#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 +#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + +#define M16_16 0, 1, 3, 4, 7, 10, 11 +#define M16_17 1, 2, 4, 5, 8, 11, 12
+#define M16_18 2, 3, 5, 6, 9, 12, 13 +#define M16_19 3, 4, 6, 7, 10, 13, 14 +#define M16_20 4, 5, 7, 8, 11, 14, 15 +#define M16_21 5, 6, 8, 9, 12, 15, 16 +#define M16_22 6, 7, 9, 10, 13, 0, 1 +#define M16_23 7, 8, 10, 11, 14, 1, 2 +#define M16_24 8, 9, 11, 12, 15, 2, 3 +#define M16_25 9, 10, 12, 13, 0, 3, 4 +#define M16_26 10, 11, 13, 14, 1, 4, 5 +#define M16_27 11, 12, 14, 15, 2, 5, 6 +#define M16_28 12, 13, 15, 16, 3, 6, 7 +#define M16_29 13, 14, 0, 1, 4, 7, 8 +#define M16_30 14, 15, 1, 2, 5, 8, 9 +#define M16_31 15, 16, 2, 3, 6, 9, 10 + +#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \ + ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19)) +#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \ + ^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23)) +#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \ + ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25)) +#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \ + ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29)) +#define ss4(x) (((x) >> 1) ^ (x)) +#define ss5(x) (((x) >> 2) ^ (x)) +#define rs1(x) SPH_ROTL32(x, 3) +#define rs2(x) SPH_ROTL32(x, 7) +#define rs3(x) SPH_ROTL32(x, 13) +#define rs4(x) SPH_ROTL32(x, 16) +#define rs5(x) SPH_ROTL32(x, 19) +#define rs6(x) SPH_ROTL32(x, 23) +#define rs7(x) SPH_ROTL32(x, 27) + +#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555)) + +#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ + (SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \ + - SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m)) + +#define expand1s_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \ + + ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \ + + ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \ + + ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \ + + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand1s(qf, mf, hf, i16) \ + expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand1s_(qf, mf, hf, i16, ix, iy) \ + expand1s_inner LPAR qf, mf, hf, i16, ix, iy) + +#define expand2s_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \ + + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \ + + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \ + + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \ + + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand2s(qf, mf, hf, i16) \ + expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand2s_(qf, mf, hf, i16, ix, iy) \ + expand2s_inner LPAR qf, mf, hf, i16, ix, iy) + +#if SPH_64 + +#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \ + ^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37)) +#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \ + ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43)) +#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \ + ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53)) +#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \ + ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59)) +#define sb4(x) (((x) >> 1) ^ (x)) +#define sb5(x) (((x) >> 2) ^ (x)) +#define rb1(x) SPH_ROTL64(x, 5) +#define rb2(x) SPH_ROTL64(x, 11) +#define rb3(x) SPH_ROTL64(x, 27) +#define rb4(x) SPH_ROTL64(x, 32) +#define rb5(x) SPH_ROTL64(x, 37) +#define rb6(x) SPH_ROTL64(x, 43) +#define rb7(x) SPH_ROTL64(x, 53) + +#define Kb(j) SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555)) + +#if 
SPH_SMALL_FOOTPRINT_BMW + +static const sph_u64 Kb_tab[] = { + Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23), + Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31) +}; + +#define rol_off(mf, j, off) \ + SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1) + +#define add_elt_b(mf, hf, j) \ + (SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \ + - rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15)) + +#define expand1b(qf, mf, hf, i) \ + SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \ + + sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \ + + sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \ + + sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \ + + sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \ + + sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \ + + sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \ + + sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \ + + add_elt_b(mf, hf, (i) - 16)) + +#define expand2b(qf, mf, hf, i) \ + SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \ + + qf((i) - 14) + rb2(qf((i) - 13)) \ + + qf((i) - 12) + rb3(qf((i) - 11)) \ + + qf((i) - 10) + rb4(qf((i) - 9)) \ + + qf((i) - 8) + rb5(qf((i) - 7)) \ + + qf((i) - 6) + rb6(qf((i) - 5)) \ + + qf((i) - 4) + rb7(qf((i) - 3)) \ + + sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \ + + add_elt_b(mf, hf, (i) - 16)) + +#else + +#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ + (SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \ + - SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m)) + +#define expand1b_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \ + + sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \ + + sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \ + + sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \ + + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand1b(qf, mf, hf, i16) \ + expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand1b_(qf, mf, hf, i16, ix, iy) \ + expand1b_inner LPAR qf, mf, hf, i16, ix, iy) + +#define expand2b_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \ + + qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \ + + qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \ + + qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \ + + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand2b(qf, mf, hf, i16) \ + expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand2b_(qf, mf, hf, i16, ix, iy) \ + expand2b_inner LPAR qf, mf, hf, i16, ix, iy) + +#endif + +#endif + +#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \ + tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \ + op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4))) + +#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) +#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) +#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15) +#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13) +#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14) +#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15) +#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13) +#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14) +#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15) +#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14) +#define Ws10 MAKE_W(SPH_T32, 8, 
-, 1, -, 4, -, 7, +, 15) +#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9) +#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10) +#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11) +#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12) +#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13) + +#if SPH_SMALL_FOOTPRINT_BMW + +#define MAKE_Qas do { \ + unsigned u; \ + sph_u32 Ws[16]; \ + Ws[ 0] = Ws0; \ + Ws[ 1] = Ws1; \ + Ws[ 2] = Ws2; \ + Ws[ 3] = Ws3; \ + Ws[ 4] = Ws4; \ + Ws[ 5] = Ws5; \ + Ws[ 6] = Ws6; \ + Ws[ 7] = Ws7; \ + Ws[ 8] = Ws8; \ + Ws[ 9] = Ws9; \ + Ws[10] = Ws10; \ + Ws[11] = Ws11; \ + Ws[12] = Ws12; \ + Ws[13] = Ws13; \ + Ws[14] = Ws14; \ + Ws[15] = Ws15; \ + for (u = 0; u < 15; u += 5) { \ + qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \ + qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \ + qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \ + qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \ + qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \ + } \ + qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \ + } while (0) + +#define MAKE_Qbs do { \ + qt[16] = expand1s(Qs, M, H, 16); \ + qt[17] = expand1s(Qs, M, H, 17); \ + qt[18] = expand2s(Qs, M, H, 18); \ + qt[19] = expand2s(Qs, M, H, 19); \ + qt[20] = expand2s(Qs, M, H, 20); \ + qt[21] = expand2s(Qs, M, H, 21); \ + qt[22] = expand2s(Qs, M, H, 22); \ + qt[23] = expand2s(Qs, M, H, 23); \ + qt[24] = expand2s(Qs, M, H, 24); \ + qt[25] = expand2s(Qs, M, H, 25); \ + qt[26] = expand2s(Qs, M, H, 26); \ + qt[27] = expand2s(Qs, M, H, 27); \ + qt[28] = expand2s(Qs, M, H, 28); \ + qt[29] = expand2s(Qs, M, H, 29); \ + qt[30] = expand2s(Qs, M, H, 30); \ + qt[31] = expand2s(Qs, M, H, 31); \ + } while (0) + +#else + +#define MAKE_Qas do { \ + qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \ + qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \ + qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \ + qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \ + qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \ + qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \ + qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \ + qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \ + qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \ + qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \ + qt[10] = SPH_T32(ss0(Ws10) + H(11)); \ + qt[11] = SPH_T32(ss1(Ws11) + H(12)); \ + qt[12] = SPH_T32(ss2(Ws12) + H(13)); \ + qt[13] = SPH_T32(ss3(Ws13) + H(14)); \ + qt[14] = SPH_T32(ss4(Ws14) + H(15)); \ + qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \ + } while (0) + +#define MAKE_Qbs do { \ + qt[16] = expand1s(Qs, M, H, 16); \ + qt[17] = expand1s(Qs, M, H, 17); \ + qt[18] = expand2s(Qs, M, H, 18); \ + qt[19] = expand2s(Qs, M, H, 19); \ + qt[20] = expand2s(Qs, M, H, 20); \ + qt[21] = expand2s(Qs, M, H, 21); \ + qt[22] = expand2s(Qs, M, H, 22); \ + qt[23] = expand2s(Qs, M, H, 23); \ + qt[24] = expand2s(Qs, M, H, 24); \ + qt[25] = expand2s(Qs, M, H, 25); \ + qt[26] = expand2s(Qs, M, H, 26); \ + qt[27] = expand2s(Qs, M, H, 27); \ + qt[28] = expand2s(Qs, M, H, 28); \ + qt[29] = expand2s(Qs, M, H, 29); \ + qt[30] = expand2s(Qs, M, H, 30); \ + qt[31] = expand2s(Qs, M, H, 31); \ + } while (0) + +#endif + +#define MAKE_Qs do { \ + MAKE_Qas; \ + MAKE_Qbs; \ + } while (0) + +#define Qs(j) (qt[j]) + +#if SPH_64 + +#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14) +#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15) +#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15) +#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13) +#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14) +#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15) +#define Wb6 MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, 
+, 13) +#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14) +#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15) +#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14) +#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15) +#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9) +#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10) +#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11) +#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12) +#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13) + +#if SPH_SMALL_FOOTPRINT_BMW + +#define MAKE_Qab do { \ + unsigned u; \ + sph_u64 Wb[16]; \ + Wb[ 0] = Wb0; \ + Wb[ 1] = Wb1; \ + Wb[ 2] = Wb2; \ + Wb[ 3] = Wb3; \ + Wb[ 4] = Wb4; \ + Wb[ 5] = Wb5; \ + Wb[ 6] = Wb6; \ + Wb[ 7] = Wb7; \ + Wb[ 8] = Wb8; \ + Wb[ 9] = Wb9; \ + Wb[10] = Wb10; \ + Wb[11] = Wb11; \ + Wb[12] = Wb12; \ + Wb[13] = Wb13; \ + Wb[14] = Wb14; \ + Wb[15] = Wb15; \ + for (u = 0; u < 15; u += 5) { \ + qt[u + 0] = SPH_T64(sb0(Wb[u + 0]) + H(u + 1)); \ + qt[u + 1] = SPH_T64(sb1(Wb[u + 1]) + H(u + 2)); \ + qt[u + 2] = SPH_T64(sb2(Wb[u + 2]) + H(u + 3)); \ + qt[u + 3] = SPH_T64(sb3(Wb[u + 3]) + H(u + 4)); \ + qt[u + 4] = SPH_T64(sb4(Wb[u + 4]) + H(u + 5)); \ + } \ + qt[15] = SPH_T64(sb0(Wb[15]) + H(0)); \ + } while (0) + +#define MAKE_Qbb do { \ + unsigned u; \ + for (u = 16; u < 18; u ++) \ + qt[u] = expand1b(Qb, M, H, u); \ + for (u = 18; u < 32; u ++) \ + qt[u] = expand2b(Qb, M, H, u); \ + } while (0) + +#else + +#define MAKE_Qab do { \ + qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \ + qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \ + qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \ + qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \ + qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \ + qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \ + qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \ + qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \ + qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \ + qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \ + qt[10] = SPH_T64(sb0(Wb10) + H(11)); \ + qt[11] = SPH_T64(sb1(Wb11) + H(12)); \ + qt[12] = SPH_T64(sb2(Wb12) + H(13)); \ + qt[13] = SPH_T64(sb3(Wb13) + H(14)); \ + qt[14] = SPH_T64(sb4(Wb14) + H(15)); \ + qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \ + } while (0) + +#define MAKE_Qbb do { \ + qt[16] = expand1b(Qb, M, H, 16); \ + qt[17] = expand1b(Qb, M, H, 17); \ + qt[18] = expand2b(Qb, M, H, 18); \ + qt[19] = expand2b(Qb, M, H, 19); \ + qt[20] = expand2b(Qb, M, H, 20); \ + qt[21] = expand2b(Qb, M, H, 21); \ + qt[22] = expand2b(Qb, M, H, 22); \ + qt[23] = expand2b(Qb, M, H, 23); \ + qt[24] = expand2b(Qb, M, H, 24); \ + qt[25] = expand2b(Qb, M, H, 25); \ + qt[26] = expand2b(Qb, M, H, 26); \ + qt[27] = expand2b(Qb, M, H, 27); \ + qt[28] = expand2b(Qb, M, H, 28); \ + qt[29] = expand2b(Qb, M, H, 29); \ + qt[30] = expand2b(Qb, M, H, 30); \ + qt[31] = expand2b(Qb, M, H, 31); \ + } while (0) + +#endif + +#define MAKE_Qb do { \ + MAKE_Qab; \ + MAKE_Qbb; \ + } while (0) + +#define Qb(j) (qt[j]) + +#endif + +#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \ + type qt[32], xl, xh; \ + mkQ; \ + xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \ + ^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \ + xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \ + ^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \ + dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \ + + (xl ^ qf(24) ^ qf( 0))); \ + dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \ + + (xl ^ qf(25) ^ qf( 1))); \ + dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \ + + (xl ^ qf(26) ^ qf( 2))); \ + dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \ + + (xl ^ qf(27) ^ qf( 3))); \ + dhf( 4) = tt(((xh >> 3) ^ (qf(20) 
<< 0) ^ mf( 4)) \ + + (xl ^ qf(28) ^ qf( 4))); \ + dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \ + + (xl ^ qf(29) ^ qf( 5))); \ + dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \ + + (xl ^ qf(30) ^ qf( 6))); \ + dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \ + + (xl ^ qf(31) ^ qf( 7))); \ + dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \ + + ((xl << 8) ^ qf(23) ^ qf( 8))); \ + dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \ + + ((xl >> 6) ^ qf(16) ^ qf( 9))); \ + dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \ + + ((xl << 6) ^ qf(17) ^ qf(10))); \ + dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \ + + ((xl << 4) ^ qf(18) ^ qf(11))); \ + dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \ + + ((xl >> 3) ^ qf(19) ^ qf(12))); \ + dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \ + + ((xl >> 4) ^ qf(20) ^ qf(13))); \ + dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \ + + ((xl >> 7) ^ qf(21) ^ qf(14))); \ + dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \ + + ((xl >> 2) ^ qf(22) ^ qf(15))); \ + } while (0) + +#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH) + +#if SPH_64 + +#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH) + +#endif + +static void +compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16]) +{ +#if SPH_LITTLE_FAST +#define M(x) sph_dec32le_aligned(data + 4 * (x)) +#else + sph_u32 mv[16]; + + mv[ 0] = sph_dec32le_aligned(data + 0); + mv[ 1] = sph_dec32le_aligned(data + 4); + mv[ 2] = sph_dec32le_aligned(data + 8); + mv[ 3] = sph_dec32le_aligned(data + 12); + mv[ 4] = sph_dec32le_aligned(data + 16); + mv[ 5] = sph_dec32le_aligned(data + 20); + mv[ 6] = sph_dec32le_aligned(data + 24); + mv[ 7] = sph_dec32le_aligned(data + 28); + mv[ 8] = sph_dec32le_aligned(data + 32); + mv[ 9] = sph_dec32le_aligned(data + 36); + mv[10] = sph_dec32le_aligned(data + 40); + mv[11] = sph_dec32le_aligned(data + 44); + mv[12] = sph_dec32le_aligned(data + 48); + mv[13] = sph_dec32le_aligned(data + 52); + mv[14] = sph_dec32le_aligned(data + 56); + mv[15] = sph_dec32le_aligned(data + 60); +#define M(x) (mv[x]) +#endif +#define H(x) (h[x]) +#define dH(x) (dh[x]) + + FOLDs; + +#undef M +#undef H +#undef dH +} + +static const sph_u32 final_s[16] = { + SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2), + SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5), + SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8), + SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab), + SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae), + SPH_C32(0xaaaaaaaf) +}; + +static void +bmw32_init(sph_bmw_small_context *sc, const sph_u32 *iv) +{ + memcpy(sc->H, iv, sizeof sc->H); + sc->ptr = 0; +#if SPH_64 + sc->bit_count = 0; +#else + sc->bit_count_high = 0; + sc->bit_count_low = 0; +#endif +} + +static void +bmw32(sph_bmw_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + sph_u32 htmp[16]; + sph_u32 *h1, *h2; +#if !SPH_64 + sph_u32 tmp; +#endif + +#if SPH_64 + sc->bit_count += (sph_u64)len << 3; +#else + tmp = sc->bit_count_low; + sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3)); + if (sc->bit_count_low < tmp) + sc->bit_count_high ++; + sc->bit_count_high += len >> 29; +#endif + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + 
len -= clen; + ptr += clen; + if (ptr == sizeof sc->buf) { + sph_u32 *ht; + + compress_small(buf, h1, h2); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +static void +bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr, u, v; + unsigned z; + sph_u32 h1[16], h2[16], *h; + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + h = sc->H; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + compress_small(buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr); +#if SPH_64 + sph_enc64le_aligned(buf + (sizeof sc->buf) - 8, + SPH_T64(sc->bit_count + n)); +#else + sph_enc32le_aligned(buf + (sizeof sc->buf) - 8, + sc->bit_count_low + n); + sph_enc32le_aligned(buf + (sizeof sc->buf) - 4, + SPH_T32(sc->bit_count_high)); +#endif + compress_small(buf, h, h2); + for (u = 0; u < 16; u ++) + sph_enc32le_aligned(buf + 4 * u, h2[u]); + compress_small(buf, final_s, h1); + out = dst; + for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) + sph_enc32le(out + 4 * u, h1[v]); +} + +#if SPH_64 + +static void +compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) +{ +#if SPH_LITTLE_FAST +#define M(x) sph_dec64le_aligned(data + 8 * (x)) +#else + sph_u64 mv[16]; + + mv[ 0] = sph_dec64le_aligned(data + 0); + mv[ 1] = sph_dec64le_aligned(data + 8); + mv[ 2] = sph_dec64le_aligned(data + 16); + mv[ 3] = sph_dec64le_aligned(data + 24); + mv[ 4] = sph_dec64le_aligned(data + 32); + mv[ 5] = sph_dec64le_aligned(data + 40); + mv[ 6] = sph_dec64le_aligned(data + 48); + mv[ 7] = sph_dec64le_aligned(data + 56); + mv[ 8] = sph_dec64le_aligned(data + 64); + mv[ 9] = sph_dec64le_aligned(data + 72); + mv[10] = sph_dec64le_aligned(data + 80); + mv[11] = sph_dec64le_aligned(data + 88); + mv[12] = sph_dec64le_aligned(data + 96); + mv[13] = sph_dec64le_aligned(data + 104); + mv[14] = sph_dec64le_aligned(data + 112); + mv[15] = sph_dec64le_aligned(data + 120); +#define M(x) (mv[x]) +#endif +#define H(x) (h[x]) +#define dH(x) (dh[x]) + + FOLDb; + +#undef M +#undef H +#undef dH +} + +static const sph_u64 final_b[16] = { + SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1), + SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3), + SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5), + SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7), + SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9), + SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab), + SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad), + SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf) +}; + +static void +bmw64_init(sph_bmw_big_context *sc, const sph_u64 *iv) +{ + memcpy(sc->H, iv, sizeof sc->H); + sc->ptr = 0; + sc->bit_count = 0; +} + +static void +bmw64(sph_bmw_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + sph_u64 htmp[16]; + sph_u64 *h1, *h2; + + sc->bit_count += (sph_u64)len << 3; + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + ptr += clen; + if (ptr == sizeof sc->buf) { + sph_u64 *ht; + + compress_big(buf, h1, h2); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr 
= ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +static void +bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w64) +{ + unsigned char *buf, *out; + size_t ptr, u, v; + unsigned z; + sph_u64 h1[16], h2[16], *h; + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + h = sc->H; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + compress_big(buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr); + sph_enc64le_aligned(buf + (sizeof sc->buf) - 8, + SPH_T64(sc->bit_count + n)); + compress_big(buf, h, h2); + for (u = 0; u < 16; u ++) + sph_enc64le_aligned(buf + 8 * u, h2[u]); + compress_big(buf, final_b, h1); + out = dst; + for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) + sph_enc64le(out + 8 * u, h1[v]); +} + +#endif + +/* see sph_bmw.h */ +void +sph_bmw224_init(void *cc) +{ + bmw32_init(cc, IV224); +} + +/* see sph_bmw.h */ +void +sph_bmw224(void *cc, const void *data, size_t len) +{ + bmw32(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw224_close(void *cc, void *dst) +{ + sph_bmw224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw32_close(cc, ub, n, dst, 7); + sph_bmw224_init(cc); +} + +/* see sph_bmw.h */ +void +sph_bmw256_init(void *cc) +{ + bmw32_init(cc, IV256); +} + +/* see sph_bmw.h */ +void +sph_bmw256(void *cc, const void *data, size_t len) +{ + bmw32(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw256_close(void *cc, void *dst) +{ + sph_bmw256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw32_close(cc, ub, n, dst, 8); + sph_bmw256_init(cc); +} + +#if SPH_64 + +/* see sph_bmw.h */ +void +sph_bmw384_init(void *cc) +{ + bmw64_init(cc, IV384); +} + +/* see sph_bmw.h */ +void +sph_bmw384(void *cc, const void *data, size_t len) +{ + bmw64(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw384_close(void *cc, void *dst) +{ + sph_bmw384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw64_close(cc, ub, n, dst, 6); + sph_bmw384_init(cc); +} + +/* see sph_bmw.h */ +void +sph_bmw512_init(void *cc) +{ + bmw64_init(cc, IV512); +} + +/* see sph_bmw.h */ +void +sph_bmw512(void *cc, const void *data, size_t len) +{ + bmw64(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw512_close(void *cc, void *dst) +{ + sph_bmw512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw64_close(cc, ub, n, dst, 8); + sph_bmw512_init(cc); +} + +#endif diff --git a/sph/cubehash.c b/sph/cubehash.c new file mode 100644 index 0000000..9322fe1 --- /dev/null +++ b/sph/cubehash.c @@ -0,0 +1,723 @@ +/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * CubeHash implementation. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> +#include <limits.h> + +#include "sph_cubehash.h" +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_CUBEHASH +#define SPH_SMALL_FOOTPRINT_CUBEHASH 1 +#endif + +/* + * Some tests were conducted on an Intel Core2 Q6600 (32-bit and 64-bit + * mode), a PowerPC G3, and a MIPS-compatible CPU (Broadcom BCM3302). + * It appears that the optimal settings are: + * -- full unroll, no state copy on the "big" systems (x86, PowerPC) + * -- unroll to 4 or 8, state copy on the "small" system (MIPS) + */ + +#if SPH_SMALL_FOOTPRINT_CUBEHASH + +#if !defined SPH_CUBEHASH_UNROLL +#define SPH_CUBEHASH_UNROLL 4 +#endif +#if !defined SPH_CUBEHASH_NOCOPY +#define SPH_CUBEHASH_NOCOPY 1 +#endif + +#else + +#if !defined SPH_CUBEHASH_UNROLL +#define SPH_CUBEHASH_UNROLL 0 +#endif +#if !defined SPH_CUBEHASH_NOCOPY +#define SPH_CUBEHASH_NOCOPY 0 +#endif + +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 IV224[] = { + SPH_C32(0xB0FC8217), SPH_C32(0x1BEE1A90), SPH_C32(0x829E1A22), + SPH_C32(0x6362C342), SPH_C32(0x24D91C30), SPH_C32(0x03A7AA24), + SPH_C32(0xA63721C8), SPH_C32(0x85B0E2EF), SPH_C32(0xF35D13F3), + SPH_C32(0x41DA807D), SPH_C32(0x21A70CA6), SPH_C32(0x1F4E9774), + SPH_C32(0xB3E1C932), SPH_C32(0xEB0A79A8), SPH_C32(0xCDDAAA66), + SPH_C32(0xE2F6ECAA), SPH_C32(0x0A713362), SPH_C32(0xAA3080E0), + SPH_C32(0xD8F23A32), SPH_C32(0xCEF15E28), SPH_C32(0xDB086314), + SPH_C32(0x7F709DF7), SPH_C32(0xACD228A4), SPH_C32(0x704D6ECE), + SPH_C32(0xAA3EC95F), SPH_C32(0xE387C214), SPH_C32(0x3A6445FF), + SPH_C32(0x9CAB81C3), SPH_C32(0xC73D4B98), SPH_C32(0xD277AEBE), + SPH_C32(0xFD20151C), SPH_C32(0x00CB573E) +}; + +static const sph_u32 IV256[] = { + SPH_C32(0xEA2BD4B4), SPH_C32(0xCCD6F29F), SPH_C32(0x63117E71), + SPH_C32(0x35481EAE), SPH_C32(0x22512D5B), SPH_C32(0xE5D94E63), + SPH_C32(0x7E624131), SPH_C32(0xF4CC12BE), SPH_C32(0xC2D0B696), + SPH_C32(0x42AF2070), SPH_C32(0xD0720C35), SPH_C32(0x3361DA8C), + SPH_C32(0x28CCECA4), SPH_C32(0x8EF8AD83), SPH_C32(0x4680AC00), + SPH_C32(0x40E5FBAB), SPH_C32(0xD89041C3), SPH_C32(0x6107FBD5), + SPH_C32(0x6C859D41), SPH_C32(0xF0B26679), SPH_C32(0x09392549), + SPH_C32(0x5FA25603),
SPH_C32(0x65C892FD), SPH_C32(0x93CB6285), + SPH_C32(0x2AF2B5AE), SPH_C32(0x9E4B4E60), SPH_C32(0x774ABFDD), + SPH_C32(0x85254725), SPH_C32(0x15815AEB), SPH_C32(0x4AB6AAD6), + SPH_C32(0x9CDAF8AF), SPH_C32(0xD6032C0A) +}; + +static const sph_u32 IV384[] = { + SPH_C32(0xE623087E), SPH_C32(0x04C00C87), SPH_C32(0x5EF46453), + SPH_C32(0x69524B13), SPH_C32(0x1A05C7A9), SPH_C32(0x3528DF88), + SPH_C32(0x6BDD01B5), SPH_C32(0x5057B792), SPH_C32(0x6AA7A922), + SPH_C32(0x649C7EEE), SPH_C32(0xF426309F), SPH_C32(0xCB629052), + SPH_C32(0xFC8E20ED), SPH_C32(0xB3482BAB), SPH_C32(0xF89E5E7E), + SPH_C32(0xD83D4DE4), SPH_C32(0x44BFC10D), SPH_C32(0x5FC1E63D), + SPH_C32(0x2104E6CB), SPH_C32(0x17958F7F), SPH_C32(0xDBEAEF70), + SPH_C32(0xB4B97E1E), SPH_C32(0x32C195F6), SPH_C32(0x6184A8E4), + SPH_C32(0x796C2543), SPH_C32(0x23DE176D), SPH_C32(0xD33BBAEC), + SPH_C32(0x0C12E5D2), SPH_C32(0x4EB95A7B), SPH_C32(0x2D18BA01), + SPH_C32(0x04EE475F), SPH_C32(0x1FC5F22E) +}; + +static const sph_u32 IV512[] = { + SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B), + SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C), + SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787), + SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537), + SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33), + SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485), + SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159), + SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9), + SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456), + SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1), + SPH_C32(0x7795D246), SPH_C32(0xD43E3B44) +}; + +#define T32 SPH_T32 +#define ROTL32 SPH_ROTL32 + +#if SPH_CUBEHASH_NOCOPY + +#define DECL_STATE +#define READ_STATE(cc) +#define WRITE_STATE(cc) + +#define x0 ((sc)->state[ 0]) +#define x1 ((sc)->state[ 1]) +#define x2 ((sc)->state[ 2]) +#define x3 ((sc)->state[ 3]) +#define x4 ((sc)->state[ 4]) +#define x5 ((sc)->state[ 5]) +#define x6 ((sc)->state[ 6]) +#define x7 ((sc)->state[ 7]) +#define x8 ((sc)->state[ 8]) +#define x9 ((sc)->state[ 9]) +#define xa ((sc)->state[10]) +#define xb ((sc)->state[11]) +#define xc ((sc)->state[12]) +#define xd ((sc)->state[13]) +#define xe ((sc)->state[14]) +#define xf ((sc)->state[15]) +#define xg ((sc)->state[16]) +#define xh ((sc)->state[17]) +#define xi ((sc)->state[18]) +#define xj ((sc)->state[19]) +#define xk ((sc)->state[20]) +#define xl ((sc)->state[21]) +#define xm ((sc)->state[22]) +#define xn ((sc)->state[23]) +#define xo ((sc)->state[24]) +#define xp ((sc)->state[25]) +#define xq ((sc)->state[26]) +#define xr ((sc)->state[27]) +#define xs ((sc)->state[28]) +#define xt ((sc)->state[29]) +#define xu ((sc)->state[30]) +#define xv ((sc)->state[31]) + +#else + +#define DECL_STATE \ + sph_u32 x0, x1, x2, x3, x4, x5, x6, x7; \ + sph_u32 x8, x9, xa, xb, xc, xd, xe, xf; \ + sph_u32 xg, xh, xi, xj, xk, xl, xm, xn; \ + sph_u32 xo, xp, xq, xr, xs, xt, xu, xv; + +#define READ_STATE(cc) do { \ + x0 = (cc)->state[ 0]; \ + x1 = (cc)->state[ 1]; \ + x2 = (cc)->state[ 2]; \ + x3 = (cc)->state[ 3]; \ + x4 = (cc)->state[ 4]; \ + x5 = (cc)->state[ 5]; \ + x6 = (cc)->state[ 6]; \ + x7 = (cc)->state[ 7]; \ + x8 = (cc)->state[ 8]; \ + x9 = (cc)->state[ 9]; \ + xa = (cc)->state[10]; \ + xb = (cc)->state[11]; \ + xc = (cc)->state[12]; \ + xd = (cc)->state[13]; \ + xe = (cc)->state[14]; \ + xf = (cc)->state[15]; \ + xg = (cc)->state[16]; \ + xh = (cc)->state[17]; \ + xi = (cc)->state[18]; \ + xj = (cc)->state[19]; \ 
+ xk = (cc)->state[20]; \ + xl = (cc)->state[21]; \ + xm = (cc)->state[22]; \ + xn = (cc)->state[23]; \ + xo = (cc)->state[24]; \ + xp = (cc)->state[25]; \ + xq = (cc)->state[26]; \ + xr = (cc)->state[27]; \ + xs = (cc)->state[28]; \ + xt = (cc)->state[29]; \ + xu = (cc)->state[30]; \ + xv = (cc)->state[31]; \ + } while (0) + +#define WRITE_STATE(cc) do { \ + (cc)->state[ 0] = x0; \ + (cc)->state[ 1] = x1; \ + (cc)->state[ 2] = x2; \ + (cc)->state[ 3] = x3; \ + (cc)->state[ 4] = x4; \ + (cc)->state[ 5] = x5; \ + (cc)->state[ 6] = x6; \ + (cc)->state[ 7] = x7; \ + (cc)->state[ 8] = x8; \ + (cc)->state[ 9] = x9; \ + (cc)->state[10] = xa; \ + (cc)->state[11] = xb; \ + (cc)->state[12] = xc; \ + (cc)->state[13] = xd; \ + (cc)->state[14] = xe; \ + (cc)->state[15] = xf; \ + (cc)->state[16] = xg; \ + (cc)->state[17] = xh; \ + (cc)->state[18] = xi; \ + (cc)->state[19] = xj; \ + (cc)->state[20] = xk; \ + (cc)->state[21] = xl; \ + (cc)->state[22] = xm; \ + (cc)->state[23] = xn; \ + (cc)->state[24] = xo; \ + (cc)->state[25] = xp; \ + (cc)->state[26] = xq; \ + (cc)->state[27] = xr; \ + (cc)->state[28] = xs; \ + (cc)->state[29] = xt; \ + (cc)->state[30] = xu; \ + (cc)->state[31] = xv; \ + } while (0) + +#endif + +#define INPUT_BLOCK do { \ + x0 ^= sph_dec32le_aligned(buf + 0); \ + x1 ^= sph_dec32le_aligned(buf + 4); \ + x2 ^= sph_dec32le_aligned(buf + 8); \ + x3 ^= sph_dec32le_aligned(buf + 12); \ + x4 ^= sph_dec32le_aligned(buf + 16); \ + x5 ^= sph_dec32le_aligned(buf + 20); \ + x6 ^= sph_dec32le_aligned(buf + 24); \ + x7 ^= sph_dec32le_aligned(buf + 28); \ + } while (0) + +#define ROUND_EVEN do { \ + xg = T32(x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = T32(x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = T32(x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = T32(x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = T32(x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = T32(x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = T32(x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = T32(x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = T32(x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = T32(x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = T32(xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = T32(xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = T32(xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = T32(xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = T32(xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = T32(xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = T32(x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = T32(x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = T32(xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = T32(xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = T32(xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = T32(xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = T32(xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = T32(xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = T32(x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = T32(x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = T32(x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = T32(x3 + xp); \ + x3 = ROTL32(x3, 11); \ + xu = T32(x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = T32(x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = T32(x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = T32(x7 + xt); \ + x7 = ROTL32(x7, 11); \ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ 
+ x2 ^= xs; \ + x3 ^= xt; \ + } while (0) + +#define ROUND_ODD do { \ + xj = T32(xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = T32(xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = T32(xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = T32(xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = T32(x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = T32(x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = T32(xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = T32(xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = T32(x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = T32(x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = T32(x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = T32(x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = T32(x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = T32(x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = T32(x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = T32(x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = T32(x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = T32(x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = T32(x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = T32(x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = T32(x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = T32(x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = T32(x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = T32(x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = T32(xc + xp); \ + xc = ROTL32(xc, 11); \ + xo = T32(xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = T32(xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = T32(xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = T32(x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = T32(x9 + xs); \ + x9 = ROTL32(x9, 11); \ + xv = T32(xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = T32(xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; \ + } while (0) + +/* + * There is no need to unroll all 16 rounds. The word-swapping permutation + * is an involution, so we need to unroll an even number of rounds. On + * "big" systems, unrolling 4 rounds yields about 97% of the speed + * achieved with full unrolling; and it keeps the code more compact + * for small architectures. 
+ */ + +#if SPH_CUBEHASH_UNROLL == 2 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#elif SPH_CUBEHASH_UNROLL == 4 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 4; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#elif SPH_CUBEHASH_UNROLL == 8 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 2; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#else + +#define SIXTEEN_ROUNDS do { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } while (0) + +#endif + +static void +cubehash_init(sph_cubehash_context *sc, const sph_u32 *iv) +{ + memcpy(sc->state, iv, sizeof sc->state); + sc->ptr = 0; +} + +static void +cubehash_core(sph_cubehash_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INPUT_BLOCK; + SIXTEEN_ROUNDS; + ptr = 0; + } + } + WRITE_STATE(sc); + sc->ptr = ptr; +} + +static void +cubehash_close(sph_cubehash_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE(sc); + INPUT_BLOCK; + for (i = 0; i < 11; i ++) { + SIXTEEN_ROUNDS; + if (i == 0) + xv ^= SPH_C32(1); + } + WRITE_STATE(sc); + out = dst; + for (z = 0; z < out_size_w32; z ++) + sph_enc32le(out + (z << 2), sc->state[z]); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_init(void *cc) +{ + cubehash_init(cc, IV224); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_close(void *cc, void *dst) +{ + sph_cubehash224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 7); + sph_cubehash224_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_init(void *cc) +{ + cubehash_init(cc, IV256); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_close(void *cc, void *dst) +{ + sph_cubehash256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 8); + sph_cubehash256_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384_init(void *cc) +{ + cubehash_init(cc, IV384); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384(void *cc, const void 
*data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash384_close(void *cc, void *dst)
+{
+	sph_cubehash384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 12);
+	sph_cubehash384_init(cc);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash512_init(void *cc)
+{
+	cubehash_init(cc, IV512);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash512(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash512_close(void *cc, void *dst)
+{
+	sph_cubehash512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 16);
+	sph_cubehash512_init(cc);
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/sph/echo.c b/sph/echo.c
new file mode 100644
index 0000000..667e3f3
--- /dev/null
+++ b/sph/echo.c
@@ -0,0 +1,1031 @@
+/* $Id: echo.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * ECHO implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_echo.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_ECHO
+#define SPH_SMALL_FOOTPRINT_ECHO 1
+#endif
+
+/*
+ * Some measures tend to show that the 64-bit implementation offers
+ * better performance only on "true" 64-bit architectures, those which
+ * have actual 64-bit registers.
+ */
+#if !defined SPH_ECHO_64 && SPH_64_TRUE
+#define SPH_ECHO_64 1
+#endif
+
+/*
+ * We can use a 64-bit implementation only if a 64-bit type is available.
+ */ +#if !SPH_64 +#undef SPH_ECHO_64 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define T32 SPH_T32 +#define C32 SPH_C32 +#if SPH_64 +#define C64 SPH_C64 +#endif + +#define AES_BIG_ENDIAN 0 +#include "aes_helper.c" + +#if SPH_ECHO_64 + +#define DECL_STATE_SMALL \ + sph_u64 W[16][2]; + +#define DECL_STATE_BIG \ + sph_u64 W[16][2]; + +#define INPUT_BLOCK_SMALL(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vb, 8 * sizeof(sph_u64)); \ + for (u = 0; u < 12; u ++) { \ + W[u + 4][0] = sph_dec64le_aligned( \ + sc->buf + 16 * u); \ + W[u + 4][1] = sph_dec64le_aligned( \ + sc->buf + 16 * u + 8); \ + } \ + } while (0) + +#define INPUT_BLOCK_BIG(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vb, 16 * sizeof(sph_u64)); \ + for (u = 0; u < 8; u ++) { \ + W[u + 8][0] = sph_dec64le_aligned( \ + sc->buf + 16 * u); \ + W[u + 8][1] = sph_dec64le_aligned( \ + sc->buf + 16 * u + 8); \ + } \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +aes_2rounds_all(sph_u64 W[16][2], + sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3) +{ + int n; + sph_u32 K0 = *pK0; + sph_u32 K1 = *pK1; + sph_u32 K2 = *pK2; + sph_u32 K3 = *pK3; + + for (n = 0; n < 16; n ++) { + sph_u64 Wl = W[n][0]; + sph_u64 Wh = W[n][1]; + sph_u32 X0 = (sph_u32)Wl; + sph_u32 X1 = (sph_u32)(Wl >> 32); + sph_u32 X2 = (sph_u32)Wh; + sph_u32 X3 = (sph_u32)(Wh >> 32); + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + W[n][0] = (sph_u64)X0 | ((sph_u64)X1 << 32); + W[n][1] = (sph_u64)X2 | ((sph_u64)X3 << 32); + if ((K0 = T32(K0 + 1)) == 0) { + if ((K1 = T32(K1 + 1)) == 0) + if ((K2 = T32(K2 + 1)) == 0) + K3 = T32(K3 + 1); + } + } + *pK0 = K0; + *pK1 = K1; + *pK2 = K2; + *pK3 = K3; +} + +#define BIG_SUB_WORDS do { \ + aes_2rounds_all(W, &K0, &K1, &K2, &K3); \ + } while (0) + +#else + +#define AES_2ROUNDS(X) do { \ + sph_u32 X0 = (sph_u32)(X[0]); \ + sph_u32 X1 = (sph_u32)(X[0] >> 32); \ + sph_u32 X2 = (sph_u32)(X[1]); \ + sph_u32 X3 = (sph_u32)(X[1] >> 32); \ + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); \ + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); \ + X[0] = (sph_u64)X0 | ((sph_u64)X1 << 32); \ + X[1] = (sph_u64)X2 | ((sph_u64)X3 << 32); \ + if ((K0 = T32(K0 + 1)) == 0) { \ + if ((K1 = T32(K1 + 1)) == 0) \ + if ((K2 = T32(K2 + 1)) == 0) \ + K3 = T32(K3 + 1); \ + } \ + } while (0) + +#define BIG_SUB_WORDS do { \ + AES_2ROUNDS(W[ 0]); \ + AES_2ROUNDS(W[ 1]); \ + AES_2ROUNDS(W[ 2]); \ + AES_2ROUNDS(W[ 3]); \ + AES_2ROUNDS(W[ 4]); \ + AES_2ROUNDS(W[ 5]); \ + AES_2ROUNDS(W[ 6]); \ + AES_2ROUNDS(W[ 7]); \ + AES_2ROUNDS(W[ 8]); \ + AES_2ROUNDS(W[ 9]); \ + AES_2ROUNDS(W[10]); \ + AES_2ROUNDS(W[11]); \ + AES_2ROUNDS(W[12]); \ + AES_2ROUNDS(W[13]); \ + AES_2ROUNDS(W[14]); \ + AES_2ROUNDS(W[15]); \ + } while (0) + +#endif + +#define SHIFT_ROW1(a, b, c, d) do { \ + sph_u64 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[b][0]; \ + W[b][0] = W[c][0]; \ + W[c][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[b][1]; \ + W[b][1] = W[c][1]; \ + W[c][1] = W[d][1]; \ + W[d][1] = tmp; \ + } while (0) + +#define SHIFT_ROW2(a, b, c, d) do { \ + sph_u64 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[c][0]; \ + W[c][0] = tmp; \ + tmp = W[b][0]; \ + W[b][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[c][1]; \ + W[c][1] = tmp; \ + tmp = W[b][1]; \ + W[b][1] = W[d][1]; \ + W[d][1] = tmp; \ + } while (0) + +#define SHIFT_ROW3(a, b, c, d) SHIFT_ROW1(d, c, b, a) + 
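For reference: the nested-if increment inside AES_2ROUNDS above steps the AES round key, which in this implementation is simply a 128-bit counter split across the four 32-bit words K0..K3 (K0 least significant); a carry propagates into a higher word only when the word below it wraps to zero. The same cascade reappears in the INCR_COUNTER macro further down, where C0..C3 count message bits. A minimal standalone sketch of that carry chain, assuming only that T32 truncates to 32 bits as SPH_T32 does (illustrative, not part of the patch itself):

#include <stdint.h>
#include <stdio.h>

#define T32(x) ((uint32_t)((x) & 0xFFFFFFFFu))

/* 128-bit counter held as four 32-bit words, K[0] least significant.
 * Mirrors the carry chain in AES_2ROUNDS: a higher word is touched
 * only when the word below it has just wrapped to zero. */
static void incr128(uint32_t K[4])
{
	if ((K[0] = T32(K[0] + 1)) == 0) {
		if ((K[1] = T32(K[1] + 1)) == 0)
			if ((K[2] = T32(K[2] + 1)) == 0)
				K[3] = T32(K[3] + 1);
	}
}

int main(void)
{
	uint32_t K[4] = { 0xFFFFFFFFu, 0xFFFFFFFFu, 0, 0 };

	incr128(K); /* carry ripples through K[0] and K[1] into K[2] */
	printf("%08x %08x %08x %08x\n", K[3], K[2], K[1], K[0]);
	/* prints: 00000000 00000001 00000000 00000000 */
	return 0;
}
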
+#define BIG_SHIFT_ROWS do { \ + SHIFT_ROW1(1, 5, 9, 13); \ + SHIFT_ROW2(2, 6, 10, 14); \ + SHIFT_ROW3(3, 7, 11, 15); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +mix_column(sph_u64 W[16][2], int ia, int ib, int ic, int id) +{ + int n; + + for (n = 0; n < 2; n ++) { + sph_u64 a = W[ia][n]; + sph_u64 b = W[ib][n]; + sph_u64 c = W[ic][n]; + sph_u64 d = W[id][n]; + sph_u64 ab = a ^ b; + sph_u64 bc = b ^ c; + sph_u64 cd = c ^ d; + sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U + ^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); + sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U + ^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); + sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U + ^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); + W[ia][n] = abx ^ bc ^ d; + W[ib][n] = bcx ^ a ^ cd; + W[ic][n] = cdx ^ ab ^ d; + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; + } +} + +#define MIX_COLUMN(a, b, c, d) mix_column(W, a, b, c, d) + +#else + +#define MIX_COLUMN1(ia, ib, ic, id, n) do { \ + sph_u64 a = W[ia][n]; \ + sph_u64 b = W[ib][n]; \ + sph_u64 c = W[ic][n]; \ + sph_u64 d = W[id][n]; \ + sph_u64 ab = a ^ b; \ + sph_u64 bc = b ^ c; \ + sph_u64 cd = c ^ d; \ + sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + W[ia][n] = abx ^ bc ^ d; \ + W[ib][n] = bcx ^ a ^ cd; \ + W[ic][n] = cdx ^ ab ^ d; \ + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \ + } while (0) + +#define MIX_COLUMN(a, b, c, d) do { \ + MIX_COLUMN1(a, b, c, d, 0); \ + MIX_COLUMN1(a, b, c, d, 1); \ + } while (0) + +#endif + +#define BIG_MIX_COLUMNS do { \ + MIX_COLUMN(0, 1, 2, 3); \ + MIX_COLUMN(4, 5, 6, 7); \ + MIX_COLUMN(8, 9, 10, 11); \ + MIX_COLUMN(12, 13, 14, 15); \ + } while (0) + +#define BIG_ROUND do { \ + BIG_SUB_WORDS; \ + BIG_SHIFT_ROWS; \ + BIG_MIX_COLUMNS; \ + } while (0) + +#define FINAL_SMALL do { \ + unsigned u; \ + sph_u64 *VV = &sc->u.Vb[0][0]; \ + sph_u64 *WW = &W[0][0]; \ + for (u = 0; u < 8; u ++) { \ + VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \ + ^ sph_dec64le_aligned(sc->buf + (u * 8) + 64) \ + ^ sph_dec64le_aligned(sc->buf + (u * 8) + 128) \ + ^ WW[u] ^ WW[u + 8] \ + ^ WW[u + 16] ^ WW[u + 24]; \ + } \ + } while (0) + +#define FINAL_BIG do { \ + unsigned u; \ + sph_u64 *VV = &sc->u.Vb[0][0]; \ + sph_u64 *WW = &W[0][0]; \ + for (u = 0; u < 16; u ++) { \ + VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \ + ^ WW[u] ^ WW[u + 16]; \ + } \ + } while (0) + +#define COMPRESS_SMALL(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_SMALL(sc); \ + for (u = 0; u < 8; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_SMALL; \ + } while (0) + +#define COMPRESS_BIG(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_BIG(sc); \ + for (u = 0; u < 10; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_BIG; \ + } while (0) + +#else + +#define DECL_STATE_SMALL \ + sph_u32 W[16][4]; + +#define DECL_STATE_BIG \ + sph_u32 W[16][4]; + +#define INPUT_BLOCK_SMALL(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vs, 16 * sizeof(sph_u32)); \ + for (u = 0; u < 12; u ++) { \ + W[u + 4][0] = sph_dec32le_aligned( \ + sc->buf + 16 * u); \ + W[u + 4][1] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 4); \ + W[u + 4][2] = 
sph_dec32le_aligned( \ + sc->buf + 16 * u + 8); \ + W[u + 4][3] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 12); \ + } \ + } while (0) + +#define INPUT_BLOCK_BIG(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vs, 32 * sizeof(sph_u32)); \ + for (u = 0; u < 8; u ++) { \ + W[u + 8][0] = sph_dec32le_aligned( \ + sc->buf + 16 * u); \ + W[u + 8][1] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 4); \ + W[u + 8][2] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 8); \ + W[u + 8][3] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 12); \ + } \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +aes_2rounds_all(sph_u32 W[16][4], + sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3) +{ + int n; + sph_u32 K0 = *pK0; + sph_u32 K1 = *pK1; + sph_u32 K2 = *pK2; + sph_u32 K3 = *pK3; + + for (n = 0; n < 16; n ++) { + sph_u32 *X = W[n]; + sph_u32 Y0, Y1, Y2, Y3; + AES_ROUND_LE(X[0], X[1], X[2], X[3], + K0, K1, K2, K3, Y0, Y1, Y2, Y3); + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); + if ((K0 = T32(K0 + 1)) == 0) { + if ((K1 = T32(K1 + 1)) == 0) + if ((K2 = T32(K2 + 1)) == 0) + K3 = T32(K3 + 1); + } + } + *pK0 = K0; + *pK1 = K1; + *pK2 = K2; + *pK3 = K3; +} + +#define BIG_SUB_WORDS do { \ + aes_2rounds_all(W, &K0, &K1, &K2, &K3); \ + } while (0) + +#else + +#define AES_2ROUNDS(X) do { \ + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X[0], X[1], X[2], X[3], \ + K0, K1, K2, K3, Y0, Y1, Y2, Y3); \ + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); \ + if ((K0 = T32(K0 + 1)) == 0) { \ + if ((K1 = T32(K1 + 1)) == 0) \ + if ((K2 = T32(K2 + 1)) == 0) \ + K3 = T32(K3 + 1); \ + } \ + } while (0) + +#define BIG_SUB_WORDS do { \ + AES_2ROUNDS(W[ 0]); \ + AES_2ROUNDS(W[ 1]); \ + AES_2ROUNDS(W[ 2]); \ + AES_2ROUNDS(W[ 3]); \ + AES_2ROUNDS(W[ 4]); \ + AES_2ROUNDS(W[ 5]); \ + AES_2ROUNDS(W[ 6]); \ + AES_2ROUNDS(W[ 7]); \ + AES_2ROUNDS(W[ 8]); \ + AES_2ROUNDS(W[ 9]); \ + AES_2ROUNDS(W[10]); \ + AES_2ROUNDS(W[11]); \ + AES_2ROUNDS(W[12]); \ + AES_2ROUNDS(W[13]); \ + AES_2ROUNDS(W[14]); \ + AES_2ROUNDS(W[15]); \ + } while (0) + +#endif + +#define SHIFT_ROW1(a, b, c, d) do { \ + sph_u32 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[b][0]; \ + W[b][0] = W[c][0]; \ + W[c][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[b][1]; \ + W[b][1] = W[c][1]; \ + W[c][1] = W[d][1]; \ + W[d][1] = tmp; \ + tmp = W[a][2]; \ + W[a][2] = W[b][2]; \ + W[b][2] = W[c][2]; \ + W[c][2] = W[d][2]; \ + W[d][2] = tmp; \ + tmp = W[a][3]; \ + W[a][3] = W[b][3]; \ + W[b][3] = W[c][3]; \ + W[c][3] = W[d][3]; \ + W[d][3] = tmp; \ + } while (0) + +#define SHIFT_ROW2(a, b, c, d) do { \ + sph_u32 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[c][0]; \ + W[c][0] = tmp; \ + tmp = W[b][0]; \ + W[b][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[c][1]; \ + W[c][1] = tmp; \ + tmp = W[b][1]; \ + W[b][1] = W[d][1]; \ + W[d][1] = tmp; \ + tmp = W[a][2]; \ + W[a][2] = W[c][2]; \ + W[c][2] = tmp; \ + tmp = W[b][2]; \ + W[b][2] = W[d][2]; \ + W[d][2] = tmp; \ + tmp = W[a][3]; \ + W[a][3] = W[c][3]; \ + W[c][3] = tmp; \ + tmp = W[b][3]; \ + W[b][3] = W[d][3]; \ + W[d][3] = tmp; \ + } while (0) + +#define SHIFT_ROW3(a, b, c, d) SHIFT_ROW1(d, c, b, a) + +#define BIG_SHIFT_ROWS do { \ + SHIFT_ROW1(1, 5, 9, 13); \ + SHIFT_ROW2(2, 6, 10, 14); \ + SHIFT_ROW3(3, 7, 11, 15); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +mix_column(sph_u32 W[16][4], int ia, int ib, int ic, int id) +{ + int n; + + for (n = 0; n < 4; n ++) { + sph_u32 a = W[ia][n]; + sph_u32 b = W[ib][n]; + sph_u32 c = W[ic][n]; + sph_u32 d = 
W[id][n]; + sph_u32 ab = a ^ b; + sph_u32 bc = b ^ c; + sph_u32 cd = c ^ d; + sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U + ^ ((ab & C32(0x7F7F7F7F)) << 1); + sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U + ^ ((bc & C32(0x7F7F7F7F)) << 1); + sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U + ^ ((cd & C32(0x7F7F7F7F)) << 1); + W[ia][n] = abx ^ bc ^ d; + W[ib][n] = bcx ^ a ^ cd; + W[ic][n] = cdx ^ ab ^ d; + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; + } +} + +#define MIX_COLUMN(a, b, c, d) mix_column(W, a, b, c, d) + +#else + +#define MIX_COLUMN1(ia, ib, ic, id, n) do { \ + sph_u32 a = W[ia][n]; \ + sph_u32 b = W[ib][n]; \ + sph_u32 c = W[ic][n]; \ + sph_u32 d = W[id][n]; \ + sph_u32 ab = a ^ b; \ + sph_u32 bc = b ^ c; \ + sph_u32 cd = c ^ d; \ + sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U \ + ^ ((ab & C32(0x7F7F7F7F)) << 1); \ + sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U \ + ^ ((bc & C32(0x7F7F7F7F)) << 1); \ + sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U \ + ^ ((cd & C32(0x7F7F7F7F)) << 1); \ + W[ia][n] = abx ^ bc ^ d; \ + W[ib][n] = bcx ^ a ^ cd; \ + W[ic][n] = cdx ^ ab ^ d; \ + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \ + } while (0) + +#define MIX_COLUMN(a, b, c, d) do { \ + MIX_COLUMN1(a, b, c, d, 0); \ + MIX_COLUMN1(a, b, c, d, 1); \ + MIX_COLUMN1(a, b, c, d, 2); \ + MIX_COLUMN1(a, b, c, d, 3); \ + } while (0) + +#endif + +#define BIG_MIX_COLUMNS do { \ + MIX_COLUMN(0, 1, 2, 3); \ + MIX_COLUMN(4, 5, 6, 7); \ + MIX_COLUMN(8, 9, 10, 11); \ + MIX_COLUMN(12, 13, 14, 15); \ + } while (0) + +#define BIG_ROUND do { \ + BIG_SUB_WORDS; \ + BIG_SHIFT_ROWS; \ + BIG_MIX_COLUMNS; \ + } while (0) + +#define FINAL_SMALL do { \ + unsigned u; \ + sph_u32 *VV = &sc->u.Vs[0][0]; \ + sph_u32 *WW = &W[0][0]; \ + for (u = 0; u < 16; u ++) { \ + VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \ + ^ sph_dec32le_aligned(sc->buf + (u * 4) + 64) \ + ^ sph_dec32le_aligned(sc->buf + (u * 4) + 128) \ + ^ WW[u] ^ WW[u + 16] \ + ^ WW[u + 32] ^ WW[u + 48]; \ + } \ + } while (0) + +#define FINAL_BIG do { \ + unsigned u; \ + sph_u32 *VV = &sc->u.Vs[0][0]; \ + sph_u32 *WW = &W[0][0]; \ + for (u = 0; u < 32; u ++) { \ + VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \ + ^ WW[u] ^ WW[u + 32]; \ + } \ + } while (0) + +#define COMPRESS_SMALL(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_SMALL(sc); \ + for (u = 0; u < 8; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_SMALL; \ + } while (0) + +#define COMPRESS_BIG(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_BIG(sc); \ + for (u = 0; u < 10; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_BIG; \ + } while (0) + +#endif + +#define INCR_COUNTER(sc, val) do { \ + sc->C0 = T32(sc->C0 + (sph_u32)(val)); \ + if (sc->C0 < (sph_u32)(val)) { \ + if ((sc->C1 = T32(sc->C1 + 1)) == 0) \ + if ((sc->C2 = T32(sc->C2 + 1)) == 0) \ + sc->C3 = T32(sc->C3 + 1); \ + } \ + } while (0) + +static void +echo_small_init(sph_echo_small_context *sc, unsigned out_len) +{ +#if SPH_ECHO_64 + sc->u.Vb[0][0] = (sph_u64)out_len; + sc->u.Vb[0][1] = 0; + sc->u.Vb[1][0] = (sph_u64)out_len; + sc->u.Vb[1][1] = 0; + sc->u.Vb[2][0] = (sph_u64)out_len; + sc->u.Vb[2][1] = 0; + sc->u.Vb[3][0] = (sph_u64)out_len; + sc->u.Vb[3][1] = 0; +#else + sc->u.Vs[0][0] = (sph_u32)out_len; + sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0; + sc->u.Vs[1][0] = (sph_u32)out_len; + sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0; + 
sc->u.Vs[2][0] = (sph_u32)out_len; + sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0; + sc->u.Vs[3][0] = (sph_u32)out_len; + sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0; +#endif + sc->ptr = 0; + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; +} + +static void +echo_big_init(sph_echo_big_context *sc, unsigned out_len) +{ +#if SPH_ECHO_64 + sc->u.Vb[0][0] = (sph_u64)out_len; + sc->u.Vb[0][1] = 0; + sc->u.Vb[1][0] = (sph_u64)out_len; + sc->u.Vb[1][1] = 0; + sc->u.Vb[2][0] = (sph_u64)out_len; + sc->u.Vb[2][1] = 0; + sc->u.Vb[3][0] = (sph_u64)out_len; + sc->u.Vb[3][1] = 0; + sc->u.Vb[4][0] = (sph_u64)out_len; + sc->u.Vb[4][1] = 0; + sc->u.Vb[5][0] = (sph_u64)out_len; + sc->u.Vb[5][1] = 0; + sc->u.Vb[6][0] = (sph_u64)out_len; + sc->u.Vb[6][1] = 0; + sc->u.Vb[7][0] = (sph_u64)out_len; + sc->u.Vb[7][1] = 0; +#else + sc->u.Vs[0][0] = (sph_u32)out_len; + sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0; + sc->u.Vs[1][0] = (sph_u32)out_len; + sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0; + sc->u.Vs[2][0] = (sph_u32)out_len; + sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0; + sc->u.Vs[3][0] = (sph_u32)out_len; + sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0; + sc->u.Vs[4][0] = (sph_u32)out_len; + sc->u.Vs[4][1] = sc->u.Vs[4][2] = sc->u.Vs[4][3] = 0; + sc->u.Vs[5][0] = (sph_u32)out_len; + sc->u.Vs[5][1] = sc->u.Vs[5][2] = sc->u.Vs[5][3] = 0; + sc->u.Vs[6][0] = (sph_u32)out_len; + sc->u.Vs[6][1] = sc->u.Vs[6][2] = sc->u.Vs[6][3] = 0; + sc->u.Vs[7][0] = (sph_u32)out_len; + sc->u.Vs[7][1] = sc->u.Vs[7][2] = sc->u.Vs[7][3] = 0; +#endif + sc->ptr = 0; + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; +} + +static void +echo_small_compress(sph_echo_small_context *sc) +{ + DECL_STATE_SMALL + + COMPRESS_SMALL(sc); +} + +static void +echo_big_compress(sph_echo_big_context *sc) +{ + DECL_STATE_BIG + + COMPRESS_BIG(sc); +} + +static void +echo_small_core(sph_echo_small_context *sc, + const unsigned char *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INCR_COUNTER(sc, 1536); + echo_small_compress(sc); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +echo_big_core(sph_echo_big_context *sc, + const unsigned char *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INCR_COUNTER(sc, 1024); + echo_big_compress(sc); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +echo_small_close(sph_echo_small_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf; + size_t ptr; + unsigned z; + unsigned elen; + union { + unsigned char tmp[32]; + sph_u32 dummy; +#if SPH_ECHO_64 + sph_u64 dummy2; +#endif + } u; +#if SPH_ECHO_64 + sph_u64 *VV; +#else + sph_u32 *VV; +#endif + unsigned k; + + buf = sc->buf; + ptr = sc->ptr; + elen = ((unsigned)ptr << 3) + n; + INCR_COUNTER(sc, elen); + sph_enc32le_aligned(u.tmp, sc->C0); + 
sph_enc32le_aligned(u.tmp + 4, sc->C1); + sph_enc32le_aligned(u.tmp + 8, sc->C2); + sph_enc32le_aligned(u.tmp + 12, sc->C3); + /* + * If elen is zero, then this block actually contains no message + * bit, only the first padding bit. + */ + if (elen == 0) { + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + } + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + if (ptr > ((sizeof sc->buf) - 18)) { + echo_small_compress(sc); + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + memset(buf, 0, sizeof sc->buf); + } + sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); + memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); + echo_small_compress(sc); +#if SPH_ECHO_64 + for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) + sph_enc64le_aligned(u.tmp + (k << 3), VV[k]); +#else + for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++) + sph_enc32le_aligned(u.tmp + (k << 2), VV[k]); +#endif + memcpy(dst, u.tmp, out_size_w32 << 2); + echo_small_init(sc, out_size_w32 << 5); +} + +static void +echo_big_close(sph_echo_big_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf; + size_t ptr; + unsigned z; + unsigned elen; + union { + unsigned char tmp[64]; + sph_u32 dummy; +#if SPH_ECHO_64 + sph_u64 dummy2; +#endif + } u; +#if SPH_ECHO_64 + sph_u64 *VV; +#else + sph_u32 *VV; +#endif + unsigned k; + + buf = sc->buf; + ptr = sc->ptr; + elen = ((unsigned)ptr << 3) + n; + INCR_COUNTER(sc, elen); + sph_enc32le_aligned(u.tmp, sc->C0); + sph_enc32le_aligned(u.tmp + 4, sc->C1); + sph_enc32le_aligned(u.tmp + 8, sc->C2); + sph_enc32le_aligned(u.tmp + 12, sc->C3); + /* + * If elen is zero, then this block actually contains no message + * bit, only the first padding bit. + */ + if (elen == 0) { + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + } + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + if (ptr > ((sizeof sc->buf) - 18)) { + echo_big_compress(sc); + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + memset(buf, 0, sizeof sc->buf); + } + sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); + memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); + echo_big_compress(sc); +#if SPH_ECHO_64 + for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) + sph_enc64le_aligned(u.tmp + (k << 3), VV[k]); +#else + for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++) + sph_enc32le_aligned(u.tmp + (k << 2), VV[k]); +#endif + memcpy(dst, u.tmp, out_size_w32 << 2); + echo_big_init(sc, out_size_w32 << 5); +} + +/* see sph_echo.h */ +void +sph_echo224_init(void *cc) +{ + echo_small_init(cc, 224); +} + +/* see sph_echo.h */ +void +sph_echo224(void *cc, const void *data, size_t len) +{ + echo_small_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo224_close(void *cc, void *dst) +{ + echo_small_close(cc, 0, 0, dst, 7); +} + +/* see sph_echo.h */ +void +sph_echo224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_small_close(cc, ub, n, dst, 7); +} + +/* see sph_echo.h */ +void +sph_echo256_init(void *cc) +{ + echo_small_init(cc, 256); +} + +/* see sph_echo.h */ +void +sph_echo256(void *cc, const void *data, size_t len) +{ + echo_small_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo256_close(void *cc, void *dst) +{ + echo_small_close(cc, 0, 0, dst, 8); +} + +/* see sph_echo.h */ +void +sph_echo256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_small_close(cc, ub, n, dst, 8); +} + +/* see 
sph_echo.h */
+void
+sph_echo384_init(void *cc)
+{
+	echo_big_init(cc, 384);
+}
+
+/* see sph_echo.h */
+void
+sph_echo384(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h */
+void
+sph_echo384_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 12);
+}
+
+/* see sph_echo.h */
+void
+sph_echo384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 12);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512_init(void *cc)
+{
+	echo_big_init(cc, 512);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 16);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 16);
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/fugue.c b/sph/fugue.c
similarity index 100%
rename from fugue.c
rename to sph/fugue.c
diff --git a/groestl.c b/sph/groestl.c
similarity index 100%
rename from groestl.c
rename to sph/groestl.c
diff --git a/sph/jh.c b/sph/jh.c
new file mode 100644
index 0000000..4e26617
--- /dev/null
+++ b/sph/jh.c
@@ -0,0 +1,1107 @@
+/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
+/*
+ * JH implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_jh.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
+#define SPH_SMALL_FOOTPRINT_JH 1
+#endif
+
+#if !defined SPH_JH_64 && SPH_64_TRUE
+#define SPH_JH_64 1
+#endif
+
+#if !SPH_64
+#undef SPH_JH_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal bitslice representation may use either big-endian or
+ * little-endian (true bitslice operations do not care about the bit
+ * ordering, and the bit-swapping linear operations in JH happen to
+ * be invariant through endianness-swapping). The constants must be
+ * defined according to the chosen endianness; we use some
+ * byte-swapping macros for that.
+ */ + +#if SPH_LITTLE_ENDIAN + +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le + +#if SPH_64 +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le +#endif + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#if SPH_64 +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be +#endif + +#endif + +#define Sb(x0, x1, x2, x3, c) do { \ + x3 = ~x3; \ + x0 ^= (c) & ~x2; \ + tmp = (c) ^ (x0 & x1); \ + x0 ^= x2 & x3; \ + x3 ^= ~x1 & x2; \ + x1 ^= x0 & x2; \ + x2 ^= x0 & ~x3; \ + x0 ^= x1 | x3; \ + x3 ^= x1 & x2; \ + x1 ^= tmp & x0; \ + x2 ^= tmp; \ + } while (0) + +#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + x4 ^= x1; \ + x5 ^= x2; \ + x6 ^= x3 ^ x0; \ + x7 ^= x0; \ + x0 ^= x5; \ + x1 ^= x6; \ + x2 ^= x7 ^ x4; \ + x3 ^= x4; \ + } while (0) + +#if SPH_JH_64 + +static const sph_u64 C[] = { + C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557), + C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40), + C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a), + C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231), + C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410), + C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc), + C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0), + C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3), + C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce), + C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23), + C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8), + C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197), + C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95), + C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214), + C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80), + C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4), + C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989), + C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36), + C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7), + C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f), + C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727), + C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b), + C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e), + C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062), + C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984), + C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5), + C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2), + C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f), + C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465), + C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a), + C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1), + C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf), + C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48), + C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0), + C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134), + C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a), + C64e(0x88401d63a06cf615), 
C64e(0x47c1444b8752afff), + C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6), + C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae), + C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567), + C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a), + C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518), + C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446), + C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e), + C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee), + C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001), + C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779), + C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83), + C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a), + C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef), + C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d), + C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65), + C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a), + C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c), + C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d), + C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71), + C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc), + C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0), + C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c), + C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f), + C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751), + C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad), + C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56), + C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6), + C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a), + C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163), + C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826), + C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f), + C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30), + C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a), + C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3), + C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505), + C64e(0xb17681d913326cce), C64e(0x3c175284f805a262), + C64e(0xf42bcbb378471547), C64e(0xff46548223936a48), + C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e), + C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e), + C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd), + C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7), + C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be), + C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de), + C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9), + C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a), + C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b), + C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2) +}; + +#define Ceven_hi(r) (C[((r) << 2) + 0]) +#define Ceven_lo(r) (C[((r) << 2) + 1]) +#define Codd_hi(r) (C[((r) << 2) + 2]) +#define Codd_lo(r) (C[((r) << 2) + 3]) + +#define S(x0, x1, x2, x3, cb, r) do { \ + Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \ + Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \ + } while (0) + +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \ + x4 ## h, x5 ## h, x6 ## h, x7 ## h); \ + Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \ + x4 ## l, x5 ## l, x6 ## l, x7 ## l); \ + } while (0) + +#define Wz(x, c, n) do { \ + sph_u64 t = (x ## h & (c)) << (n); \ + x ## h = ((x ## h >> (n)) & (c)) | t; \ + t = (x ## l & (c)) << (n); \ + x ## l = ((x ## l >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1) +#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2) +#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8) 
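The Wz macros here (continued by W4..W6 just below) implement JH's bit-permutation layer: Wz(x, c, n) swaps every n-bit group selected by the mask c with the group next to it, so W0 exchanges adjacent bits, W1 adjacent bit pairs, W3 adjacent bytes, and so on. Each Wz is an involution. A small self-contained check of one 64-bit lane (an illustrative sketch, not part of the patch itself):

#include <stdint.h>
#include <stdio.h>

/* One 64-bit lane of the Wz macro: swap every n-bit group selected
 * by mask c with the group immediately above it. */
static uint64_t wz(uint64_t x, uint64_t c, int n)
{
	return ((x >> n) & c) | ((x & c) << n);
}

int main(void)
{
	const uint64_t M0 = 0x5555555555555555ULL; /* W0: adjacent bits  */
	const uint64_t M3 = 0x00FF00FF00FF00FFULL; /* W3: adjacent bytes */
	uint64_t x = 0x0123456789abcdefULL;
	uint64_t y = wz(x, M0, 1);

	printf("W0(x)     = %016llx\n", (unsigned long long)y);
	/* Involution: applying the same swap twice restores x. */
	printf("W0(W0(x)) = %016llx\n", (unsigned long long)wz(y, M0, 1));
	/* Byte swap: 0x0123456789abcdef becomes 0x23016745ab89efcd. */
	printf("W3(x)     = %016llx\n", (unsigned long long)wz(x, M3, 8));
	return 0;
}
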
+#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16) +#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32) +#define W6(x) do { \ + sph_u64 t = x ## h; \ + x ## h = x ## l; \ + x ## l = t; \ + } while (0) + +#define DECL_STATE \ + sph_u64 h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \ + sph_u64 h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \ + sph_u64 tmp; + +#define READ_STATE(state) do { \ + h0h = (state)->H.wide[ 0]; \ + h0l = (state)->H.wide[ 1]; \ + h1h = (state)->H.wide[ 2]; \ + h1l = (state)->H.wide[ 3]; \ + h2h = (state)->H.wide[ 4]; \ + h2l = (state)->H.wide[ 5]; \ + h3h = (state)->H.wide[ 6]; \ + h3l = (state)->H.wide[ 7]; \ + h4h = (state)->H.wide[ 8]; \ + h4l = (state)->H.wide[ 9]; \ + h5h = (state)->H.wide[10]; \ + h5l = (state)->H.wide[11]; \ + h6h = (state)->H.wide[12]; \ + h6l = (state)->H.wide[13]; \ + h7h = (state)->H.wide[14]; \ + h7l = (state)->H.wide[15]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->H.wide[ 0] = h0h; \ + (state)->H.wide[ 1] = h0l; \ + (state)->H.wide[ 2] = h1h; \ + (state)->H.wide[ 3] = h1l; \ + (state)->H.wide[ 4] = h2h; \ + (state)->H.wide[ 5] = h2l; \ + (state)->H.wide[ 6] = h3h; \ + (state)->H.wide[ 7] = h3l; \ + (state)->H.wide[ 8] = h4h; \ + (state)->H.wide[ 9] = h4l; \ + (state)->H.wide[10] = h5h; \ + (state)->H.wide[11] = h5l; \ + (state)->H.wide[12] = h6h; \ + (state)->H.wide[13] = h6l; \ + (state)->H.wide[14] = h7h; \ + (state)->H.wide[15] = h7l; \ + } while (0) + +#define INPUT_BUF1 \ + sph_u64 m0h = dec64e_aligned(buf + 0); \ + sph_u64 m0l = dec64e_aligned(buf + 8); \ + sph_u64 m1h = dec64e_aligned(buf + 16); \ + sph_u64 m1l = dec64e_aligned(buf + 24); \ + sph_u64 m2h = dec64e_aligned(buf + 32); \ + sph_u64 m2l = dec64e_aligned(buf + 40); \ + sph_u64 m3h = dec64e_aligned(buf + 48); \ + sph_u64 m3l = dec64e_aligned(buf + 56); \ + h0h ^= m0h; \ + h0l ^= m0l; \ + h1h ^= m1h; \ + h1l ^= m1l; \ + h2h ^= m2h; \ + h2l ^= m2l; \ + h3h ^= m3h; \ + h3l ^= m3l; + +#define INPUT_BUF2 \ + h4h ^= m0h; \ + h4l ^= m0l; \ + h5h ^= m1h; \ + h5l ^= m1l; \ + h6h ^= m2h; \ + h6l ^= m2l; \ + h7h ^= m3h; \ + h7l ^= m3l; + +static const sph_u64 IV224[] = { + C64e(0x2dfedd62f99a98ac), C64e(0xae7cacd619d634e7), + C64e(0xa4831005bc301216), C64e(0xb86038c6c9661494), + C64e(0x66d9899f2580706f), C64e(0xce9ea31b1d9b1adc), + C64e(0x11e8325f7b366e10), C64e(0xf994857f02fa06c1), + C64e(0x1b4f1b5cd8c840b3), C64e(0x97f6a17f6e738099), + C64e(0xdcdf93a5adeaa3d3), C64e(0xa431e8dec9539a68), + C64e(0x22b4a98aec86a1e4), C64e(0xd574ac959ce56cf0), + C64e(0x15960deab5ab2bbf), C64e(0x9611dcf0dd64ea6e) +}; + +static const sph_u64 IV256[] = { + C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1), + C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03), + C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477), + C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8), + C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262), + C64e(0x277695f776248f94), C64e(0x87d5b6574780296c), + C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f), + C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769) +}; + +static const sph_u64 IV384[] = { + C64e(0x481e3bc6d813398a), C64e(0x6d3b5e894ade879b), + C64e(0x63faea68d480ad2e), C64e(0x332ccb21480f8267), + C64e(0x98aec84d9082b928), C64e(0xd455ea3041114249), + C64e(0x36f555b2924847ec), C64e(0xc7250a93baf43ce1), + C64e(0x569b7f8a27db454c), C64e(0x9efcbd496397af0e), + C64e(0x589fc27d26aa80cd), C64e(0x80c08b8c9deb2eda), + C64e(0x8a7981e8f8d5373a), C64e(0xf43967adddd17a71), + C64e(0xa9b4d3bda475d394), C64e(0x976c3fba9842737f) +}; + +static const sph_u64 IV512[] = { + 
C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543), + C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361), + C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80), + C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7), + C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a), + C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199), + C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156), + C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b) +}; + +#else + +static const sph_u32 C[] = { + C32e(0x72d5dea2), C32e(0xdf15f867), C32e(0x7b84150a), + C32e(0xb7231557), C32e(0x81abd690), C32e(0x4d5a87f6), + C32e(0x4e9f4fc5), C32e(0xc3d12b40), C32e(0xea983ae0), + C32e(0x5c45fa9c), C32e(0x03c5d299), C32e(0x66b2999a), + C32e(0x660296b4), C32e(0xf2bb538a), C32e(0xb556141a), + C32e(0x88dba231), C32e(0x03a35a5c), C32e(0x9a190edb), + C32e(0x403fb20a), C32e(0x87c14410), C32e(0x1c051980), + C32e(0x849e951d), C32e(0x6f33ebad), C32e(0x5ee7cddc), + C32e(0x10ba1392), C32e(0x02bf6b41), C32e(0xdc786515), + C32e(0xf7bb27d0), C32e(0x0a2c8139), C32e(0x37aa7850), + C32e(0x3f1abfd2), C32e(0x410091d3), C32e(0x422d5a0d), + C32e(0xf6cc7e90), C32e(0xdd629f9c), C32e(0x92c097ce), + C32e(0x185ca70b), C32e(0xc72b44ac), C32e(0xd1df65d6), + C32e(0x63c6fc23), C32e(0x976e6c03), C32e(0x9ee0b81a), + C32e(0x2105457e), C32e(0x446ceca8), C32e(0xeef103bb), + C32e(0x5d8e61fa), C32e(0xfd9697b2), C32e(0x94838197), + C32e(0x4a8e8537), C32e(0xdb03302f), C32e(0x2a678d2d), + C32e(0xfb9f6a95), C32e(0x8afe7381), C32e(0xf8b8696c), + C32e(0x8ac77246), C32e(0xc07f4214), C32e(0xc5f4158f), + C32e(0xbdc75ec4), C32e(0x75446fa7), C32e(0x8f11bb80), + C32e(0x52de75b7), C32e(0xaee488bc), C32e(0x82b8001e), + C32e(0x98a6a3f4), C32e(0x8ef48f33), C32e(0xa9a36315), + C32e(0xaa5f5624), C32e(0xd5b7f989), C32e(0xb6f1ed20), + C32e(0x7c5ae0fd), C32e(0x36cae95a), C32e(0x06422c36), + C32e(0xce293543), C32e(0x4efe983d), C32e(0x533af974), + C32e(0x739a4ba7), C32e(0xd0f51f59), C32e(0x6f4e8186), + C32e(0x0e9dad81), C32e(0xafd85a9f), C32e(0xa7050667), + C32e(0xee34626a), C32e(0x8b0b28be), C32e(0x6eb91727), + C32e(0x47740726), C32e(0xc680103f), C32e(0xe0a07e6f), + C32e(0xc67e487b), C32e(0x0d550aa5), C32e(0x4af8a4c0), + C32e(0x91e3e79f), C32e(0x978ef19e), C32e(0x86767281), + C32e(0x50608dd4), C32e(0x7e9e5a41), C32e(0xf3e5b062), + C32e(0xfc9f1fec), C32e(0x4054207a), C32e(0xe3e41a00), + C32e(0xcef4c984), C32e(0x4fd794f5), C32e(0x9dfa95d8), + C32e(0x552e7e11), C32e(0x24c354a5), C32e(0x5bdf7228), + C32e(0xbdfe6e28), C32e(0x78f57fe2), C32e(0x0fa5c4b2), + C32e(0x05897cef), C32e(0xee49d32e), C32e(0x447e9385), + C32e(0xeb28597f), C32e(0x705f6937), C32e(0xb324314a), + C32e(0x5e8628f1), C32e(0x1dd6e465), C32e(0xc71b7704), + C32e(0x51b920e7), C32e(0x74fe43e8), C32e(0x23d4878a), + C32e(0x7d29e8a3), C32e(0x927694f2), C32e(0xddcb7a09), + C32e(0x9b30d9c1), C32e(0x1d1b30fb), C32e(0x5bdc1be0), + C32e(0xda24494f), C32e(0xf29c82bf), C32e(0xa4e7ba31), + C32e(0xb470bfff), C32e(0x0d324405), C32e(0xdef8bc48), + C32e(0x3baefc32), C32e(0x53bbd339), C32e(0x459fc3c1), + C32e(0xe0298ba0), C32e(0xe5c905fd), C32e(0xf7ae090f), + C32e(0x94703412), C32e(0x4290f134), C32e(0xa271b701), + C32e(0xe344ed95), C32e(0xe93b8e36), C32e(0x4f2f984a), + C32e(0x88401d63), C32e(0xa06cf615), C32e(0x47c1444b), + C32e(0x8752afff), C32e(0x7ebb4af1), C32e(0xe20ac630), + C32e(0x4670b6c5), C32e(0xcc6e8ce6), C32e(0xa4d5a456), + C32e(0xbd4fca00), C32e(0xda9d844b), C32e(0xc83e18ae), + C32e(0x7357ce45), C32e(0x3064d1ad), C32e(0xe8a6ce68), + C32e(0x145c2567), C32e(0xa3da8cf2), C32e(0xcb0ee116), + C32e(0x33e90658), C32e(0x9a94999a), C32e(0x1f60b220), 
+ C32e(0xc26f847b), C32e(0xd1ceac7f), C32e(0xa0d18518), + C32e(0x32595ba1), C32e(0x8ddd19d3), C32e(0x509a1cc0), + C32e(0xaaa5b446), C32e(0x9f3d6367), C32e(0xe4046bba), + C32e(0xf6ca19ab), C32e(0x0b56ee7e), C32e(0x1fb179ea), + C32e(0xa9282174), C32e(0xe9bdf735), C32e(0x3b3651ee), + C32e(0x1d57ac5a), C32e(0x7550d376), C32e(0x3a46c2fe), + C32e(0xa37d7001), C32e(0xf735c1af), C32e(0x98a4d842), + C32e(0x78edec20), C32e(0x9e6b6779), C32e(0x41836315), + C32e(0xea3adba8), C32e(0xfac33b4d), C32e(0x32832c83), + C32e(0xa7403b1f), C32e(0x1c2747f3), C32e(0x5940f034), + C32e(0xb72d769a), C32e(0xe73e4e6c), C32e(0xd2214ffd), + C32e(0xb8fd8d39), C32e(0xdc5759ef), C32e(0x8d9b0c49), + C32e(0x2b49ebda), C32e(0x5ba2d749), C32e(0x68f3700d), + C32e(0x7d3baed0), C32e(0x7a8d5584), C32e(0xf5a5e9f0), + C32e(0xe4f88e65), C32e(0xa0b8a2f4), C32e(0x36103b53), + C32e(0x0ca8079e), C32e(0x753eec5a), C32e(0x91689492), + C32e(0x56e8884f), C32e(0x5bb05c55), C32e(0xf8babc4c), + C32e(0xe3bb3b99), C32e(0xf387947b), C32e(0x75daf4d6), + C32e(0x726b1c5d), C32e(0x64aeac28), C32e(0xdc34b36d), + C32e(0x6c34a550), C32e(0xb828db71), C32e(0xf861e2f2), + C32e(0x108d512a), C32e(0xe3db6433), C32e(0x59dd75fc), + C32e(0x1cacbcf1), C32e(0x43ce3fa2), C32e(0x67bbd13c), + C32e(0x02e843b0), C32e(0x330a5bca), C32e(0x8829a175), + C32e(0x7f34194d), C32e(0xb416535c), C32e(0x923b94c3), + C32e(0x0e794d1e), C32e(0x797475d7), C32e(0xb6eeaf3f), + C32e(0xeaa8d4f7), C32e(0xbe1a3921), C32e(0x5cf47e09), + C32e(0x4c232751), C32e(0x26a32453), C32e(0xba323cd2), + C32e(0x44a3174a), C32e(0x6da6d5ad), C32e(0xb51d3ea6), + C32e(0xaff2c908), C32e(0x83593d98), C32e(0x916b3c56), + C32e(0x4cf87ca1), C32e(0x7286604d), C32e(0x46e23ecc), + C32e(0x086ec7f6), C32e(0x2f9833b3), C32e(0xb1bc765e), + C32e(0x2bd666a5), C32e(0xefc4e62a), C32e(0x06f4b6e8), + C32e(0xbec1d436), C32e(0x74ee8215), C32e(0xbcef2163), + C32e(0xfdc14e0d), C32e(0xf453c969), C32e(0xa77d5ac4), + C32e(0x06585826), C32e(0x7ec11416), C32e(0x06e0fa16), + C32e(0x7e90af3d), C32e(0x28639d3f), C32e(0xd2c9f2e3), + C32e(0x009bd20c), C32e(0x5faace30), C32e(0xb7d40c30), + C32e(0x742a5116), C32e(0xf2e03298), C32e(0x0deb30d8), + C32e(0xe3cef89a), C32e(0x4bc59e7b), C32e(0xb5f17992), + C32e(0xff51e66e), C32e(0x048668d3), C32e(0x9b234d57), + C32e(0xe6966731), C32e(0xcce6a6f3), C32e(0x170a7505), + C32e(0xb17681d9), C32e(0x13326cce), C32e(0x3c175284), + C32e(0xf805a262), C32e(0xf42bcbb3), C32e(0x78471547), + C32e(0xff465482), C32e(0x23936a48), C32e(0x38df5807), + C32e(0x4e5e6565), C32e(0xf2fc7c89), C32e(0xfc86508e), + C32e(0x31702e44), C32e(0xd00bca86), C32e(0xf04009a2), + C32e(0x3078474e), C32e(0x65a0ee39), C32e(0xd1f73883), + C32e(0xf75ee937), C32e(0xe42c3abd), C32e(0x2197b226), + C32e(0x0113f86f), C32e(0xa344edd1), C32e(0xef9fdee7), + C32e(0x8ba0df15), C32e(0x762592d9), C32e(0x3c85f7f6), + C32e(0x12dc42be), C32e(0xd8a7ec7c), C32e(0xab27b07e), + C32e(0x538d7dda), C32e(0xaa3ea8de), C32e(0xaa25ce93), + C32e(0xbd0269d8), C32e(0x5af643fd), C32e(0x1a7308f9), + C32e(0xc05fefda), C32e(0x174a19a5), C32e(0x974d6633), + C32e(0x4cfd216a), C32e(0x35b49831), C32e(0xdb411570), + C32e(0xea1e0fbb), C32e(0xedcd549b), C32e(0x9ad063a1), + C32e(0x51974072), C32e(0xf6759dbf), C32e(0x91476fe2) +}; + +#define Ceven_w3(r) (C[((r) << 3) + 0]) +#define Ceven_w2(r) (C[((r) << 3) + 1]) +#define Ceven_w1(r) (C[((r) << 3) + 2]) +#define Ceven_w0(r) (C[((r) << 3) + 3]) +#define Codd_w3(r) (C[((r) << 3) + 4]) +#define Codd_w2(r) (C[((r) << 3) + 5]) +#define Codd_w1(r) (C[((r) << 3) + 6]) +#define Codd_w0(r) (C[((r) << 3) + 7]) + +#define S(x0, x1, x2, x3, cb, 
r) do { \ + Sb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, cb ## w3(r)); \ + Sb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, cb ## w2(r)); \ + Sb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, cb ## w1(r)); \ + Sb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, cb ## w0(r)); \ + } while (0) + +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, \ + x4 ## 3, x5 ## 3, x6 ## 3, x7 ## 3); \ + Lb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, \ + x4 ## 2, x5 ## 2, x6 ## 2, x7 ## 2); \ + Lb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, \ + x4 ## 1, x5 ## 1, x6 ## 1, x7 ## 1); \ + Lb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, \ + x4 ## 0, x5 ## 0, x6 ## 0, x7 ## 0); \ + } while (0) + +#define Wz(x, c, n) do { \ + sph_u32 t = (x ## 3 & (c)) << (n); \ + x ## 3 = ((x ## 3 >> (n)) & (c)) | t; \ + t = (x ## 2 & (c)) << (n); \ + x ## 2 = ((x ## 2 >> (n)) & (c)) | t; \ + t = (x ## 1 & (c)) << (n); \ + x ## 1 = ((x ## 1 >> (n)) & (c)) | t; \ + t = (x ## 0 & (c)) << (n); \ + x ## 0 = ((x ## 0 >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C32(0x55555555), 1) +#define W1(x) Wz(x, SPH_C32(0x33333333), 2) +#define W2(x) Wz(x, SPH_C32(0x0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C32(0x00FF00FF), 8) +#define W4(x) Wz(x, SPH_C32(0x0000FFFF), 16) +#define W5(x) do { \ + sph_u32 t = x ## 3; \ + x ## 3 = x ## 2; \ + x ## 2 = t; \ + t = x ## 1; \ + x ## 1 = x ## 0; \ + x ## 0 = t; \ + } while (0) +#define W6(x) do { \ + sph_u32 t = x ## 3; \ + x ## 3 = x ## 1; \ + x ## 1 = t; \ + t = x ## 2; \ + x ## 2 = x ## 0; \ + x ## 0 = t; \ + } while (0) + +#define DECL_STATE \ + sph_u32 h03, h02, h01, h00, h13, h12, h11, h10; \ + sph_u32 h23, h22, h21, h20, h33, h32, h31, h30; \ + sph_u32 h43, h42, h41, h40, h53, h52, h51, h50; \ + sph_u32 h63, h62, h61, h60, h73, h72, h71, h70; \ + sph_u32 tmp; + +#define READ_STATE(state) do { \ + h03 = (state)->H.narrow[ 0]; \ + h02 = (state)->H.narrow[ 1]; \ + h01 = (state)->H.narrow[ 2]; \ + h00 = (state)->H.narrow[ 3]; \ + h13 = (state)->H.narrow[ 4]; \ + h12 = (state)->H.narrow[ 5]; \ + h11 = (state)->H.narrow[ 6]; \ + h10 = (state)->H.narrow[ 7]; \ + h23 = (state)->H.narrow[ 8]; \ + h22 = (state)->H.narrow[ 9]; \ + h21 = (state)->H.narrow[10]; \ + h20 = (state)->H.narrow[11]; \ + h33 = (state)->H.narrow[12]; \ + h32 = (state)->H.narrow[13]; \ + h31 = (state)->H.narrow[14]; \ + h30 = (state)->H.narrow[15]; \ + h43 = (state)->H.narrow[16]; \ + h42 = (state)->H.narrow[17]; \ + h41 = (state)->H.narrow[18]; \ + h40 = (state)->H.narrow[19]; \ + h53 = (state)->H.narrow[20]; \ + h52 = (state)->H.narrow[21]; \ + h51 = (state)->H.narrow[22]; \ + h50 = (state)->H.narrow[23]; \ + h63 = (state)->H.narrow[24]; \ + h62 = (state)->H.narrow[25]; \ + h61 = (state)->H.narrow[26]; \ + h60 = (state)->H.narrow[27]; \ + h73 = (state)->H.narrow[28]; \ + h72 = (state)->H.narrow[29]; \ + h71 = (state)->H.narrow[30]; \ + h70 = (state)->H.narrow[31]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->H.narrow[ 0] = h03; \ + (state)->H.narrow[ 1] = h02; \ + (state)->H.narrow[ 2] = h01; \ + (state)->H.narrow[ 3] = h00; \ + (state)->H.narrow[ 4] = h13; \ + (state)->H.narrow[ 5] = h12; \ + (state)->H.narrow[ 6] = h11; \ + (state)->H.narrow[ 7] = h10; \ + (state)->H.narrow[ 8] = h23; \ + (state)->H.narrow[ 9] = h22; \ + (state)->H.narrow[10] = h21; \ + (state)->H.narrow[11] = h20; \ + (state)->H.narrow[12] = h33; \ + (state)->H.narrow[13] = h32; \ + (state)->H.narrow[14] = h31; \ + (state)->H.narrow[15] = h30; \ + (state)->H.narrow[16] = h43; \ + (state)->H.narrow[17] = h42; \ + (state)->H.narrow[18] = h41; \ + 
(state)->H.narrow[19] = h40; \ + (state)->H.narrow[20] = h53; \ + (state)->H.narrow[21] = h52; \ + (state)->H.narrow[22] = h51; \ + (state)->H.narrow[23] = h50; \ + (state)->H.narrow[24] = h63; \ + (state)->H.narrow[25] = h62; \ + (state)->H.narrow[26] = h61; \ + (state)->H.narrow[27] = h60; \ + (state)->H.narrow[28] = h73; \ + (state)->H.narrow[29] = h72; \ + (state)->H.narrow[30] = h71; \ + (state)->H.narrow[31] = h70; \ + } while (0) + +#define INPUT_BUF1 \ + sph_u32 m03 = dec32e_aligned(buf + 0); \ + sph_u32 m02 = dec32e_aligned(buf + 4); \ + sph_u32 m01 = dec32e_aligned(buf + 8); \ + sph_u32 m00 = dec32e_aligned(buf + 12); \ + sph_u32 m13 = dec32e_aligned(buf + 16); \ + sph_u32 m12 = dec32e_aligned(buf + 20); \ + sph_u32 m11 = dec32e_aligned(buf + 24); \ + sph_u32 m10 = dec32e_aligned(buf + 28); \ + sph_u32 m23 = dec32e_aligned(buf + 32); \ + sph_u32 m22 = dec32e_aligned(buf + 36); \ + sph_u32 m21 = dec32e_aligned(buf + 40); \ + sph_u32 m20 = dec32e_aligned(buf + 44); \ + sph_u32 m33 = dec32e_aligned(buf + 48); \ + sph_u32 m32 = dec32e_aligned(buf + 52); \ + sph_u32 m31 = dec32e_aligned(buf + 56); \ + sph_u32 m30 = dec32e_aligned(buf + 60); \ + h03 ^= m03; \ + h02 ^= m02; \ + h01 ^= m01; \ + h00 ^= m00; \ + h13 ^= m13; \ + h12 ^= m12; \ + h11 ^= m11; \ + h10 ^= m10; \ + h23 ^= m23; \ + h22 ^= m22; \ + h21 ^= m21; \ + h20 ^= m20; \ + h33 ^= m33; \ + h32 ^= m32; \ + h31 ^= m31; \ + h30 ^= m30; + +#define INPUT_BUF2 \ + h43 ^= m03; \ + h42 ^= m02; \ + h41 ^= m01; \ + h40 ^= m00; \ + h53 ^= m13; \ + h52 ^= m12; \ + h51 ^= m11; \ + h50 ^= m10; \ + h63 ^= m23; \ + h62 ^= m22; \ + h61 ^= m21; \ + h60 ^= m20; \ + h73 ^= m33; \ + h72 ^= m32; \ + h71 ^= m31; \ + h70 ^= m30; + +static const sph_u32 IV224[] = { + C32e(0x2dfedd62), C32e(0xf99a98ac), C32e(0xae7cacd6), C32e(0x19d634e7), + C32e(0xa4831005), C32e(0xbc301216), C32e(0xb86038c6), C32e(0xc9661494), + C32e(0x66d9899f), C32e(0x2580706f), C32e(0xce9ea31b), C32e(0x1d9b1adc), + C32e(0x11e8325f), C32e(0x7b366e10), C32e(0xf994857f), C32e(0x02fa06c1), + C32e(0x1b4f1b5c), C32e(0xd8c840b3), C32e(0x97f6a17f), C32e(0x6e738099), + C32e(0xdcdf93a5), C32e(0xadeaa3d3), C32e(0xa431e8de), C32e(0xc9539a68), + C32e(0x22b4a98a), C32e(0xec86a1e4), C32e(0xd574ac95), C32e(0x9ce56cf0), + C32e(0x15960dea), C32e(0xb5ab2bbf), C32e(0x9611dcf0), C32e(0xdd64ea6e) +}; + +static const sph_u32 IV256[] = { + C32e(0xeb98a341), C32e(0x2c20d3eb), C32e(0x92cdbe7b), C32e(0x9cb245c1), + C32e(0x1c935191), C32e(0x60d4c7fa), C32e(0x260082d6), C32e(0x7e508a03), + C32e(0xa4239e26), C32e(0x7726b945), C32e(0xe0fb1a48), C32e(0xd41a9477), + C32e(0xcdb5ab26), C32e(0x026b177a), C32e(0x56f02442), C32e(0x0fff2fa8), + C32e(0x71a39689), C32e(0x7f2e4d75), C32e(0x1d144908), C32e(0xf77de262), + C32e(0x277695f7), C32e(0x76248f94), C32e(0x87d5b657), C32e(0x4780296c), + C32e(0x5c5e272d), C32e(0xac8e0d6c), C32e(0x518450c6), C32e(0x57057a0f), + C32e(0x7be4d367), C32e(0x702412ea), C32e(0x89e3ab13), C32e(0xd31cd769) +}; + +static const sph_u32 IV384[] = { + C32e(0x481e3bc6), C32e(0xd813398a), C32e(0x6d3b5e89), C32e(0x4ade879b), + C32e(0x63faea68), C32e(0xd480ad2e), C32e(0x332ccb21), C32e(0x480f8267), + C32e(0x98aec84d), C32e(0x9082b928), C32e(0xd455ea30), C32e(0x41114249), + C32e(0x36f555b2), C32e(0x924847ec), C32e(0xc7250a93), C32e(0xbaf43ce1), + C32e(0x569b7f8a), C32e(0x27db454c), C32e(0x9efcbd49), C32e(0x6397af0e), + C32e(0x589fc27d), C32e(0x26aa80cd), C32e(0x80c08b8c), C32e(0x9deb2eda), + C32e(0x8a7981e8), C32e(0xf8d5373a), C32e(0xf43967ad), C32e(0xddd17a71), + C32e(0xa9b4d3bd), C32e(0xa475d394), 
C32e(0x976c3fba), C32e(0x9842737f) +}; + +static const sph_u32 IV512[] = { + C32e(0x6fd14b96), C32e(0x3e00aa17), C32e(0x636a2e05), C32e(0x7a15d543), + C32e(0x8a225e8d), C32e(0x0c97ef0b), C32e(0xe9341259), C32e(0xf2b3c361), + C32e(0x891da0c1), C32e(0x536f801e), C32e(0x2aa9056b), C32e(0xea2b6d80), + C32e(0x588eccdb), C32e(0x2075baa6), C32e(0xa90f3a76), C32e(0xbaf83bf7), + C32e(0x0169e605), C32e(0x41e34a69), C32e(0x46b58a8e), C32e(0x2e6fe65a), + C32e(0x1047a7d0), C32e(0xc1843c24), C32e(0x3b6e71b1), C32e(0x2d5ac199), + C32e(0xcf57f6ec), C32e(0x9db1f856), C32e(0xa706887c), C32e(0x5716b156), + C32e(0xe3c2fcdf), C32e(0xe68517fb), C32e(0x545a4678), C32e(0xcc8cdd4b) +}; + +#endif + +#define SL(ro) SLu(r + ro, ro) + +#define SLu(r, ro) do { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + W ## ro(h1); \ + W ## ro(h3); \ + W ## ro(h5); \ + W ## ro(h7); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_JH + +#if SPH_JH_64 + +/* + * The "small footprint" 64-bit version just uses a partially unrolled + * loop. + */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#else + +#define E8 do { \ + unsigned r, g; \ + for (r = g = 0; r < 42; r ++) { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + switch (g) { \ + case 0: \ + W0(h1); \ + W0(h3); \ + W0(h5); \ + W0(h7); \ + break; \ + case 1: \ + W1(h1); \ + W1(h3); \ + W1(h5); \ + W1(h7); \ + break; \ + case 2: \ + W2(h1); \ + W2(h3); \ + W2(h5); \ + W2(h7); \ + break; \ + case 3: \ + W3(h1); \ + W3(h3); \ + W3(h5); \ + W3(h7); \ + break; \ + case 4: \ + W4(h1); \ + W4(h3); \ + W4(h5); \ + W4(h7); \ + break; \ + case 5: \ + W5(h1); \ + W5(h3); \ + W5(h5); \ + W5(h7); \ + break; \ + case 6: \ + W6(h1); \ + W6(h3); \ + W6(h5); \ + W6(h7); \ + break; \ + } \ + if (++ g == 7) \ + g = 0; \ + } \ + } while (0) + +#endif + +#else + +#if SPH_JH_64 + +/* + * On a "true 64-bit" architecture, we can unroll at will. + */ + +#define E8 do { \ + SLu( 0, 0); \ + SLu( 1, 1); \ + SLu( 2, 2); \ + SLu( 3, 3); \ + SLu( 4, 4); \ + SLu( 5, 5); \ + SLu( 6, 6); \ + SLu( 7, 0); \ + SLu( 8, 1); \ + SLu( 9, 2); \ + SLu(10, 3); \ + SLu(11, 4); \ + SLu(12, 5); \ + SLu(13, 6); \ + SLu(14, 0); \ + SLu(15, 1); \ + SLu(16, 2); \ + SLu(17, 3); \ + SLu(18, 4); \ + SLu(19, 5); \ + SLu(20, 6); \ + SLu(21, 0); \ + SLu(22, 1); \ + SLu(23, 2); \ + SLu(24, 3); \ + SLu(25, 4); \ + SLu(26, 5); \ + SLu(27, 6); \ + SLu(28, 0); \ + SLu(29, 1); \ + SLu(30, 2); \ + SLu(31, 3); \ + SLu(32, 4); \ + SLu(33, 5); \ + SLu(34, 6); \ + SLu(35, 0); \ + SLu(36, 1); \ + SLu(37, 2); \ + SLu(38, 3); \ + SLu(39, 4); \ + SLu(40, 5); \ + SLu(41, 6); \ + } while (0) + +#else + +/* + * We are not aiming at a small footprint, but we are still using a + * 32-bit implementation. Full loop unrolling would smash the L1 + * cache on some "big" architectures (32 kB L1 cache). 
+ */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#endif + +#endif + +static void +jh_init(sph_jh_context *sc, const void *iv) +{ + sc->ptr = 0; +#if SPH_JH_64 + memcpy(sc->H.wide, iv, sizeof sc->H.wide); +#else + memcpy(sc->H.narrow, iv, sizeof sc->H.narrow); +#endif +#if SPH_64 + sc->block_count = 0; +#else + sc->block_count_high = 0; + sc->block_count_low = 0; +#endif +} + +static void +jh_core(sph_jh_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INPUT_BUF1; + E8; + INPUT_BUF2; +#if SPH_64 + sc->block_count ++; +#else + if ((sc->block_count_low = SPH_T32( + sc->block_count_low + 1)) == 0) + sc->block_count_high ++; +#endif + ptr = 0; + } + } + WRITE_STATE(sc); + sc->ptr = ptr; +} + +static void +jh_close(sph_jh_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32, const void *iv) +{ + unsigned z; + unsigned char buf[128]; + size_t numz, u; +#if SPH_64 + sph_u64 l0, l1; +#else + sph_u32 l0, l1, l2, l3; +#endif + + z = 0x80 >> n; + buf[0] = ((ub & -z) | z) & 0xFF; + if (sc->ptr == 0 && n == 0) { + numz = 47; + } else { + numz = 111 - sc->ptr; + } + memset(buf + 1, 0, numz); +#if SPH_64 + l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3) + n; + l1 = SPH_T64(sc->block_count >> 55); + sph_enc64be(buf + numz + 1, l1); + sph_enc64be(buf + numz + 9, l0); +#else + l0 = SPH_T32(sc->block_count_low << 9) + (sc->ptr << 3) + n; + l1 = SPH_T32(sc->block_count_low >> 23) + + SPH_T32(sc->block_count_high << 9); + l2 = SPH_T32(sc->block_count_high >> 23); + l3 = 0; + sph_enc32be(buf + numz + 1, l3); + sph_enc32be(buf + numz + 5, l2); + sph_enc32be(buf + numz + 9, l1); + sph_enc32be(buf + numz + 13, l0); +#endif + jh_core(sc, buf, numz + 17); +#if SPH_JH_64 + for (u = 0; u < 8; u ++) + enc64e(buf + (u << 3), sc->H.wide[u + 8]); +#else + for (u = 0; u < 16; u ++) + enc32e(buf + (u << 2), sc->H.narrow[u + 16]); +#endif + memcpy(dst, buf + ((16 - out_size_w32) << 2), out_size_w32 << 2); + jh_init(sc, iv); +} + +/* see sph_jh.h */ +void +sph_jh224_init(void *cc) +{ + jh_init(cc, IV224); +} + +/* see sph_jh.h */ +void +sph_jh224(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh224_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 7, IV224); +} + +/* see sph_jh.h */ +void +sph_jh224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 7, IV224); +} + +/* see sph_jh.h */ +void +sph_jh256_init(void *cc) +{ + jh_init(cc, IV256); +} + +/* see sph_jh.h */ +void +sph_jh256(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh256_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 8, IV256); +} + +/* see sph_jh.h */ +void +sph_jh256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 8, IV256); +} + +/* see sph_jh.h */ +void +sph_jh384_init(void *cc) +{ + jh_init(cc, IV384); +} + +/* see sph_jh.h */ +void 
+sph_jh384(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh384_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 12, IV384); +} + +/* see sph_jh.h */ +void +sph_jh384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 12, IV384); +} + +/* see sph_jh.h */ +void +sph_jh512_init(void *cc) +{ + jh_init(cc, IV512); +} + +/* see sph_jh.h */ +void +sph_jh512(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh512_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 16, IV512); +} + +/* see sph_jh.h */ +void +sph_jh512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 16, IV512); +} diff --git a/keccak.c b/sph/keccak.c similarity index 100% rename from keccak.c rename to sph/keccak.c diff --git a/sph/luffa.c b/sph/luffa.c new file mode 100644 index 0000000..a761bea --- /dev/null +++ b/sph/luffa.c @@ -0,0 +1,1426 @@ +/* $Id: luffa.c 219 2010-06-08 17:24:41Z tp $ */ +/* + * Luffa implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include +#include + +#include "sph_luffa.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_64_TRUE && !defined SPH_LUFFA_PARALLEL +#define SPH_LUFFA_PARALLEL 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 V_INIT[5][8] = { + { + SPH_C32(0x6d251e69), SPH_C32(0x44b051e0), + SPH_C32(0x4eaa6fb4), SPH_C32(0xdbf78465), + SPH_C32(0x6e292011), SPH_C32(0x90152df4), + SPH_C32(0xee058139), SPH_C32(0xdef610bb) + }, { + SPH_C32(0xc3b44b95), SPH_C32(0xd9d2f256), + SPH_C32(0x70eee9a0), SPH_C32(0xde099fa3), + SPH_C32(0x5d9b0557), SPH_C32(0x8fc944b3), + SPH_C32(0xcf1ccf0e), SPH_C32(0x746cd581) + }, { + SPH_C32(0xf7efc89d), SPH_C32(0x5dba5781), + SPH_C32(0x04016ce5), SPH_C32(0xad659c05), + SPH_C32(0x0306194f), SPH_C32(0x666d1836), + SPH_C32(0x24aa230a), SPH_C32(0x8b264ae7) + }, { + SPH_C32(0x858075d5), SPH_C32(0x36d79cce), + SPH_C32(0xe571f7d7), SPH_C32(0x204b1f67), + SPH_C32(0x35870c6a), SPH_C32(0x57e9e923), + SPH_C32(0x14bcb808), SPH_C32(0x7cde72ce) + }, { + SPH_C32(0x6c68e9be), SPH_C32(0x5ec41e22), + SPH_C32(0xc825b7c7), SPH_C32(0xaffb4363), + SPH_C32(0xf5df3999), SPH_C32(0x0fc688f1), + SPH_C32(0xb07224cc), SPH_C32(0x03e86cea) + } +}; + +static const sph_u32 RC00[8] = { + SPH_C32(0x303994a6), SPH_C32(0xc0e65299), + SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e), + SPH_C32(0x1e00108f), SPH_C32(0x7800423d), + SPH_C32(0x8f5b7882), SPH_C32(0x96e1db12) +}; + +static const sph_u32 RC04[8] = { + SPH_C32(0xe0337818), SPH_C32(0x441ba90d), + SPH_C32(0x7f34d442), SPH_C32(0x9389217f), + SPH_C32(0xe5a8bce6), SPH_C32(0x5274baf4), + SPH_C32(0x26889ba7), SPH_C32(0x9a226e9d) +}; + +static const sph_u32 RC10[8] = { + SPH_C32(0xb6de10ed), SPH_C32(0x70f47aae), + SPH_C32(0x0707a3d4), SPH_C32(0x1c1e8f51), + SPH_C32(0x707a3d45), SPH_C32(0xaeb28562), + SPH_C32(0xbaca1589), SPH_C32(0x40a46f3e) +}; + +static const sph_u32 RC14[8] = { + SPH_C32(0x01685f3d), SPH_C32(0x05a17cf4), + SPH_C32(0xbd09caca), SPH_C32(0xf4272b28), + SPH_C32(0x144ae5cc), SPH_C32(0xfaa7ae2b), + SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704) +}; + +#if SPH_LUFFA_PARALLEL + +static const sph_u64 RCW010[8] = { + SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299), + SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e), + SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d), + SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12) +}; + +static const sph_u64 RCW014[8] = { + SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d), + SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f), + SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4), + SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d) +}; + +#endif + +static const sph_u32 RC20[8] = { + SPH_C32(0xfc20d9d2), SPH_C32(0x34552e25), + SPH_C32(0x7ad8818f), SPH_C32(0x8438764a), + SPH_C32(0xbb6de032), SPH_C32(0xedb780c8), + SPH_C32(0xd9847356), SPH_C32(0xa2c78434) +}; + +static const sph_u32 RC24[8] = { + SPH_C32(0xe25e72c1), SPH_C32(0xe623bb72), + SPH_C32(0x5c58a4a4), SPH_C32(0x1e38e2e7), + SPH_C32(0x78e38b9d), SPH_C32(0x27586719), + SPH_C32(0x36eda57f), SPH_C32(0x703aace7) +}; + +static const sph_u32 RC30[8] = { + SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95), + SPH_C32(0x4e608a22), SPH_C32(0x56d858fe), + SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d), + SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208) +}; + +static const sph_u32 RC34[8] = { + SPH_C32(0xe028c9bf), SPH_C32(0x44756f91), + SPH_C32(0x7e8fce32), SPH_C32(0x956548be), + 
SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5), + SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355) +}; + +#if SPH_LUFFA_PARALLEL + +static const sph_u64 RCW230[8] = { + SPH_C64(0xb213afa5fc20d9d2), SPH_C64(0xc84ebe9534552e25), + SPH_C64(0x4e608a227ad8818f), SPH_C64(0x56d858fe8438764a), + SPH_C64(0x343b138fbb6de032), SPH_C64(0xd0ec4e3dedb780c8), + SPH_C64(0x2ceb4882d9847356), SPH_C64(0xb3ad2208a2c78434) +}; + + +static const sph_u64 RCW234[8] = { + SPH_C64(0xe028c9bfe25e72c1), SPH_C64(0x44756f91e623bb72), + SPH_C64(0x7e8fce325c58a4a4), SPH_C64(0x956548be1e38e2e7), + SPH_C64(0xfe191be278e38b9d), SPH_C64(0x3cb226e527586719), + SPH_C64(0x5944a28e36eda57f), SPH_C64(0xa1c4c355703aace7) +}; + +#endif + +static const sph_u32 RC40[8] = { + SPH_C32(0xf0d2e9e3), SPH_C32(0xac11d7fa), + SPH_C32(0x1bcb66f2), SPH_C32(0x6f2d9bc9), + SPH_C32(0x78602649), SPH_C32(0x8edae952), + SPH_C32(0x3b6ba548), SPH_C32(0xedae9520) +}; + +static const sph_u32 RC44[8] = { + SPH_C32(0x5090d577), SPH_C32(0x2d1925ab), + SPH_C32(0xb46496ac), SPH_C32(0xd1925ab0), + SPH_C32(0x29131ab6), SPH_C32(0x0fc053c3), + SPH_C32(0x3f014f0c), SPH_C32(0xfc053c31) +}; + +#define DECL_TMP8(w) \ + sph_u32 w ## 0, w ## 1, w ## 2, w ## 3, w ## 4, w ## 5, w ## 6, w ## 7; + +#define M2(d, s) do { \ + sph_u32 tmp = s ## 7; \ + d ## 7 = s ## 6; \ + d ## 6 = s ## 5; \ + d ## 5 = s ## 4; \ + d ## 4 = s ## 3 ^ tmp; \ + d ## 3 = s ## 2 ^ tmp; \ + d ## 2 = s ## 1; \ + d ## 1 = s ## 0 ^ tmp; \ + d ## 0 = tmp; \ + } while (0) + +#define XOR(d, s1, s2) do { \ + d ## 0 = s1 ## 0 ^ s2 ## 0; \ + d ## 1 = s1 ## 1 ^ s2 ## 1; \ + d ## 2 = s1 ## 2 ^ s2 ## 2; \ + d ## 3 = s1 ## 3 ^ s2 ## 3; \ + d ## 4 = s1 ## 4 ^ s2 ## 4; \ + d ## 5 = s1 ## 5 ^ s2 ## 5; \ + d ## 6 = s1 ## 6 ^ s2 ## 6; \ + d ## 7 = s1 ## 7 ^ s2 ## 7; \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define SUB_CRUMB_GEN(a0, a1, a2, a3, width) do { \ + sph_u ## width tmp; \ + tmp = (a0); \ + (a0) |= (a1); \ + (a2) ^= (a3); \ + (a1) = SPH_T ## width(~(a1)); \ + (a0) ^= (a3); \ + (a3) &= tmp; \ + (a1) ^= (a3); \ + (a3) ^= (a2); \ + (a2) &= (a0); \ + (a0) = SPH_T ## width(~(a0)); \ + (a2) ^= (a1); \ + (a1) |= (a3); \ + tmp ^= (a1); \ + (a3) ^= (a2); \ + (a2) &= (a1); \ + (a1) ^= (a0); \ + (a0) = tmp; \ + } while (0) + +#define SUB_CRUMB(a0, a1, a2, a3) SUB_CRUMB_GEN(a0, a1, a2, a3, 32) +#define SUB_CRUMBW(a0, a1, a2, a3) SUB_CRUMB_GEN(a0, a1, a2, a3, 64) + + +#if 0 + +#define ROL32W(x, n) SPH_T64( \ + (((x) << (n)) \ + & ~((SPH_C64(0xFFFFFFFF) >> (32 - (n))) << 32)) \ + | (((x) >> (32 - (n))) \ + & ~((SPH_C64(0xFFFFFFFF) >> (n)) << (n)))) + +#define MIX_WORDW(u, v) do { \ + (v) ^= (u); \ + (u) = ROL32W((u), 2) ^ (v); \ + (v) = ROL32W((v), 14) ^ (u); \ + (u) = ROL32W((u), 10) ^ (v); \ + (v) = ROL32W((v), 1); \ + } while (0) + +#endif + +#define MIX_WORDW(u, v) do { \ + sph_u32 ul, uh, vl, vh; \ + (v) ^= (u); \ + ul = SPH_T32((sph_u32)(u)); \ + uh = SPH_T32((sph_u32)((u) >> 32)); \ + vl = SPH_T32((sph_u32)(v)); \ + vh = SPH_T32((sph_u32)((v) >> 32)); \ + ul = SPH_ROTL32(ul, 2) ^ vl; \ + vl = SPH_ROTL32(vl, 14) ^ ul; \ + ul = SPH_ROTL32(ul, 10) ^ vl; \ + vl = SPH_ROTL32(vl, 1); \ + uh = SPH_ROTL32(uh, 2) ^ vh; \ + vh = SPH_ROTL32(vh, 14) ^ uh; \ + uh = SPH_ROTL32(uh, 10) ^ vh; \ + vh = SPH_ROTL32(vh, 1); \ + (u) = (sph_u64)ul | ((sph_u64)uh << 32); \ + (v) = (sph_u64)vl | ((sph_u64)vh << 32); \ + } while (0) + +#else + +#define SUB_CRUMB(a0, a1, a2, a3) do { \ + sph_u32 tmp; \ + tmp = (a0); \ + (a0) |= (a1); \ + (a2) ^= (a3); \ + (a1) = SPH_T32(~(a1)); \ + (a0) ^= (a3); \ + (a3) &= tmp; \ + (a1) ^= (a3); \ + (a3) ^= (a2); \ 
+ (a2) &= (a0); \ + (a0) = SPH_T32(~(a0)); \ + (a2) ^= (a1); \ + (a1) |= (a3); \ + tmp ^= (a1); \ + (a3) ^= (a2); \ + (a2) &= (a1); \ + (a1) ^= (a0); \ + (a0) = tmp; \ + } while (0) + +#endif + +#define MIX_WORD(u, v) do { \ + (v) ^= (u); \ + (u) = SPH_ROTL32((u), 2) ^ (v); \ + (v) = SPH_ROTL32((v), 14) ^ (u); \ + (u) = SPH_ROTL32((u), 10) ^ (v); \ + (v) = SPH_ROTL32((v), 1); \ + } while (0) + +#define DECL_STATE3 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; + +#define READ_STATE3(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + } while (0) + +#define WRITE_STATE3(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + (state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + (state)->V[2][7] = V27; \ + } while (0) + +#define MI3 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(a, a, V2); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V0, M, V0); \ + M2(M, M); \ + XOR(V1, a, V1); \ + XOR(V1, M, V1); \ + M2(M, M); \ + XOR(V2, a, V2); \ + XOR(V2, M, V2); \ + } while (0) + +#define TWEAK3 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P3 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK3; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); 
\ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + } while (0) + +#else + +#define P3 do { \ + int r; \ + TWEAK3; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + } while (0) + +#endif + +#define DECL_STATE4 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \ + sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; + +#define READ_STATE4(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + V30 = (state)->V[3][0]; \ + V31 = (state)->V[3][1]; \ + V32 = (state)->V[3][2]; \ + V33 = (state)->V[3][3]; \ + V34 = (state)->V[3][4]; \ + V35 = (state)->V[3][5]; \ + V36 = (state)->V[3][6]; \ + V37 = (state)->V[3][7]; \ + } while (0) + +#define WRITE_STATE4(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + (state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + 
(state)->V[2][7] = V27; \ + (state)->V[3][0] = V30; \ + (state)->V[3][1] = V31; \ + (state)->V[3][2] = V32; \ + (state)->V[3][3] = V33; \ + (state)->V[3][4] = V34; \ + (state)->V[3][5] = V35; \ + (state)->V[3][6] = V36; \ + (state)->V[3][7] = V37; \ + } while (0) + +#define MI4 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + DECL_TMP8(b) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(b, V2, V3); \ + XOR(a, a, b); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V1, a, V1); \ + XOR(V2, a, V2); \ + XOR(V3, a, V3); \ + M2(b, V0); \ + XOR(b, b, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V1); \ + M2(V1, V1); \ + XOR(V1, V1, V0); \ + XOR(V0, b, M); \ + M2(M, M); \ + XOR(V1, V1, M); \ + M2(M, M); \ + XOR(V2, V2, M); \ + M2(M, M); \ + XOR(V3, V3, M); \ + } while (0) + +#define TWEAK4 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + V34 = SPH_ROTL32(V34, 3); \ + V35 = SPH_ROTL32(V35, 3); \ + V36 = SPH_ROTL32(V36, 3); \ + V37 = SPH_ROTL32(V37, 3); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P4 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK4; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \ + W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \ + W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \ + W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \ + W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \ + W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \ + W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \ + W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW230[r]; \ + W4 ^= RCW234[r]; \ + } \ + V20 = SPH_T32((sph_u32)W0); \ + V30 = SPH_T32((sph_u32)(W0 >> 32)); \ + V21 = SPH_T32((sph_u32)W1); \ + V31 = SPH_T32((sph_u32)(W1 >> 32)); \ + V22 = SPH_T32((sph_u32)W2); 
\ + V32 = SPH_T32((sph_u32)(W2 >> 32)); \ + V23 = SPH_T32((sph_u32)W3); \ + V33 = SPH_T32((sph_u32)(W3 >> 32)); \ + V24 = SPH_T32((sph_u32)W4); \ + V34 = SPH_T32((sph_u32)(W4 >> 32)); \ + V25 = SPH_T32((sph_u32)W5); \ + V35 = SPH_T32((sph_u32)(W5 >> 32)); \ + V26 = SPH_T32((sph_u32)W6); \ + V36 = SPH_T32((sph_u32)(W6 >> 32)); \ + V27 = SPH_T32((sph_u32)W7); \ + V37 = SPH_T32((sph_u32)(W7 >> 32)); \ + } while (0) + +#else + +#define P4 do { \ + int r; \ + TWEAK4; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V30, V31, V32, V33); \ + SUB_CRUMB(V35, V36, V37, V34); \ + MIX_WORD(V30, V34); \ + MIX_WORD(V31, V35); \ + MIX_WORD(V32, V36); \ + MIX_WORD(V33, V37); \ + V30 ^= RC30[r]; \ + V34 ^= RC34[r]; \ + } \ + } while (0) + +#endif + +#define DECL_STATE5 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \ + sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; \ + sph_u32 V40, V41, V42, V43, V44, V45, V46, V47; + +#define READ_STATE5(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + V30 = (state)->V[3][0]; \ + V31 = (state)->V[3][1]; \ + V32 = (state)->V[3][2]; \ + V33 = (state)->V[3][3]; \ + V34 = (state)->V[3][4]; \ + V35 = (state)->V[3][5]; \ + V36 = (state)->V[3][6]; \ + V37 = (state)->V[3][7]; \ + V40 = (state)->V[4][0]; \ + V41 = (state)->V[4][1]; \ + V42 = (state)->V[4][2]; \ + V43 = (state)->V[4][3]; \ + V44 = (state)->V[4][4]; \ + V45 = (state)->V[4][5]; \ + V46 = (state)->V[4][6]; \ + V47 = (state)->V[4][7]; \ + } while (0) + +#define WRITE_STATE5(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + 
(state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + (state)->V[2][7] = V27; \ + (state)->V[3][0] = V30; \ + (state)->V[3][1] = V31; \ + (state)->V[3][2] = V32; \ + (state)->V[3][3] = V33; \ + (state)->V[3][4] = V34; \ + (state)->V[3][5] = V35; \ + (state)->V[3][6] = V36; \ + (state)->V[3][7] = V37; \ + (state)->V[4][0] = V40; \ + (state)->V[4][1] = V41; \ + (state)->V[4][2] = V42; \ + (state)->V[4][3] = V43; \ + (state)->V[4][4] = V44; \ + (state)->V[4][5] = V45; \ + (state)->V[4][6] = V46; \ + (state)->V[4][7] = V47; \ + } while (0) + +#define MI5 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + DECL_TMP8(b) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(b, V2, V3); \ + XOR(a, a, b); \ + XOR(a, a, V4); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V1, a, V1); \ + XOR(V2, a, V2); \ + XOR(V3, a, V3); \ + XOR(V4, a, V4); \ + M2(b, V0); \ + XOR(b, b, V1); \ + M2(V1, V1); \ + XOR(V1, V1, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V4); \ + M2(V4, V4); \ + XOR(V4, V4, V0); \ + M2(V0, b); \ + XOR(V0, V0, V4); \ + M2(V4, V4); \ + XOR(V4, V4, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V1); \ + M2(V1, V1); \ + XOR(V1, V1, b); \ + XOR(V0, V0, M); \ + M2(M, M); \ + XOR(V1, V1, M); \ + M2(M, M); \ + XOR(V2, V2, M); \ + M2(M, M); \ + XOR(V3, V3, M); \ + M2(M, M); \ + XOR(V4, V4, M); \ + } while (0) + +#define TWEAK5 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + V34 = SPH_ROTL32(V34, 3); \ + V35 = SPH_ROTL32(V35, 3); \ + V36 = SPH_ROTL32(V36, 3); \ + V37 = SPH_ROTL32(V37, 3); \ + V44 = SPH_ROTL32(V44, 4); \ + V45 = SPH_ROTL32(V45, 4); \ + V46 = SPH_ROTL32(V46, 4); \ + V47 = SPH_ROTL32(V47, 4); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P5 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK5; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \ + W1 = (sph_u64)V21 | ((sph_u64)V31 
<< 32); \ + W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \ + W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \ + W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \ + W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \ + W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \ + W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW230[r]; \ + W4 ^= RCW234[r]; \ + } \ + V20 = SPH_T32((sph_u32)W0); \ + V30 = SPH_T32((sph_u32)(W0 >> 32)); \ + V21 = SPH_T32((sph_u32)W1); \ + V31 = SPH_T32((sph_u32)(W1 >> 32)); \ + V22 = SPH_T32((sph_u32)W2); \ + V32 = SPH_T32((sph_u32)(W2 >> 32)); \ + V23 = SPH_T32((sph_u32)W3); \ + V33 = SPH_T32((sph_u32)(W3 >> 32)); \ + V24 = SPH_T32((sph_u32)W4); \ + V34 = SPH_T32((sph_u32)(W4 >> 32)); \ + V25 = SPH_T32((sph_u32)W5); \ + V35 = SPH_T32((sph_u32)(W5 >> 32)); \ + V26 = SPH_T32((sph_u32)W6); \ + V36 = SPH_T32((sph_u32)(W6 >> 32)); \ + V27 = SPH_T32((sph_u32)W7); \ + V37 = SPH_T32((sph_u32)(W7 >> 32)); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V40, V41, V42, V43); \ + SUB_CRUMB(V45, V46, V47, V44); \ + MIX_WORD(V40, V44); \ + MIX_WORD(V41, V45); \ + MIX_WORD(V42, V46); \ + MIX_WORD(V43, V47); \ + V40 ^= RC40[r]; \ + V44 ^= RC44[r]; \ + } \ + } while (0) + +#else + +#define P5 do { \ + int r; \ + TWEAK5; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V30, V31, V32, V33); \ + SUB_CRUMB(V35, V36, V37, V34); \ + MIX_WORD(V30, V34); \ + MIX_WORD(V31, V35); \ + MIX_WORD(V32, V36); \ + MIX_WORD(V33, V37); \ + V30 ^= RC30[r]; \ + V34 ^= RC34[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V40, V41, V42, V43); \ + SUB_CRUMB(V45, V46, V47, V44); \ + MIX_WORD(V40, V44); \ + MIX_WORD(V41, V45); \ + MIX_WORD(V42, V46); \ + MIX_WORD(V43, V47); \ + V40 ^= RC40[r]; \ + V44 ^= RC44[r]; \ + } \ + } while (0) + +#endif + +static void +luffa3(sph_luffa224_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE3 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE3(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI3; + P3; + ptr = 0; + } + } + WRITE_STATE3(sc); + sc->ptr = ptr; +} + +static void +luffa3_close(sph_luffa224_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE3 + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; 
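+	/*
+	 * Finalization: the byte written above carries the final "1"
+	 * padding bit (0x80 >> n within the last partial byte); the
+	 * rest of the block is zeroed below.  The padded block and
+	 * then one all-zero block are absorbed (the i < 2 loop), and
+	 * the digest is the XOR of the three 256-bit chains (seven
+	 * output words for Luffa-224, eight for Luffa-256).
+	 */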
+ memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE3(sc); + for (i = 0; i < 2; i ++) { + MI3; + P3; + memset(buf, 0, sizeof sc->buf); + } + out = dst; + sph_enc32be(out + 0, V00 ^ V10 ^ V20); + sph_enc32be(out + 4, V01 ^ V11 ^ V21); + sph_enc32be(out + 8, V02 ^ V12 ^ V22); + sph_enc32be(out + 12, V03 ^ V13 ^ V23); + sph_enc32be(out + 16, V04 ^ V14 ^ V24); + sph_enc32be(out + 20, V05 ^ V15 ^ V25); + sph_enc32be(out + 24, V06 ^ V16 ^ V26); + if (out_size_w32 > 7) + sph_enc32be(out + 28, V07 ^ V17 ^ V27); +} + +static void +luffa4(sph_luffa384_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE4 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE4(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI4; + P4; + ptr = 0; + } + } + WRITE_STATE4(sc); + sc->ptr = ptr; +} + +static void +luffa4_close(sph_luffa384_context *sc, unsigned ub, unsigned n, void *dst) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE4 + + buf = sc->buf; + ptr = sc->ptr; + out = dst; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE4(sc); + for (i = 0; i < 3; i ++) { + MI4; + P4; + switch (i) { + case 0: + memset(buf, 0, sizeof sc->buf); + break; + case 1: + sph_enc32be(out + 0, V00 ^ V10 ^ V20 ^ V30); + sph_enc32be(out + 4, V01 ^ V11 ^ V21 ^ V31); + sph_enc32be(out + 8, V02 ^ V12 ^ V22 ^ V32); + sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33); + sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34); + sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35); + sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36); + sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37); + break; + case 2: + sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30); + sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31); + sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32); + sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33); + break; + } + } +} + +static void +luffa5(sph_luffa512_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE5 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE5(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI5; + P5; + ptr = 0; + } + } + WRITE_STATE5(sc); + sc->ptr = ptr; +} + +static void +luffa5_close(sph_luffa512_context *sc, unsigned ub, unsigned n, void *dst) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE5 + + buf = sc->buf; + ptr = sc->ptr; + out = dst; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE5(sc); + for (i = 0; i < 3; i ++) { + MI5; + P5; + switch (i) { + case 0: + memset(buf, 0, sizeof sc->buf); + break; + case 1: + sph_enc32be(out + 0, V00 ^ V10 ^ V20 ^ V30 ^ V40); + sph_enc32be(out + 4, V01 ^ V11 ^ V21 ^ V31 ^ V41); + sph_enc32be(out + 8, V02 ^ V12 ^ V22 ^ V32 ^ V42); + sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33 ^ V43); + sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ 
V34 ^ V44); + sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35 ^ V45); + sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36 ^ V46); + sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37 ^ V47); + break; + case 2: + sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30 ^ V40); + sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31 ^ V41); + sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32 ^ V42); + sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33 ^ V43); + sph_enc32be(out + 48, V04 ^ V14 ^ V24 ^ V34 ^ V44); + sph_enc32be(out + 52, V05 ^ V15 ^ V25 ^ V35 ^ V45); + sph_enc32be(out + 56, V06 ^ V16 ^ V26 ^ V36 ^ V46); + sph_enc32be(out + 60, V07 ^ V17 ^ V27 ^ V37 ^ V47); + break; + } + } +} + +/* see sph_luffa.h */ +void +sph_luffa224_init(void *cc) +{ + sph_luffa224_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa224(void *cc, const void *data, size_t len) +{ + luffa3(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa224_close(void *cc, void *dst) +{ + sph_luffa224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa3_close(cc, ub, n, dst, 7); + sph_luffa224_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa256_init(void *cc) +{ + sph_luffa256_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa256(void *cc, const void *data, size_t len) +{ + luffa3(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa256_close(void *cc, void *dst) +{ + sph_luffa256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa3_close(cc, ub, n, dst, 8); + sph_luffa256_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa384_init(void *cc) +{ + sph_luffa384_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa384(void *cc, const void *data, size_t len) +{ + luffa4(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa384_close(void *cc, void *dst) +{ + sph_luffa384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa4_close(cc, ub, n, dst); + sph_luffa384_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa512_init(void *cc) +{ + sph_luffa512_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa512(void *cc, const void *data, size_t len) +{ + luffa5(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa512_close(void *cc, void *dst) +{ + sph_luffa512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa5_close(cc, ub, n, dst); + sph_luffa512_init(cc); +} + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sph/shavite.c b/sph/shavite.c new file mode 100644 index 0000000..85074f3 --- /dev/null +++ b/sph/shavite.c @@ -0,0 +1,1764 @@ +/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SHAvite-3 implementation. 
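+ *
+ * SHAvite-3 is an AES-based SHA-3 second-round candidate: the
+ * compression functions below (c256 for the 224/256-bit output
+ * sizes, c512 for 384/512) are built from keyless AES rounds,
+ * provided by aes_helper.c through the AES_ROUND_NOKEY macro.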
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_shavite.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE +#define SPH_SMALL_FOOTPRINT_SHAVITE 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define C32 SPH_C32 + +/* + * As of round 2 of the SHA-3 competition, the published reference + * implementation and test vectors are wrong, because they use + * big-endian AES tables while the internal decoding uses little-endian. + * The code below follows the specification. To turn it into a code + * which follows the reference implementation (the one called "BugFix" + * on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out + * the code below (from the '#define AES_BIG_ENDIAN...' to the definition + * of the AES_ROUND_NOKEY macro) and replace it with the version which + * is commented out afterwards. 
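+ *
+ * In short: as shipped (AES_BIG_ENDIAN set to 0 below), this
+ * file follows the specification and uses AES_ROUND_NOKEY_LE;
+ * the commented-out alternative switches to AES_ROUND_NOKEY_BE
+ * together with a different set of IV constants, and reproduces
+ * the "BugFix" reference implementation instead.  The two
+ * variants yield different digests for the same input.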
+ */ + +#define AES_BIG_ENDIAN 0 +#include "aes_helper.c" + +static const sph_u32 IV224[] = { + C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371), + C32(0x62B2AEA8), C32(0x4B5801D8), C32(0x1B702860), C32(0x842F3017) +}; + +static const sph_u32 IV256[] = { + C32(0x49BB3E47), C32(0x2674860D), C32(0xA8B392AC), C32(0x021AC4E6), + C32(0x409283CF), C32(0x620E5D86), C32(0x6D929DCB), C32(0x96CC2A8B) +}; + +static const sph_u32 IV384[] = { + C32(0x83DF1545), C32(0xF9AAEC13), C32(0xF4803CB0), C32(0x11FE1F47), + C32(0xDA6CD269), C32(0x4F53FCD7), C32(0x950529A2), C32(0x97908147), + C32(0xB0A4D7AF), C32(0x2B9132BF), C32(0x226E607D), C32(0x3C0F8D7C), + C32(0x487B3F0F), C32(0x04363E22), C32(0x0155C99C), C32(0xEC2E20D3) +}; + +static const sph_u32 IV512[] = { + C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC), + C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC), + C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47), + C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) +}; + +#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ + sph_u32 t0 = (x0); \ + sph_u32 t1 = (x1); \ + sph_u32 t2 = (x2); \ + sph_u32 t3 = (x3); \ + AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \ + } while (0) + +/* + * This is the code needed to match the "reference implementation" as + * published on Nov 23rd, 2009, instead of the published specification. + * + +#define AES_BIG_ENDIAN 1 +#include "aes_helper.c" + +static const sph_u32 IV224[] = { + C32(0xC4C67795), C32(0xC0B1817F), C32(0xEAD88924), C32(0x1ABB1BB0), + C32(0xE0C29152), C32(0xBDE046BA), C32(0xAEEECF99), C32(0x58D509D8) +}; + +static const sph_u32 IV256[] = { + C32(0x3EECF551), C32(0xBF10819B), C32(0xE6DC8559), C32(0xF3E23FD5), + C32(0x431AEC73), C32(0x79E3F731), C32(0x98325F05), C32(0xA92A31F1) +}; + +static const sph_u32 IV384[] = { + C32(0x71F48510), C32(0xA903A8AC), C32(0xFE3216DD), C32(0x0B2D2AD4), + C32(0x6672900A), C32(0x41032819), C32(0x15A7D780), C32(0xB3CAB8D9), + C32(0x34EF4711), C32(0xDE019FE8), C32(0x4D674DC4), C32(0xE056D96B), + C32(0xA35C016B), C32(0xDD903BA7), C32(0x8C1B09B4), C32(0x2C3E9F25) +}; + +static const sph_u32 IV512[] = { + C32(0xD5652B63), C32(0x25F1E6EA), C32(0xB18F48FA), C32(0xA1EE3A47), + C32(0xC8B67B07), C32(0xBDCE48D3), C32(0xE3937B78), C32(0x05DB5186), + C32(0x613BE326), C32(0xA11FA303), C32(0x90C833D4), C32(0x79CEE316), + C32(0x1E1AF00F), C32(0x2829B165), C32(0x23B25F80), C32(0x21E11499) +}; + +#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ + sph_u32 t0 = (x0); \ + sph_u32 t1 = (x1); \ + sph_u32 t2 = (x2); \ + sph_u32 t3 = (x3); \ + AES_ROUND_NOKEY_BE(t0, t1, t2, t3, x0, x1, x2, x3); \ + } while (0) + + */ + +#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \ + sph_u32 kt; \ + AES_ROUND_NOKEY(k1, k2, k3, k0); \ + kt = (k0); \ + (k0) = (k1); \ + (k1) = (k2); \ + (k2) = (k3); \ + (k3) = kt; \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SHAVITE + +/* + * This function assumes that "msg" is aligned for 32-bit access. 
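+ *
+ * In this small-footprint variant the 64-byte message block is
+ * expanded up front into the full rk[144] round-key array; the
+ * block counter is folded in at the fixed positions rk[16]/rk[17],
+ * rk[57]/rk[58], rk[86]/rk[87] and rk[124]/rk[127], with one
+ * word of each pair complemented.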
+ */ +static void +c256(sph_shavite_small_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 rk[144]; + size_t u; + int r, s; + +#if SPH_LITTLE_ENDIAN + memcpy(rk, msg, 64); +#else + for (u = 0; u < 16; u += 4) { + rk[u + 0] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 0); + rk[u + 1] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 4); + rk[u + 2] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 8); + rk[u + 3] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 12); + } +#endif + u = 16; + for (r = 0; r < 4; r ++) { + for (s = 0; s < 2; s ++) { + sph_u32 x0, x1, x2, x3; + + x0 = rk[u - 15]; + x1 = rk[u - 14]; + x2 = rk[u - 13]; + x3 = rk[u - 16]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 16) { + rk[ 16] ^= sc->count0; + rk[ 17] ^= SPH_T32(~sc->count1); + } else if (u == 56) { + rk[ 57] ^= sc->count1; + rk[ 58] ^= SPH_T32(~sc->count0); + } + u += 4; + + x0 = rk[u - 15]; + x1 = rk[u - 14]; + x2 = rk[u - 13]; + x3 = rk[u - 16]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 84) { + rk[ 86] ^= sc->count1; + rk[ 87] ^= SPH_T32(~sc->count0); + } else if (u == 124) { + rk[124] ^= sc->count0; + rk[127] ^= SPH_T32(~sc->count1); + } + u += 4; + } + for (s = 0; s < 4; s ++) { + rk[u + 0] = rk[u - 16] ^ rk[u - 3]; + rk[u + 1] = rk[u - 15] ^ rk[u - 2]; + rk[u + 2] = rk[u - 14] ^ rk[u - 1]; + rk[u + 3] = rk[u - 13] ^ rk[u - 0]; + u += 4; + } + } + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + u = 0; + for (r = 0; r < 6; r ++) { + sph_u32 x0, x1, x2, x3; + + x0 = p4 ^ rk[u ++]; + x1 = p5 ^ rk[u ++]; + x2 = p6 ^ rk[u ++]; + x3 = p7 ^ rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + x0 = p0 ^ rk[u ++]; + x1 = p1 ^ rk[u ++]; + x2 = p2 ^ rk[u ++]; + x3 = p3 ^ rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + } + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; +} + +#else + +/* + * This function assumes that "msg" is aligned for 32-bit access. 
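+ *
+ * Unlike the small-footprint variant above, this version keeps
+ * the sixteen live round-key words in locals (rk0..rkF) and
+ * interleaves the key schedule (KEY_EXPAND_ELT plus XOR chaining)
+ * with the AES rounds, instead of materializing the whole
+ * rk[144] array.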
+ */ +static void +c256(sph_shavite_small_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 x0, x1, x2, x3; + sph_u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7; + sph_u32 rk8, rk9, rkA, rkB, rkC, rkD, rkE, rkF; + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + /* round 0 */ + rk0 = sph_dec32le_aligned((const unsigned char *)msg + 0); + x0 = p4 ^ rk0; + rk1 = sph_dec32le_aligned((const unsigned char *)msg + 4); + x1 = p5 ^ rk1; + rk2 = sph_dec32le_aligned((const unsigned char *)msg + 8); + x2 = p6 ^ rk2; + rk3 = sph_dec32le_aligned((const unsigned char *)msg + 12); + x3 = p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 = sph_dec32le_aligned((const unsigned char *)msg + 16); + x0 ^= rk4; + rk5 = sph_dec32le_aligned((const unsigned char *)msg + 20); + x1 ^= rk5; + rk6 = sph_dec32le_aligned((const unsigned char *)msg + 24); + x2 ^= rk6; + rk7 = sph_dec32le_aligned((const unsigned char *)msg + 28); + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 = sph_dec32le_aligned((const unsigned char *)msg + 32); + x0 ^= rk8; + rk9 = sph_dec32le_aligned((const unsigned char *)msg + 36); + x1 ^= rk9; + rkA = sph_dec32le_aligned((const unsigned char *)msg + 40); + x2 ^= rkA; + rkB = sph_dec32le_aligned((const unsigned char *)msg + 44); + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 1 */ + rkC = sph_dec32le_aligned((const unsigned char *)msg + 48); + x0 = p0 ^ rkC; + rkD = sph_dec32le_aligned((const unsigned char *)msg + 52); + x1 = p1 ^ rkD; + rkE = sph_dec32le_aligned((const unsigned char *)msg + 56); + x2 = p2 ^ rkE; + rkF = sph_dec32le_aligned((const unsigned char *)msg + 60); + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC ^ sc->count0; + rk1 ^= rkD ^ SPH_T32(~sc->count1); + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 2 */ + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 = p4 ^ rk8; + x1 = p5 ^ rk9; + x2 = p6 ^ rkA; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3 */ + rk4 ^= rk1; + x0 = p0 ^ rk4; + rk5 ^= rk2; + x1 = p1 ^ rk5; + rk6 ^= rk3; + x2 = p2 ^ rk6; + rk7 ^= rk4; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 4 */ + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 = p4 ^ rk0; + x1 = p5 ^ rk1; + x2 = p6 ^ rk2; + x3 = 
p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5 ^ sc->count1; + rkA ^= rk6 ^ SPH_T32(~sc->count0); + rkB ^= rk7; + x0 ^= rk8; + x1 ^= rk9; + x2 ^= rkA; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 5 */ + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 = p0 ^ rkC; + x1 = p1 ^ rkD; + x2 = p2 ^ rkE; + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 ^= rk1; + x0 ^= rk4; + rk5 ^= rk2; + x1 ^= rk5; + rk6 ^= rk3; + x2 ^= rk6; + rk7 ^= rk4; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 6 */ + rk8 ^= rk5; + x0 = p4 ^ rk8; + rk9 ^= rk6; + x1 = p5 ^ rk9; + rkA ^= rk7; + x2 = p6 ^ rkA; + rkB ^= rk8; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 7 */ + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2 ^ sc->count1; + rk7 ^= rk3 ^ SPH_T32(~sc->count0); + x0 = p0 ^ rk4; + x1 = p1 ^ rk5; + x2 = p2 ^ rk6; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 ^= rk8; + x1 ^= rk9; + x2 ^= rkA; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 8 */ + rk0 ^= rkD; + x0 = p4 ^ rk0; + rk1 ^= rkE; + x1 = p5 ^ rk1; + rk2 ^= rkF; + x2 = p6 ^ rk2; + rk3 ^= rk0; + x3 = p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 ^= rk1; + x0 ^= rk4; + rk5 ^= rk2; + x1 ^= rk5; + rk6 ^= rk3; + x2 ^= rk6; + rk7 ^= rk4; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 9 */ + rkC ^= rk9; + x0 = p0 ^ rkC; + rkD ^= rkA; + x1 = p1 ^ rkD; + rkE ^= rkB; + x2 = p2 ^ rkE; + rkF ^= rkC; + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 10 */ + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 = p4 ^ rk8; + x1 = p5 ^ rk9; + x2 = p6 ^ rkA; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + 
KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8 ^ sc->count0; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB ^ SPH_T32(~sc->count1); + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 11 */ + rk4 ^= rk1; + x0 = p0 ^ rk4; + rk5 ^= rk2; + x1 = p1 ^ rk5; + rk6 ^= rk3; + x2 = p2 ^ rk6; + rk7 ^= rk4; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; +} + +#endif + +#if SPH_SMALL_FOOTPRINT_SHAVITE + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c512(sph_shavite_big_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 p8, p9, pA, pB, pC, pD, pE, pF; + sph_u32 rk[448]; + size_t u; + int r, s; + +#if SPH_LITTLE_ENDIAN + memcpy(rk, msg, 128); +#else + for (u = 0; u < 32; u += 4) { + rk[u + 0] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 0); + rk[u + 1] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 4); + rk[u + 2] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 8); + rk[u + 3] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 12); + } +#endif + u = 32; + for (;;) { + for (s = 0; s < 4; s ++) { + sph_u32 x0, x1, x2, x3; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 32) { + rk[ 32] ^= sc->count0; + rk[ 33] ^= sc->count1; + rk[ 34] ^= sc->count2; + rk[ 35] ^= SPH_T32(~sc->count3); + } else if (u == 440) { + rk[440] ^= sc->count1; + rk[441] ^= sc->count0; + rk[442] ^= sc->count3; + rk[443] ^= SPH_T32(~sc->count2); + } + u += 4; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 164) { + rk[164] ^= sc->count3; + rk[165] ^= sc->count2; + rk[166] ^= sc->count1; + rk[167] ^= SPH_T32(~sc->count0); + } else if (u == 316) { + rk[316] ^= sc->count2; + rk[317] ^= sc->count3; + rk[318] ^= sc->count0; + rk[319] ^= SPH_T32(~sc->count1); + } + u += 4; + } + if (u == 448) + break; + for (s = 0; s < 8; s ++) { + rk[u + 0] = rk[u - 32] ^ rk[u - 7]; + rk[u + 1] = rk[u - 31] ^ rk[u - 6]; + rk[u + 2] = rk[u - 30] ^ rk[u - 5]; + rk[u + 3] = rk[u - 29] ^ rk[u - 4]; + u += 4; + } + } + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + p8 = sc->h[0x8]; + p9 = sc->h[0x9]; + pA = sc->h[0xA]; + pB = sc->h[0xB]; + pC = sc->h[0xC]; + pD = sc->h[0xD]; + pE = sc->h[0xE]; + pF = sc->h[0xF]; + u = 0; + for (r = 0; r < 14; r ++) { +#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \ + 
sph_u32 x0, x1, x2, x3; \ + x0 = r0 ^ rk[u ++]; \ + x1 = r1 ^ rk[u ++]; \ + x2 = r2 ^ rk[u ++]; \ + x3 = r3 ^ rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + l0 ^= x0; \ + l1 ^= x1; \ + l2 ^= x2; \ + l3 ^= x3; \ + } while (0) + +#define WROT(a, b, c, d) do { \ + sph_u32 t = d; \ + d = c; \ + c = b; \ + b = a; \ + a = t; \ + } while (0) + + C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7); + C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF); + + WROT(p0, p4, p8, pC); + WROT(p1, p5, p9, pD); + WROT(p2, p6, pA, pE); + WROT(p3, p7, pB, pF); + +#undef C512_ELT +#undef WROT + } + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; + sc->h[0x8] ^= p8; + sc->h[0x9] ^= p9; + sc->h[0xA] ^= pA; + sc->h[0xB] ^= pB; + sc->h[0xC] ^= pC; + sc->h[0xD] ^= pD; + sc->h[0xE] ^= pE; + sc->h[0xF] ^= pF; +} + +#else + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c512(sph_shavite_big_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 p8, p9, pA, pB, pC, pD, pE, pF; + sph_u32 x0, x1, x2, x3; + sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + int r; + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + p8 = sc->h[0x8]; + p9 = sc->h[0x9]; + pA = sc->h[0xA]; + pB = sc->h[0xB]; + pC = sc->h[0xC]; + pD = sc->h[0xD]; + pE = sc->h[0xE]; + pF = sc->h[0xF]; + /* round 0 */ + rk00 = sph_dec32le_aligned((const unsigned char *)msg + 0); + x0 = p4 ^ rk00; + rk01 = sph_dec32le_aligned((const unsigned char *)msg + 4); + x1 = p5 ^ rk01; + rk02 = sph_dec32le_aligned((const unsigned char *)msg + 8); + x2 = p6 ^ rk02; + rk03 = sph_dec32le_aligned((const unsigned char *)msg + 12); + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 = sph_dec32le_aligned((const unsigned char *)msg + 16); + x0 ^= rk04; + rk05 = sph_dec32le_aligned((const unsigned char *)msg + 20); + x1 ^= rk05; + rk06 = sph_dec32le_aligned((const unsigned char *)msg + 24); + x2 ^= rk06; + rk07 = sph_dec32le_aligned((const unsigned char *)msg + 28); + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 = sph_dec32le_aligned((const unsigned char *)msg + 32); + x0 ^= rk08; + rk09 = sph_dec32le_aligned((const unsigned char *)msg + 36); + x1 ^= rk09; + rk0A = sph_dec32le_aligned((const unsigned char *)msg + 40); + x2 ^= rk0A; + rk0B = sph_dec32le_aligned((const unsigned char *)msg + 44); + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C = sph_dec32le_aligned((const unsigned char *)msg + 48); + x0 ^= rk0C; + rk0D = sph_dec32le_aligned((const unsigned char *)msg + 52); + x1 ^= rk0D; + rk0E = sph_dec32le_aligned((const unsigned char *)msg + 56); + x2 ^= rk0E; + rk0F = sph_dec32le_aligned((const unsigned char *)msg + 60); + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 = sph_dec32le_aligned((const unsigned char 
*)msg + 64); + x0 = pC ^ rk10; + rk11 = sph_dec32le_aligned((const unsigned char *)msg + 68); + x1 = pD ^ rk11; + rk12 = sph_dec32le_aligned((const unsigned char *)msg + 72); + x2 = pE ^ rk12; + rk13 = sph_dec32le_aligned((const unsigned char *)msg + 76); + x3 = pF ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 = sph_dec32le_aligned((const unsigned char *)msg + 80); + x0 ^= rk14; + rk15 = sph_dec32le_aligned((const unsigned char *)msg + 84); + x1 ^= rk15; + rk16 = sph_dec32le_aligned((const unsigned char *)msg + 88); + x2 ^= rk16; + rk17 = sph_dec32le_aligned((const unsigned char *)msg + 92); + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 = sph_dec32le_aligned((const unsigned char *)msg + 96); + x0 ^= rk18; + rk19 = sph_dec32le_aligned((const unsigned char *)msg + 100); + x1 ^= rk19; + rk1A = sph_dec32le_aligned((const unsigned char *)msg + 104); + x2 ^= rk1A; + rk1B = sph_dec32le_aligned((const unsigned char *)msg + 108); + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C = sph_dec32le_aligned((const unsigned char *)msg + 112); + x0 ^= rk1C; + rk1D = sph_dec32le_aligned((const unsigned char *)msg + 116); + x1 ^= rk1D; + rk1E = sph_dec32le_aligned((const unsigned char *)msg + 120); + x2 ^= rk1E; + rk1F = sph_dec32le_aligned((const unsigned char *)msg + 124); + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + for (r = 0; r < 3; r ++) { + /* round 1, 5, 9 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + if (r == 0) { + rk00 ^= sc->count0; + rk01 ^= sc->count1; + rk02 ^= sc->count2; + rk03 ^= SPH_T32(~sc->count3); + } + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + if (r == 1) { + rk04 ^= sc->count3; + rk05 ^= sc->count2; + rk06 ^= sc->count1; + rk07 ^= SPH_T32(~sc->count0); + } + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + if (r == 2) { + rk1C ^= sc->count2; + rk1D ^= sc->count3; + rk1E ^= sc->count0; + rk1F ^= SPH_T32(~sc->count1); + } + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 2, 6, 10 */ + rk00 ^= rk19; + x0 = pC ^ 
rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + 
AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + } + /* round 13 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14 ^ sc->count1; + rk19 ^= rk15 ^ sc->count0; + rk1A ^= rk16 ^ sc->count3; + rk1B ^= rk17 ^ SPH_T32(~sc->count2); + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + sc->h[0x0] ^= p8; + sc->h[0x1] ^= p9; + sc->h[0x2] ^= pA; + sc->h[0x3] ^= pB; + sc->h[0x4] ^= pC; + sc->h[0x5] ^= pD; + sc->h[0x6] ^= pE; + sc->h[0x7] ^= pF; + sc->h[0x8] ^= p0; + sc->h[0x9] ^= p1; + sc->h[0xA] ^= p2; + sc->h[0xB] ^= p3; + sc->h[0xC] ^= p4; + sc->h[0xD] ^= p5; + sc->h[0xE] ^= p6; + sc->h[0xF] ^= p7; +} + +#endif + +static void +shavite_small_init(sph_shavite_small_context *sc, const sph_u32 *iv) +{ + memcpy(sc->h, iv, sizeof sc->h); + sc->ptr = 0; + sc->count0 = 0; + sc->count1 = 0; +} + +static void +shavite_small_core(sph_shavite_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char 
*)data + clen; + ptr += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((sc->count0 = SPH_T32(sc->count0 + 512)) == 0) + sc->count1 = SPH_T32(sc->count1 + 1); + c256(sc, buf); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +shavite_small_close(sph_shavite_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char *buf; + size_t ptr, u; + unsigned z; + sph_u32 count0, count1; + + buf = sc->buf; + ptr = sc->ptr; + count0 = (sc->count0 += (ptr << 3) + n); + count1 = sc->count1; + z = 0x80 >> n; + z = ((ub & -z) | z) & 0xFF; + if (ptr == 0 && n == 0) { + buf[0] = 0x80; + memset(buf + 1, 0, 53); + sc->count0 = sc->count1 = 0; + } else if (ptr < 54) { + buf[ptr ++] = z; + memset(buf + ptr, 0, 54 - ptr); + } else { + buf[ptr ++] = z; + memset(buf + ptr, 0, 64 - ptr); + c256(sc, buf); + memset(buf, 0, 54); + sc->count0 = sc->count1 = 0; + } + sph_enc32le(buf + 54, count0); + sph_enc32le(buf + 58, count1); + buf[62] = out_size_w32 << 5; + buf[63] = out_size_w32 >> 3; + c256(sc, buf); + for (u = 0; u < out_size_w32; u ++) + sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]); +} + +static void +shavite_big_init(sph_shavite_big_context *sc, const sph_u32 *iv) +{ + memcpy(sc->h, iv, sizeof sc->h); + sc->ptr = 0; + sc->count0 = 0; + sc->count1 = 0; + sc->count2 = 0; + sc->count3 = 0; +} + +static void +shavite_big_core(sph_shavite_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) { + sc->count1 = SPH_T32(sc->count1 + 1); + if (sc->count1 == 0) { + sc->count2 = SPH_T32(sc->count2 + 1); + if (sc->count2 == 0) { + sc->count3 = SPH_T32( + sc->count3 + 1); + } + } + } + c512(sc, buf); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +shavite_big_close(sph_shavite_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char *buf; + size_t ptr, u; + unsigned z; + sph_u32 count0, count1, count2, count3; + + buf = sc->buf; + ptr = sc->ptr; + count0 = (sc->count0 += (ptr << 3) + n); + count1 = sc->count1; + count2 = sc->count2; + count3 = sc->count3; + z = 0x80 >> n; + z = ((ub & -z) | z) & 0xFF; + if (ptr == 0 && n == 0) { + buf[0] = 0x80; + memset(buf + 1, 0, 109); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } else if (ptr < 110) { + buf[ptr ++] = z; + memset(buf + ptr, 0, 110 - ptr); + } else { + buf[ptr ++] = z; + memset(buf + ptr, 0, 128 - ptr); + c512(sc, buf); + memset(buf, 0, 110); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } + sph_enc32le(buf + 110, count0); + sph_enc32le(buf + 114, count1); + sph_enc32le(buf + 118, count2); + sph_enc32le(buf + 122, count3); + buf[126] = out_size_w32 << 5; + buf[127] = out_size_w32 >> 3; + c512(sc, buf); + for (u = 0; u < out_size_w32; u ++) + sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]); +} + +/* see sph_shavite.h */ +void +sph_shavite224_init(void *cc) +{ + shavite_small_init(cc, IV224); +} + +/* see sph_shavite.h */ +void +sph_shavite224(void *cc, const void *data, size_t len) +{ + shavite_small_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite224_close(void *cc, void *dst) +{ + shavite_small_close(cc, 0, 0, dst, 7); + shavite_small_init(cc, IV224); +} 
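For reference, a minimal sketch of how the streaming wrappers in this file are typically driven — an editorial example, not part of the patch. It assumes the sph_shavite256_context type declared in sph_shavite.h; msg and digest are hypothetical buffers. Because each *_close() re-initializes the context (the shavite_small_init/shavite_big_init calls visible above), the same context can absorb the next message without an explicit re-init.

/*
 * Minimal usage sketch for the sph_shavite256 API (hypothetical caller).
 */
#include <stdio.h>
#include <string.h>
#include "sph_shavite.h"

static void
shavite256_example(void)
{
	sph_shavite256_context cc;       /* declared in sph_shavite.h */
	unsigned char digest[32];        /* 256-bit output */
	const char *msg = "abc";         /* hypothetical input */
	size_t i;

	sph_shavite256_init(&cc);
	sph_shavite256(&cc, msg, strlen(msg));  /* may be called repeatedly */
	sph_shavite256_close(&cc, digest);      /* pads, finalizes, re-inits */

	for (i = 0; i < sizeof digest; i ++)
		printf("%02x", digest[i]);
	putchar('\n');
}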
+ +/* see sph_shavite.h */ +void +sph_shavite224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_small_close(cc, ub, n, dst, 7); + shavite_small_init(cc, IV224); +} + +/* see sph_shavite.h */ +void +sph_shavite256_init(void *cc) +{ + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite256(void *cc, const void *data, size_t len) +{ + shavite_small_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite256_close(void *cc, void *dst) +{ + shavite_small_close(cc, 0, 0, dst, 8); + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_small_close(cc, ub, n, dst, 8); + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite384_init(void *cc) +{ + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite384(void *cc, const void *data, size_t len) +{ + shavite_big_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite384_close(void *cc, void *dst) +{ + shavite_big_close(cc, 0, 0, dst, 12); + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_big_close(cc, ub, n, dst, 12); + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite512_init(void *cc) +{ + shavite_big_init(cc, IV512); +} + +/* see sph_shavite.h */ +void +sph_shavite512(void *cc, const void *data, size_t len) +{ + shavite_big_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite512_close(void *cc, void *dst) +{ + shavite_big_close(cc, 0, 0, dst, 16); + shavite_big_init(cc, IV512); +} + +/* see sph_shavite.h */ +void +sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_big_close(cc, ub, n, dst, 16); + shavite_big_init(cc, IV512); +} + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sph/simd.c b/sph/simd.c new file mode 100644 index 0000000..2c80626 --- /dev/null +++ b/sph/simd.c @@ -0,0 +1,1799 @@ +/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SIMD implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_simd.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
+#define SPH_SMALL_FOOTPRINT_SIMD   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+typedef sph_u32 u32;
+typedef sph_s32 s32;
+#define C32     SPH_C32
+#define T32     SPH_T32
+#define ROL32   SPH_ROTL32
+
+#define XCAT(x, y)     XCAT_(x, y)
+#define XCAT_(x, y)    x ## y
+
+/*
+ * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
+ */
+static const s32 alpha_tab[] = {
+	  1,  41, 139,  45,  46,  87, 226,  14,  60, 147, 116, 130,
+	190,  80, 196,  69,   2,  82,  21,  90,  92, 174, 195,  28,
+	120,  37, 232,   3, 123, 160, 135, 138,   4, 164,  42, 180,
+	184,  91, 133,  56, 240,  74, 207,   6, 246,  63,  13,  19,
+	  8,  71,  84, 103, 111, 182,   9, 112, 223, 148, 157,  12,
+	235, 126,  26,  38,  16, 142, 168, 206, 222, 107,  18, 224,
+	189,  39,  57,  24, 213, 252,  52,  76,  32,  27,  79, 155,
+	187, 214,  36, 191, 121,  78, 114,  48, 169, 247, 104, 152,
+	 64,  54, 158,  53, 117, 171,  72, 125, 242, 156, 228,  96,
+	 81, 237, 208,  47, 128, 108,  59, 106, 234,  85, 144, 250,
+	227,  55, 199, 192, 162, 217, 159,  94, 256, 216, 118, 212,
+	211, 170,  31, 243, 197, 110, 141, 127,  67, 177,  61, 188,
+	255, 175, 236, 167, 165,  83,  62, 229, 137, 220,  25, 254,
+	134,  97, 122, 119, 253,  93, 215,  77,  73, 166, 124, 201,
+	 17, 183,  50, 251,  11, 194, 244, 238, 249, 186, 173, 154,
+	146,  75, 248, 145,  34, 109, 100, 245,  22, 131, 231, 219,
+	241, 115,  89,  51,  35, 150, 239,  33,  68, 218, 200, 233,
+	 44,   5, 205, 181, 225, 230, 178, 102,  70,  43, 221,  66,
+	136, 179, 143, 209,  88,  10, 153, 105, 193, 203,  99, 204,
+	140,  86, 185, 132,  15, 101,  29, 161, 176,  20,  49, 210,
+	129, 149, 198, 151,  23, 172, 113,   7,  30, 202,  58,  65,
+	 95,  40,  98, 163
+};
+
+/*
+ * Ranges:
+ *   REDS1: from -32768..98302 to -383..383
+ *   REDS2: from -2^31..2^31-1 to -32768..98302
+ */
+#define REDS1(x)    (((x) & 0xFF) - ((x) >> 8))
+#define REDS2(x)    (((x) & 0xFFFF) + ((x) >> 16))
+
+/*
+ * If, upon entry, the values of q[] are all in the -N..N range (where
+ * N >= 98302) then the new values of q[] are in the -2N..2N range.
+ *
+ * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
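+ * (Indeed, with N = 8388608 = 2^23 and alpha_tab[v] <= 256 = 2^8, the
+ * products n * alpha_tab[v] stay within the -2^31..2^31-1 input range
+ * of REDS2.)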
+ */ +#define FFT_LOOP(rb, hk, as, id) do { \ + size_t u, v; \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + (hk)]; \ + q[(rb)] = m + n; \ + q[(rb) + (hk)] = m - n; \ + u = v = 0; \ + goto id; \ + for (; u < (hk); u += 4, v += 4 * (as)) { \ + s32 t; \ + m = q[(rb) + u + 0]; \ + n = q[(rb) + u + 0 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 0 * (as)]); \ + q[(rb) + u + 0] = m + t; \ + q[(rb) + u + 0 + (hk)] = m - t; \ + id: \ + m = q[(rb) + u + 1]; \ + n = q[(rb) + u + 1 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 1 * (as)]); \ + q[(rb) + u + 1] = m + t; \ + q[(rb) + u + 1 + (hk)] = m - t; \ + m = q[(rb) + u + 2]; \ + n = q[(rb) + u + 2 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 2 * (as)]); \ + q[(rb) + u + 2] = m + t; \ + q[(rb) + u + 2 + (hk)] = m - t; \ + m = q[(rb) + u + 3]; \ + n = q[(rb) + u + 3 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 3 * (as)]); \ + q[(rb) + u + 3] = m + t; \ + q[(rb) + u + 3 + (hk)] = m - t; \ + } \ + } while (0) + +/* + * Output ranges: + * d0: min= 0 max= 1020 + * d1: min= -67 max= 4587 + * d2: min=-4335 max= 4335 + * d3: min=-4147 max= 507 + * d4: min= -510 max= 510 + * d5: min= -252 max= 4402 + * d6: min=-4335 max= 4335 + * d7: min=-4332 max= 322 + */ +#define FFT8(xb, xs, d) do { \ + s32 x0 = x[(xb)]; \ + s32 x1 = x[(xb) + (xs)]; \ + s32 x2 = x[(xb) + 2 * (xs)]; \ + s32 x3 = x[(xb) + 3 * (xs)]; \ + s32 a0 = x0 + x2; \ + s32 a1 = x0 + (x2 << 4); \ + s32 a2 = x0 - x2; \ + s32 a3 = x0 - (x2 << 4); \ + s32 b0 = x1 + x3; \ + s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \ + s32 b2 = (x1 << 4) - (x3 << 4); \ + s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \ + d ## 0 = a0 + b0; \ + d ## 1 = a1 + b1; \ + d ## 2 = a2 + b2; \ + d ## 3 = a3 + b3; \ + d ## 4 = a0 - b0; \ + d ## 5 = a1 - b1; \ + d ## 6 = a2 - b2; \ + d ## 7 = a3 - b3; \ + } while (0) + +/* + * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced + * to some shifting. 
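+ * (2^16 = 65536 = 255 * 257 + 1, so 2 is a 16th root of unity modulo
+ * 257, and multiplying by alpha^i becomes a left shift by i bits.)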
+ * + * Output: within -591471..591723 + */ +#define FFT16(xb, xs, rb) do { \ + s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \ + s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \ + FFT8(xb, (xs) << 1, d1_); \ + FFT8((xb) + (xs), (xs) << 1, d2_); \ + q[(rb) + 0] = d1_0 + d2_0; \ + q[(rb) + 1] = d1_1 + (d2_1 << 1); \ + q[(rb) + 2] = d1_2 + (d2_2 << 2); \ + q[(rb) + 3] = d1_3 + (d2_3 << 3); \ + q[(rb) + 4] = d1_4 + (d2_4 << 4); \ + q[(rb) + 5] = d1_5 + (d2_5 << 5); \ + q[(rb) + 6] = d1_6 + (d2_6 << 6); \ + q[(rb) + 7] = d1_7 + (d2_7 << 7); \ + q[(rb) + 8] = d1_0 - d2_0; \ + q[(rb) + 9] = d1_1 - (d2_1 << 1); \ + q[(rb) + 10] = d1_2 - (d2_2 << 2); \ + q[(rb) + 11] = d1_3 - (d2_3 << 3); \ + q[(rb) + 12] = d1_4 - (d2_4 << 4); \ + q[(rb) + 13] = d1_5 - (d2_5 << 5); \ + q[(rb) + 14] = d1_6 - (d2_6 << 6); \ + q[(rb) + 15] = d1_7 - (d2_7 << 7); \ + } while (0) + +/* + * Output range: |q| <= 1183446 + */ +#define FFT32(xb, xs, rb, id) do { \ + FFT16(xb, (xs) << 1, rb); \ + FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \ + FFT_LOOP(rb, 16, 8, id); \ + } while (0) + +/* + * Output range: |q| <= 2366892 + */ +#define FFT64(xb, xs, rb, id) do { \ + FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \ + FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \ + FFT_LOOP(rb, 32, 4, id); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SIMD + +static void +fft32(unsigned char *x, size_t xs, s32 *q) +{ + size_t xd; + + xd = xs << 1; + FFT16(0, xd, 0); + FFT16(xs, xd, 16); + FFT_LOOP(0, 16, 8, label_); +} + +#define FFT128(xb, xs, rb, id) do { \ + fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \ + fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \ + FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \ + fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \ + fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \ + FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \ + FFT_LOOP(rb, 64, 2, XCAT(id, a)); \ + } while (0) + +#else + +/* + * Output range: |q| <= 4733784 + */ +#define FFT128(xb, xs, rb, id) do { \ + FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \ + FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \ + FFT_LOOP(rb, 64, 2, id); \ + } while (0) + +#endif + +/* + * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression + * function which does not fit in the 32 kB L1 cache of a typical x86 + * Intel. We therefore add a function call layer at the FFT64 level. 
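+ * (The SPH_SMALL_FOOTPRINT_SIMD build above makes the same trade one
+ * level lower, wrapping the FFT32 step in the fft32() function.)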
+ */ + +static void +fft64(unsigned char *x, size_t xs, s32 *q) +{ + size_t xd; + + xd = xs << 1; + FFT32(0, xd, 0, label_a); + FFT32(xs, xd, 32, label_b); + FFT_LOOP(0, 32, 4, label_); +} + +/* + * Output range: |q| <= 9467568 + */ +#define FFT256(xb, xs, rb, id) do { \ + fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \ + fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 64]); \ + FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \ + fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \ + fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \ + FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \ + FFT_LOOP(rb, 128, 1, XCAT(id, a)); \ + } while (0) + +/* + * alpha^(127*i) mod 257 + */ +static const unsigned short yoff_s_n[] = { + 1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29, + 15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178, + 225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100, + 34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215, + 253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141, + 197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59, + 128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114, + 121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168, + 16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207, + 240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21, + 2, 196, 190, 116, 60, 226, 46, 139 +}; + +/* + * alpha^(127*i) + alpha^(125*i) mod 257 + */ +static const unsigned short yoff_s_f[] = { + 2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3, + 49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65, + 96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113, + 17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143, + 189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6, + 77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95, + 160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53, + 181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150, + 0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109, + 210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30, + 10, 146, 117, 251, 180, 247, 236, 108 +}; + +/* + * beta^(255*i) mod 257 + */ +static const unsigned short yoff_b_n[] = { + 1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172, + 23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101, + 15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10, + 88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230, + 225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150, + 35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109, + 34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194, + 11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93, + 253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83, + 165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110, + 197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217, + 162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108, + 128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171, + 117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78, + 121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252, + 213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142, + 16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182, + 111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74, + 240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160, + 123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82, + 2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87, + 46, 45, 139, 41 +}; + +/* + * beta^(255*i) + beta^(253*i) mod 257 + */ +static const unsigned short yoff_b_f[] = { + 2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20, + 111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89, + 
49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239, + 253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79, + 96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226, + 248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115, + 17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208, + 57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40, + 189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45, + 187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107, + 77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210, + 139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6, + 160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190, + 106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208, + 181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127, + 96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193, + 0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44, + 245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9, + 210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94, + 53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185, + 10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156, + 236, 192, 108, 86 +}; + +#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \ + + ((u32)((h) * (mm)) << 16)) + +#define W_SMALL(sb, o1, o2, mm) \ + (INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm) + +#define WS_0_0 W_SMALL( 4, 0, 1, 185) +#define WS_0_1 W_SMALL( 6, 0, 1, 185) +#define WS_0_2 W_SMALL( 0, 0, 1, 185) +#define WS_0_3 W_SMALL( 2, 0, 1, 185) +#define WS_0_4 W_SMALL( 7, 0, 1, 185) +#define WS_0_5 W_SMALL( 5, 0, 1, 185) +#define WS_0_6 W_SMALL( 3, 0, 1, 185) +#define WS_0_7 W_SMALL( 1, 0, 1, 185) +#define WS_1_0 W_SMALL(15, 0, 1, 185) +#define WS_1_1 W_SMALL(11, 0, 1, 185) +#define WS_1_2 W_SMALL(12, 0, 1, 185) +#define WS_1_3 W_SMALL( 8, 0, 1, 185) +#define WS_1_4 W_SMALL( 9, 0, 1, 185) +#define WS_1_5 W_SMALL(13, 0, 1, 185) +#define WS_1_6 W_SMALL(10, 0, 1, 185) +#define WS_1_7 W_SMALL(14, 0, 1, 185) +#define WS_2_0 W_SMALL(17, -128, -64, 233) +#define WS_2_1 W_SMALL(18, -128, -64, 233) +#define WS_2_2 W_SMALL(23, -128, -64, 233) +#define WS_2_3 W_SMALL(20, -128, -64, 233) +#define WS_2_4 W_SMALL(22, -128, -64, 233) +#define WS_2_5 W_SMALL(21, -128, -64, 233) +#define WS_2_6 W_SMALL(16, -128, -64, 233) +#define WS_2_7 W_SMALL(19, -128, -64, 233) +#define WS_3_0 W_SMALL(30, -191, -127, 233) +#define WS_3_1 W_SMALL(24, -191, -127, 233) +#define WS_3_2 W_SMALL(25, -191, -127, 233) +#define WS_3_3 W_SMALL(31, -191, -127, 233) +#define WS_3_4 W_SMALL(27, -191, -127, 233) +#define WS_3_5 W_SMALL(29, -191, -127, 233) +#define WS_3_6 W_SMALL(28, -191, -127, 233) +#define WS_3_7 W_SMALL(26, -191, -127, 233) + +#define W_BIG(sb, o1, o2, mm) \ + (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm) + +#define WB_0_0 W_BIG( 4, 0, 1, 185) +#define WB_0_1 W_BIG( 6, 0, 1, 185) +#define WB_0_2 W_BIG( 0, 0, 1, 185) +#define WB_0_3 
W_BIG( 2, 0, 1, 185) +#define WB_0_4 W_BIG( 7, 0, 1, 185) +#define WB_0_5 W_BIG( 5, 0, 1, 185) +#define WB_0_6 W_BIG( 3, 0, 1, 185) +#define WB_0_7 W_BIG( 1, 0, 1, 185) +#define WB_1_0 W_BIG(15, 0, 1, 185) +#define WB_1_1 W_BIG(11, 0, 1, 185) +#define WB_1_2 W_BIG(12, 0, 1, 185) +#define WB_1_3 W_BIG( 8, 0, 1, 185) +#define WB_1_4 W_BIG( 9, 0, 1, 185) +#define WB_1_5 W_BIG(13, 0, 1, 185) +#define WB_1_6 W_BIG(10, 0, 1, 185) +#define WB_1_7 W_BIG(14, 0, 1, 185) +#define WB_2_0 W_BIG(17, -256, -128, 233) +#define WB_2_1 W_BIG(18, -256, -128, 233) +#define WB_2_2 W_BIG(23, -256, -128, 233) +#define WB_2_3 W_BIG(20, -256, -128, 233) +#define WB_2_4 W_BIG(22, -256, -128, 233) +#define WB_2_5 W_BIG(21, -256, -128, 233) +#define WB_2_6 W_BIG(16, -256, -128, 233) +#define WB_2_7 W_BIG(19, -256, -128, 233) +#define WB_3_0 W_BIG(30, -383, -255, 233) +#define WB_3_1 W_BIG(24, -383, -255, 233) +#define WB_3_2 W_BIG(25, -383, -255, 233) +#define WB_3_3 W_BIG(31, -383, -255, 233) +#define WB_3_4 W_BIG(27, -383, -255, 233) +#define WB_3_5 W_BIG(29, -383, -255, 233) +#define WB_3_6 W_BIG(28, -383, -255, 233) +#define WB_3_7 W_BIG(26, -383, -255, 233) + +#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) + +#define PP4_0_0 1 +#define PP4_0_1 0 +#define PP4_0_2 3 +#define PP4_0_3 2 +#define PP4_1_0 2 +#define PP4_1_1 3 +#define PP4_1_2 0 +#define PP4_1_3 1 +#define PP4_2_0 3 +#define PP4_2_1 2 +#define PP4_2_2 1 +#define PP4_2_3 0 + +#define PP8_0_0 1 +#define PP8_0_1 0 +#define PP8_0_2 3 +#define PP8_0_3 2 +#define PP8_0_4 5 +#define PP8_0_5 4 +#define PP8_0_6 7 +#define PP8_0_7 6 + +#define PP8_1_0 6 +#define PP8_1_1 7 +#define PP8_1_2 4 +#define PP8_1_3 5 +#define PP8_1_4 2 +#define PP8_1_5 3 +#define PP8_1_6 0 +#define PP8_1_7 1 + +#define PP8_2_0 2 +#define PP8_2_1 3 +#define PP8_2_2 0 +#define PP8_2_3 1 +#define PP8_2_4 6 +#define PP8_2_5 7 +#define PP8_2_6 4 +#define PP8_2_7 5 + +#define PP8_3_0 3 +#define PP8_3_1 2 +#define PP8_3_2 1 +#define PP8_3_3 0 +#define PP8_3_4 7 +#define PP8_3_5 6 +#define PP8_3_6 5 +#define PP8_3_7 4 + +#define PP8_4_0 5 +#define PP8_4_1 4 +#define PP8_4_2 7 +#define PP8_4_3 6 +#define PP8_4_4 1 +#define PP8_4_5 0 +#define PP8_4_6 3 +#define PP8_4_7 2 + +#define PP8_5_0 7 +#define PP8_5_1 6 +#define PP8_5_2 5 +#define PP8_5_3 4 +#define PP8_5_4 3 +#define PP8_5_5 2 +#define PP8_5_6 1 +#define PP8_5_7 0 + +#define PP8_6_0 4 +#define PP8_6_1 5 +#define PP8_6_2 6 +#define PP8_6_3 7 +#define PP8_6_4 0 +#define PP8_6_5 1 +#define PP8_6_6 2 +#define PP8_6_7 3 + +#if SPH_SIMD_NOCOPY + +#define DECL_STATE_SMALL +#define READ_STATE_SMALL(sc) +#define WRITE_STATE_SMALL(sc) +#define DECL_STATE_BIG +#define READ_STATE_BIG(sc) +#define WRITE_STATE_BIG(sc) + +#else + +#define DECL_STATE_SMALL \ + u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3; + +#define READ_STATE_SMALL(sc) do { \ + A0 = (sc)->state[ 0]; \ + A1 = (sc)->state[ 1]; \ + A2 = (sc)->state[ 2]; \ + A3 = (sc)->state[ 3]; \ + B0 = (sc)->state[ 4]; \ + B1 = (sc)->state[ 5]; \ + B2 = (sc)->state[ 6]; \ + B3 = (sc)->state[ 7]; \ + C0 = (sc)->state[ 8]; \ + C1 = (sc)->state[ 9]; \ + C2 = (sc)->state[10]; \ + C3 = (sc)->state[11]; \ + D0 = (sc)->state[12]; \ + D1 = (sc)->state[13]; \ + D2 = (sc)->state[14]; \ + D3 = (sc)->state[15]; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + (sc)->state[ 0] = A0; \ + (sc)->state[ 1] = A1; \ + (sc)->state[ 2] = A2; \ + (sc)->state[ 3] = A3; \ + (sc)->state[ 4] = B0; \ + (sc)->state[ 5] = B1; \ + (sc)->state[ 6] 
= B2; \ + (sc)->state[ 7] = B3; \ + (sc)->state[ 8] = C0; \ + (sc)->state[ 9] = C1; \ + (sc)->state[10] = C2; \ + (sc)->state[11] = C3; \ + (sc)->state[12] = D0; \ + (sc)->state[13] = D1; \ + (sc)->state[14] = D2; \ + (sc)->state[15] = D3; \ + } while (0) + +#define DECL_STATE_BIG \ + u32 A0, A1, A2, A3, A4, A5, A6, A7; \ + u32 B0, B1, B2, B3, B4, B5, B6, B7; \ + u32 C0, C1, C2, C3, C4, C5, C6, C7; \ + u32 D0, D1, D2, D3, D4, D5, D6, D7; + +#define READ_STATE_BIG(sc) do { \ + A0 = (sc)->state[ 0]; \ + A1 = (sc)->state[ 1]; \ + A2 = (sc)->state[ 2]; \ + A3 = (sc)->state[ 3]; \ + A4 = (sc)->state[ 4]; \ + A5 = (sc)->state[ 5]; \ + A6 = (sc)->state[ 6]; \ + A7 = (sc)->state[ 7]; \ + B0 = (sc)->state[ 8]; \ + B1 = (sc)->state[ 9]; \ + B2 = (sc)->state[10]; \ + B3 = (sc)->state[11]; \ + B4 = (sc)->state[12]; \ + B5 = (sc)->state[13]; \ + B6 = (sc)->state[14]; \ + B7 = (sc)->state[15]; \ + C0 = (sc)->state[16]; \ + C1 = (sc)->state[17]; \ + C2 = (sc)->state[18]; \ + C3 = (sc)->state[19]; \ + C4 = (sc)->state[20]; \ + C5 = (sc)->state[21]; \ + C6 = (sc)->state[22]; \ + C7 = (sc)->state[23]; \ + D0 = (sc)->state[24]; \ + D1 = (sc)->state[25]; \ + D2 = (sc)->state[26]; \ + D3 = (sc)->state[27]; \ + D4 = (sc)->state[28]; \ + D5 = (sc)->state[29]; \ + D6 = (sc)->state[30]; \ + D7 = (sc)->state[31]; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->state[ 0] = A0; \ + (sc)->state[ 1] = A1; \ + (sc)->state[ 2] = A2; \ + (sc)->state[ 3] = A3; \ + (sc)->state[ 4] = A4; \ + (sc)->state[ 5] = A5; \ + (sc)->state[ 6] = A6; \ + (sc)->state[ 7] = A7; \ + (sc)->state[ 8] = B0; \ + (sc)->state[ 9] = B1; \ + (sc)->state[10] = B2; \ + (sc)->state[11] = B3; \ + (sc)->state[12] = B4; \ + (sc)->state[13] = B5; \ + (sc)->state[14] = B6; \ + (sc)->state[15] = B7; \ + (sc)->state[16] = C0; \ + (sc)->state[17] = C1; \ + (sc)->state[18] = C2; \ + (sc)->state[19] = C3; \ + (sc)->state[20] = C4; \ + (sc)->state[21] = C5; \ + (sc)->state[22] = C6; \ + (sc)->state[23] = C7; \ + (sc)->state[24] = D0; \ + (sc)->state[25] = D1; \ + (sc)->state[26] = D2; \ + (sc)->state[27] = D3; \ + (sc)->state[28] = D4; \ + (sc)->state[29] = D5; \ + (sc)->state[30] = D6; \ + (sc)->state[31] = D7; \ + } while (0) + +#endif + +#define STEP_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA ## n; \ + } while (0) + +#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + STEP_ELT(0, w0, fun, s, pp4b); \ + STEP_ELT(1, w1, fun, s, pp4b); \ + STEP_ELT(2, w2, fun, s, pp4b); \ + STEP_ELT(3, w3, fun, s, pp4b); \ + } while (0) + +#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + u32 tA4 = ROL32(A4, r); \ + u32 tA5 = ROL32(A5, r); \ + u32 tA6 = ROL32(A6, r); \ + u32 tA7 = ROL32(A7, r); \ + STEP_ELT(0, w0, fun, s, pp8b); \ + STEP_ELT(1, w1, fun, s, pp8b); \ + STEP_ELT(2, w2, fun, s, pp8b); \ + STEP_ELT(3, w3, fun, s, pp8b); \ + STEP_ELT(4, w4, fun, s, pp8b); \ + STEP_ELT(5, w5, fun, s, pp8b); \ + STEP_ELT(6, w6, fun, s, pp8b); \ + STEP_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +#define M3_0_0 0_ +#define M3_1_0 1_ +#define M3_2_0 2_ +#define M3_3_0 0_ +#define M3_4_0 1_ +#define M3_5_0 2_ +#define M3_6_0 0_ +#define M3_7_0 1_ + +#define 
M3_0_1 1_ +#define M3_1_1 2_ +#define M3_2_1 0_ +#define M3_3_1 1_ +#define M3_4_1 2_ +#define M3_5_1 0_ +#define M3_6_1 1_ +#define M3_7_1 2_ + +#define M3_0_2 2_ +#define M3_1_2 0_ +#define M3_2_2 1_ +#define M3_3_2 2_ +#define M3_4_2 0_ +#define M3_5_2 1_ +#define M3_6_2 2_ +#define M3_7_2 0_ + +#define STEP_SMALL_(w, fun, r, s, pp4b) STEP_SMALL w, fun, r, s, pp4b) + +#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3) do { \ + STEP_SMALL_(WS_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP4_, M3_0_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP4_, M3_1_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP4_, M3_2_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP4_, M3_3_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \ + } while (0) + +#define M7_0_0 0_ +#define M7_1_0 1_ +#define M7_2_0 2_ +#define M7_3_0 3_ +#define M7_4_0 4_ +#define M7_5_0 5_ +#define M7_6_0 6_ +#define M7_7_0 0_ + +#define M7_0_1 1_ +#define M7_1_1 2_ +#define M7_2_1 3_ +#define M7_3_1 4_ +#define M7_4_1 5_ +#define M7_5_1 6_ +#define M7_6_1 0_ +#define M7_7_1 1_ + +#define M7_0_2 2_ +#define M7_1_2 3_ +#define M7_2_2 4_ +#define M7_3_2 5_ +#define M7_4_2 6_ +#define M7_5_2 0_ +#define M7_6_2 1_ +#define M7_7_2 2_ + +#define M7_0_3 3_ +#define M7_1_3 4_ +#define M7_2_3 5_ +#define M7_3_3 6_ +#define M7_4_3 0_ +#define M7_5_3 1_ +#define M7_6_3 2_ +#define M7_7_3 3_ + +#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b) + +#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \ + STEP_BIG_(WB_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SIMD + +#define A0 state[ 0] +#define A1 state[ 1] +#define A2 state[ 2] +#define A3 state[ 3] +#define B0 state[ 4] +#define B1 state[ 5] +#define B2 state[ 6] +#define B3 state[ 7] +#define C0 state[ 8] +#define C1 state[ 9] +#define C2 state[10] +#define C3 state[11] +#define D0 state[12] +#define D1 state[13] +#define D2 state[14] +#define D3 state[15] + +#define STEP2_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA[n]; \ + } while (0) + +#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \ + u32 tA[4]; \ + tA[0] = ROL32(A0, r); \ + tA[1] = ROL32(A1, r); \ + tA[2] = ROL32(A2, r); \ + tA[3] = ROL32(A3, r); \ + STEP2_ELT(0, w0, fun, s, pp4b); \ + STEP2_ELT(1, w1, fun, s, pp4b); \ + STEP2_ELT(2, w2, fun, s, pp4b); \ + STEP2_ELT(3, w3, fun, s, pp4b); \ + } while (0) + +static void +one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) +{ + static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 }; + + STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 
3], IF, p0, p1, pp4k[isp + 0]); + STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]); + STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]); + STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]); + STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]); + STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]); + STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]); + STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]); +} + +static void +compress_small(sph_simd_small_context *sc, int last) +{ + unsigned char *x; + s32 q[128]; + int i; + u32 w[32]; + u32 state[16]; + size_t u; + + static const size_t wsp[32] = { + 4 << 3, 6 << 3, 0 << 3, 2 << 3, + 7 << 3, 5 << 3, 3 << 3, 1 << 3, + 15 << 3, 11 << 3, 12 << 3, 8 << 3, + 9 << 3, 13 << 3, 10 << 3, 14 << 3, + 17 << 3, 18 << 3, 23 << 3, 20 << 3, + 22 << 3, 21 << 3, 16 << 3, 19 << 3, + 30 << 3, 24 << 3, 25 << 3, 31 << 3, + 27 << 3, 29 << 3, 28 << 3, 26 << 3 + }; + + x = sc->buf; + FFT128(0, 1, 0, ll); + if (last) { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + + for (i = 0; i < 16; i += 4) { + state[i + 0] = sc->state[i + 0] + ^ sph_dec32le_aligned(x + 4 * (i + 0)); + state[i + 1] = sc->state[i + 1] + ^ sph_dec32le_aligned(x + 4 * (i + 1)); + state[i + 2] = sc->state[i + 2] + ^ sph_dec32le_aligned(x + 4 * (i + 2)); + state[i + 3] = sc->state[i + 3] + ^ sph_dec32le_aligned(x + 4 * (i + 3)); + } + +#define WSREAD(sb, o1, o2, mm) do { \ + for (u = 0; u < 32; u += 4) { \ + size_t v = wsp[(u >> 2) + (sb)]; \ + w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ + q[v + 2 * 0 + (o2)], mm); \ + w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ + q[v + 2 * 1 + (o2)], mm); \ + w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ + q[v + 2 * 2 + (o2)], mm); \ + w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ + q[v + 2 * 3 + (o2)], mm); \ + } \ + } while (0) + + WSREAD( 0, 0, 1, 185); + one_round_small(state, w, 0, 3, 23, 17, 27); + WSREAD( 8, 0, 1, 185); + one_round_small(state, w, 2, 28, 19, 22, 7); + WSREAD(16, -128, -64, 233); + one_round_small(state, w, 1, 29, 9, 15, 5); + WSREAD(24, -191, -127, 233); + one_round_small(state, w, 0, 4, 13, 10, 25); + +#undef WSREAD + + STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 25, 4, PP4_2_); + + memcpy(sc->state, state, sizeof state); +} + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef D0 +#undef D1 +#undef D2 +#undef D3 + +#else + +#if SPH_SIMD_NOCOPY +#define A0 (sc->state[ 0]) +#define A1 (sc->state[ 1]) +#define A2 (sc->state[ 2]) +#define A3 (sc->state[ 3]) +#define B0 (sc->state[ 4]) +#define B1 (sc->state[ 5]) +#define B2 (sc->state[ 6]) +#define B3 (sc->state[ 7]) +#define C0 (sc->state[ 8]) +#define C1 (sc->state[ 9]) +#define C2 (sc->state[10]) +#define C3 (sc->state[11]) +#define D0 (sc->state[12]) +#define D1 (sc->state[13]) +#define D2 
(sc->state[14]) +#define D3 (sc->state[15]) +#endif + +static void +compress_small(sph_simd_small_context *sc, int last) +{ + unsigned char *x; + s32 q[128]; + int i; + DECL_STATE_SMALL +#if SPH_SIMD_NOCOPY + sph_u32 saved[16]; +#endif + +#if SPH_SIMD_NOCOPY + memcpy(saved, sc->state, sizeof saved); +#endif + x = sc->buf; + FFT128(0, 1, 0, ll); + if (last) { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + READ_STATE_SMALL(sc); + A0 ^= sph_dec32le_aligned(x + 0); + A1 ^= sph_dec32le_aligned(x + 4); + A2 ^= sph_dec32le_aligned(x + 8); + A3 ^= sph_dec32le_aligned(x + 12); + B0 ^= sph_dec32le_aligned(x + 16); + B1 ^= sph_dec32le_aligned(x + 20); + B2 ^= sph_dec32le_aligned(x + 24); + B3 ^= sph_dec32le_aligned(x + 28); + C0 ^= sph_dec32le_aligned(x + 32); + C1 ^= sph_dec32le_aligned(x + 36); + C2 ^= sph_dec32le_aligned(x + 40); + C3 ^= sph_dec32le_aligned(x + 44); + D0 ^= sph_dec32le_aligned(x + 48); + D1 ^= sph_dec32le_aligned(x + 52); + D2 ^= sph_dec32le_aligned(x + 56); + D3 ^= sph_dec32le_aligned(x + 60); + ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27); + ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7); + ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5); + ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25); +#if SPH_SIMD_NOCOPY + STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(saved[12], saved[13], saved[14], saved[15], + IF, 25, 4, PP4_2_); +#else + STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 25, 4, PP4_2_); + WRITE_STATE_SMALL(sc); +#endif +} + +#if SPH_SIMD_NOCOPY +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#endif + +#endif + +#if SPH_SMALL_FOOTPRINT_SIMD + +#define A0 state[ 0] +#define A1 state[ 1] +#define A2 state[ 2] +#define A3 state[ 3] +#define A4 state[ 4] +#define A5 state[ 5] +#define A6 state[ 6] +#define A7 state[ 7] +#define B0 state[ 8] +#define B1 state[ 9] +#define B2 state[10] +#define B3 state[11] +#define B4 state[12] +#define B5 state[13] +#define B6 state[14] +#define B7 state[15] +#define C0 state[16] +#define C1 state[17] +#define C2 state[18] +#define C3 state[19] +#define C4 state[20] +#define C5 state[21] +#define C6 state[22] +#define C7 state[23] +#define D0 state[24] +#define D1 state[25] +#define D2 state[26] +#define D3 state[27] +#define D4 state[28] +#define D5 state[29] +#define D6 state[30] +#define D7 state[31] + +/* + * Not needed -- already defined for SIMD-224 / SIMD-256 + * +#define STEP2_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA[n]; \ + } while (0) + */ + +#define STEP2_BIG(w0, w1, w2, w3, w4, w5, 
w6, w7, fun, r, s, pp8b) do { \ + u32 tA[8]; \ + tA[0] = ROL32(A0, r); \ + tA[1] = ROL32(A1, r); \ + tA[2] = ROL32(A2, r); \ + tA[3] = ROL32(A3, r); \ + tA[4] = ROL32(A4, r); \ + tA[5] = ROL32(A5, r); \ + tA[6] = ROL32(A6, r); \ + tA[7] = ROL32(A7, r); \ + STEP2_ELT(0, w0, fun, s, pp8b); \ + STEP2_ELT(1, w1, fun, s, pp8b); \ + STEP2_ELT(2, w2, fun, s, pp8b); \ + STEP2_ELT(3, w3, fun, s, pp8b); \ + STEP2_ELT(4, w4, fun, s, pp8b); \ + STEP2_ELT(5, w5, fun, s, pp8b); \ + STEP2_ELT(6, w6, fun, s, pp8b); \ + STEP2_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +static void +one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) +{ + static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 }; + + STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7], + IF, p0, p1, pp8k[isp + 0]); + STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15], + IF, p1, p2, pp8k[isp + 1]); + STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23], + IF, p2, p3, pp8k[isp + 2]); + STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31], + IF, p3, p0, pp8k[isp + 3]); + STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39], + MAJ, p0, p1, pp8k[isp + 4]); + STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47], + MAJ, p1, p2, pp8k[isp + 5]); + STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55], + MAJ, p2, p3, pp8k[isp + 6]); + STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63], + MAJ, p3, p0, pp8k[isp + 7]); +} + +static void +compress_big(sph_simd_big_context *sc, int last) +{ + unsigned char *x; + s32 q[256]; + int i; + u32 w[64]; + u32 state[32]; + size_t u; + + static const size_t wbp[32] = { + 4 << 4, 6 << 4, 0 << 4, 2 << 4, + 7 << 4, 5 << 4, 3 << 4, 1 << 4, + 15 << 4, 11 << 4, 12 << 4, 8 << 4, + 9 << 4, 13 << 4, 10 << 4, 14 << 4, + 17 << 4, 18 << 4, 23 << 4, 20 << 4, + 22 << 4, 21 << 4, 16 << 4, 19 << 4, + 30 << 4, 24 << 4, 25 << 4, 31 << 4, + 27 << 4, 29 << 4, 28 << 4, 26 << 4 + }; + + x = sc->buf; + FFT256(0, 1, 0, ll); + if (last) { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); + } + } + + for (i = 0; i < 32; i += 8) { + state[i + 0] = sc->state[i + 0] + ^ sph_dec32le_aligned(x + 4 * (i + 0)); + state[i + 1] = sc->state[i + 1] + ^ sph_dec32le_aligned(x + 4 * (i + 1)); + state[i + 2] = sc->state[i + 2] + ^ sph_dec32le_aligned(x + 4 * (i + 2)); + state[i + 3] = sc->state[i + 3] + ^ sph_dec32le_aligned(x + 4 * (i + 3)); + state[i + 4] = sc->state[i + 4] + ^ sph_dec32le_aligned(x + 4 * (i + 4)); + state[i + 5] = sc->state[i + 5] + ^ sph_dec32le_aligned(x + 4 * (i + 5)); + state[i + 6] = sc->state[i + 6] + ^ sph_dec32le_aligned(x + 4 * (i + 6)); + state[i + 7] = sc->state[i + 7] + ^ sph_dec32le_aligned(x + 4 * (i + 7)); + } + +#define WBREAD(sb, o1, o2, mm) do { \ + for (u = 0; u < 64; u += 8) { \ + size_t v = wbp[(u >> 3) + (sb)]; \ + w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ + q[v + 2 * 0 + (o2)], mm); \ + w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ + q[v + 2 * 1 + (o2)], mm); \ + w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ + q[v + 2 * 2 + (o2)], mm); \ + w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ + q[v + 2 * 3 + (o2)], mm); \ + w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \ + q[v + 2 * 4 + (o2)], mm); \ + w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \ + q[v + 2 * 5 + (o2)], mm); \ + w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \ + q[v + 2 * 6 + (o2)], mm); \ + w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \ + q[v + 2 * 7 + (o2)], mm); \ + } \ + } while (0) + + WBREAD( 0, 0, 1, 185); + one_round_big(state, w, 0, 3, 23, 17, 27); + WBREAD( 8, 0, 1, 185); + one_round_big(state, w, 1, 28, 19, 22, 7); + WBREAD(16, -256, -128, 233); + one_round_big(state, w, 2, 29, 9, 15, 5); + WBREAD(24, -383, -255, 233); + one_round_big(state, w, 3, 4, 13, 10, 25); + +#undef WBREAD + + STEP_BIG( + sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + sc->state[16], sc->state[17], sc->state[18], sc->state[19], + sc->state[20], sc->state[21], sc->state[22], sc->state[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + sc->state[24], sc->state[25], sc->state[26], sc->state[27], + sc->state[28], sc->state[29], sc->state[30], sc->state[31], + IF, 25, 4, PP8_0_); + + memcpy(sc->state, state, sizeof state); +} + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#undef D4 +#undef D5 +#undef D6 +#undef D7 + +#else + +#if SPH_SIMD_NOCOPY +#define A0 (sc->state[ 0]) +#define A1 (sc->state[ 1]) +#define A2 (sc->state[ 2]) +#define A3 (sc->state[ 3]) +#define A4 (sc->state[ 4]) +#define A5 (sc->state[ 5]) +#define A6 (sc->state[ 6]) +#define A7 (sc->state[ 7]) +#define B0 (sc->state[ 8]) +#define B1 (sc->state[ 9]) +#define B2 (sc->state[10]) +#define B3 (sc->state[11]) +#define B4 (sc->state[12]) +#define B5 (sc->state[13]) +#define B6 (sc->state[14]) +#define B7 (sc->state[15]) +#define C0 (sc->state[16]) +#define C1 (sc->state[17]) +#define C2 (sc->state[18]) +#define C3 (sc->state[19]) +#define C4 (sc->state[20]) +#define C5 (sc->state[21]) +#define C6 (sc->state[22]) +#define C7 (sc->state[23]) +#define D0 (sc->state[24]) +#define D1 (sc->state[25]) +#define D2 (sc->state[26]) +#define D3 (sc->state[27]) +#define D4 
(sc->state[28]) +#define D5 (sc->state[29]) +#define D6 (sc->state[30]) +#define D7 (sc->state[31]) +#endif + +static void +compress_big(sph_simd_big_context *sc, int last) +{ + unsigned char *x; + s32 q[256]; + int i; + DECL_STATE_BIG +#if SPH_SIMD_NOCOPY + sph_u32 saved[32]; +#endif + +#if SPH_SIMD_NOCOPY + memcpy(saved, sc->state, sizeof saved); +#endif + + x = sc->buf; + FFT256(0, 1, 0, ll); + if (last) { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + READ_STATE_BIG(sc); + A0 ^= sph_dec32le_aligned(x + 0); + A1 ^= sph_dec32le_aligned(x + 4); + A2 ^= sph_dec32le_aligned(x + 8); + A3 ^= sph_dec32le_aligned(x + 12); + A4 ^= sph_dec32le_aligned(x + 16); + A5 ^= sph_dec32le_aligned(x + 20); + A6 ^= sph_dec32le_aligned(x + 24); + A7 ^= sph_dec32le_aligned(x + 28); + B0 ^= sph_dec32le_aligned(x + 32); + B1 ^= sph_dec32le_aligned(x + 36); + B2 ^= sph_dec32le_aligned(x + 40); + B3 ^= sph_dec32le_aligned(x + 44); + B4 ^= sph_dec32le_aligned(x + 48); + B5 ^= sph_dec32le_aligned(x + 52); + B6 ^= sph_dec32le_aligned(x + 56); + B7 ^= sph_dec32le_aligned(x + 60); + C0 ^= sph_dec32le_aligned(x + 64); + C1 ^= sph_dec32le_aligned(x + 68); + C2 ^= sph_dec32le_aligned(x + 72); + C3 ^= sph_dec32le_aligned(x + 76); + C4 ^= sph_dec32le_aligned(x + 80); + C5 ^= sph_dec32le_aligned(x + 84); + C6 ^= sph_dec32le_aligned(x + 88); + C7 ^= sph_dec32le_aligned(x + 92); + D0 ^= sph_dec32le_aligned(x + 96); + D1 ^= sph_dec32le_aligned(x + 100); + D2 ^= sph_dec32le_aligned(x + 104); + D3 ^= sph_dec32le_aligned(x + 108); + D4 ^= sph_dec32le_aligned(x + 112); + D5 ^= sph_dec32le_aligned(x + 116); + D6 ^= sph_dec32le_aligned(x + 120); + D7 ^= sph_dec32le_aligned(x + 124); + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); +#if SPH_SIMD_NOCOPY + STEP_BIG( + saved[ 0], saved[ 1], saved[ 2], saved[ 3], + saved[ 4], saved[ 5], saved[ 6], saved[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + saved[ 8], saved[ 9], saved[10], saved[11], + saved[12], saved[13], saved[14], saved[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + saved[16], saved[17], saved[18], saved[19], + saved[20], saved[21], saved[22], saved[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + saved[24], saved[25], saved[26], saved[27], + saved[28], saved[29], saved[30], saved[31], + IF, 25, 4, PP8_0_); +#else + STEP_BIG( + sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + sc->state[16], sc->state[17], sc->state[18], sc->state[19], + sc->state[20], sc->state[21], sc->state[22], sc->state[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + sc->state[24], sc->state[25], sc->state[26], sc->state[27], + sc->state[28], sc->state[29], sc->state[30], sc->state[31], + IF, 25, 4, PP8_0_); + WRITE_STATE_BIG(sc); +#endif +} + +#if SPH_SIMD_NOCOPY +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 +#undef C0 +#undef C1 
+#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#undef D4 +#undef D5 +#undef D6 +#undef D7 +#endif + +#endif + +static const u32 IV224[] = { + C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53), + C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96), + C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6), + C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8) +}; + +static const u32 IV256[] = { + C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9), + C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3), + C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9), + C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1) +}; + +static const u32 IV384[] = { + C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B), + C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1), + C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A), + C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8), + C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2), + C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462), + C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5), + C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71) +}; + +static const u32 IV512[] = { + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22) +}; + +static void +init_small(void *cc, const u32 *iv) +{ + sph_simd_small_context *sc; + + sc = cc; + memcpy(sc->state, iv, sizeof sc->state); + sc->count_low = sc->count_high = 0; + sc->ptr = 0; +} + +static void +init_big(void *cc, const u32 *iv) +{ + sph_simd_big_context *sc; + + sc = cc; + memcpy(sc->state, iv, sizeof sc->state); + sc->count_low = sc->count_high = 0; + sc->ptr = 0; +} + +static void +update_small(void *cc, const void *data, size_t len) +{ + sph_simd_small_context *sc; + + sc = cc; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - sc->ptr; + if (clen > len) + clen = len; + memcpy(sc->buf + sc->ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + if ((sc->ptr += clen) == sizeof sc->buf) { + compress_small(sc, 0); + sc->ptr = 0; + sc->count_low = T32(sc->count_low + 1); + if (sc->count_low == 0) + sc->count_high ++; + } + } +} + +static void +update_big(void *cc, const void *data, size_t len) +{ + sph_simd_big_context *sc; + + sc = cc; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - sc->ptr; + if (clen > len) + clen = len; + memcpy(sc->buf + sc->ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + if ((sc->ptr += clen) == sizeof sc->buf) { + compress_big(sc, 0); + sc->ptr = 0; + sc->count_low = T32(sc->count_low + 1); + if (sc->count_low == 0) + sc->count_high ++; + } + } +} + +static void +encode_count_small(unsigned char *dst, + u32 low, u32 high, size_t ptr, unsigned n) +{ + low = T32(low << 9); + high = T32(high << 9) + (low >> 23); 
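+	/*
+	 * Editor's note: count_low/count_high count whole 64-byte (512-bit)
+	 * blocks, so the shift by 9 rescales the block count to a bit count,
+	 * with the 9 bits shifted out of the 32-bit low word carried into
+	 * high. The bytes still sitting in the buffer contribute ptr << 3
+	 * bits, plus the n extra bits, added just below. The big
+	 * (SIMD-384/512) variant shifts by 10 for its 128-byte blocks.
+	 */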
+ low += (ptr << 3) + n; + sph_enc32le(dst, low); + sph_enc32le(dst + 4, high); +} + +static void +encode_count_big(unsigned char *dst, + u32 low, u32 high, size_t ptr, unsigned n) +{ + low = T32(low << 10); + high = T32(high << 10) + (low >> 22); + low += (ptr << 3) + n; + sph_enc32le(dst, low); + sph_enc32le(dst + 4, high); +} + +static void +finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) +{ + sph_simd_small_context *sc; + unsigned char *d; + size_t u; + + sc = cc; + if (sc->ptr > 0 || n > 0) { + memset(sc->buf + sc->ptr, 0, + (sizeof sc->buf) - sc->ptr); + sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); + compress_small(sc, 0); + } + memset(sc->buf, 0, sizeof sc->buf); + encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); + compress_small(sc, 1); + d = dst; + for (d = dst, u = 0; u < dst_len; u ++) + sph_enc32le(d + (u << 2), sc->state[u]); +} + +static void +finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) +{ + sph_simd_big_context *sc; + unsigned char *d; + size_t u; + + sc = cc; + if (sc->ptr > 0 || n > 0) { + memset(sc->buf + sc->ptr, 0, + (sizeof sc->buf) - sc->ptr); + sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); + compress_big(sc, 0); + } + memset(sc->buf, 0, sizeof sc->buf); + encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); + compress_big(sc, 1); + d = dst; + for (d = dst, u = 0; u < dst_len; u ++) + sph_enc32le(d + (u << 2), sc->state[u]); +} + +void +sph_simd224_init(void *cc) +{ + init_small(cc, IV224); +} + +void +sph_simd224(void *cc, const void *data, size_t len) +{ + update_small(cc, data, len); +} + +void +sph_simd224_close(void *cc, void *dst) +{ + sph_simd224_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_small(cc, ub, n, dst, 7); + sph_simd224_init(cc); +} + +void +sph_simd256_init(void *cc) +{ + init_small(cc, IV256); +} + +void +sph_simd256(void *cc, const void *data, size_t len) +{ + update_small(cc, data, len); +} + +void +sph_simd256_close(void *cc, void *dst) +{ + sph_simd256_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_small(cc, ub, n, dst, 8); + sph_simd256_init(cc); +} + +void +sph_simd384_init(void *cc) +{ + init_big(cc, IV384); +} + +void +sph_simd384(void *cc, const void *data, size_t len) +{ + update_big(cc, data, len); +} + +void +sph_simd384_close(void *cc, void *dst) +{ + sph_simd384_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_big(cc, ub, n, dst, 12); + sph_simd384_init(cc); +} + +void +sph_simd512_init(void *cc) +{ + init_big(cc, IV512); +} + +void +sph_simd512(void *cc, const void *data, size_t len) +{ + update_big(cc, data, len); +} + +void +sph_simd512_close(void *cc, void *dst) +{ + sph_simd512_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_big(cc, ub, n, dst, 16); + sph_simd512_init(cc); +} +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sph/skein.c b/sph/skein.c new file mode 100644 index 0000000..2fcfae5 --- /dev/null +++ b/sph/skein.c @@ -0,0 +1,1244 @@ +/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */ +/* + * Skein implementation. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> + +#include "sph_skein.h" + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN +#define SPH_SMALL_FOOTPRINT_SKEIN 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#if SPH_64 + +#if 0 +/* obsolete */ +/* + * M5_ ## s ## _ ## i evaluates to s+i mod 5 (0 <= s <= 18, 0 <= i <= 3). + */ + +#define M5_0_0 0 +#define M5_0_1 1 +#define M5_0_2 2 +#define M5_0_3 3 + +#define M5_1_0 1 +#define M5_1_1 2 +#define M5_1_2 3 +#define M5_1_3 4 + +#define M5_2_0 2 +#define M5_2_1 3 +#define M5_2_2 4 +#define M5_2_3 0 + +#define M5_3_0 3 +#define M5_3_1 4 +#define M5_3_2 0 +#define M5_3_3 1 + +#define M5_4_0 4 +#define M5_4_1 0 +#define M5_4_2 1 +#define M5_4_3 2 + +#define M5_5_0 0 +#define M5_5_1 1 +#define M5_5_2 2 +#define M5_5_3 3 + +#define M5_6_0 1 +#define M5_6_1 2 +#define M5_6_2 3 +#define M5_6_3 4 + +#define M5_7_0 2 +#define M5_7_1 3 +#define M5_7_2 4 +#define M5_7_3 0 + +#define M5_8_0 3 +#define M5_8_1 4 +#define M5_8_2 0 +#define M5_8_3 1 + +#define M5_9_0 4 +#define M5_9_1 0 +#define M5_9_2 1 +#define M5_9_3 2 + +#define M5_10_0 0 +#define M5_10_1 1 +#define M5_10_2 2 +#define M5_10_3 3 + +#define M5_11_0 1 +#define M5_11_1 2 +#define M5_11_2 3 +#define M5_11_3 4 + +#define M5_12_0 2 +#define M5_12_1 3 +#define M5_12_2 4 +#define M5_12_3 0 + +#define M5_13_0 3 +#define M5_13_1 4 +#define M5_13_2 0 +#define M5_13_3 1 + +#define M5_14_0 4 +#define M5_14_1 0 +#define M5_14_2 1 +#define M5_14_3 2 + +#define M5_15_0 0 +#define M5_15_1 1 +#define M5_15_2 2 +#define M5_15_3 3 + +#define M5_16_0 1 +#define M5_16_1 2 +#define M5_16_2 3 +#define M5_16_3 4 + +#define M5_17_0 2 +#define M5_17_1 3 +#define M5_17_2 4 +#define M5_17_3 0 + +#define M5_18_0 3 +#define M5_18_1 4 +#define M5_18_2 0 +#define M5_18_3 1 +#endif + +/* + * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
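+ * For instance, M9_4_7 expands to 2, since (4 + 7) mod 9 = 2; the SKBI
+ * macro below pastes these constants into identifiers, so key-schedule
+ * indices are resolved at preprocessing time with no runtime modular
+ * arithmetic. (Editor's note.)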
+ */ + +#define M9_0_0 0 +#define M9_0_1 1 +#define M9_0_2 2 +#define M9_0_3 3 +#define M9_0_4 4 +#define M9_0_5 5 +#define M9_0_6 6 +#define M9_0_7 7 + +#define M9_1_0 1 +#define M9_1_1 2 +#define M9_1_2 3 +#define M9_1_3 4 +#define M9_1_4 5 +#define M9_1_5 6 +#define M9_1_6 7 +#define M9_1_7 8 + +#define M9_2_0 2 +#define M9_2_1 3 +#define M9_2_2 4 +#define M9_2_3 5 +#define M9_2_4 6 +#define M9_2_5 7 +#define M9_2_6 8 +#define M9_2_7 0 + +#define M9_3_0 3 +#define M9_3_1 4 +#define M9_3_2 5 +#define M9_3_3 6 +#define M9_3_4 7 +#define M9_3_5 8 +#define M9_3_6 0 +#define M9_3_7 1 + +#define M9_4_0 4 +#define M9_4_1 5 +#define M9_4_2 6 +#define M9_4_3 7 +#define M9_4_4 8 +#define M9_4_5 0 +#define M9_4_6 1 +#define M9_4_7 2 + +#define M9_5_0 5 +#define M9_5_1 6 +#define M9_5_2 7 +#define M9_5_3 8 +#define M9_5_4 0 +#define M9_5_5 1 +#define M9_5_6 2 +#define M9_5_7 3 + +#define M9_6_0 6 +#define M9_6_1 7 +#define M9_6_2 8 +#define M9_6_3 0 +#define M9_6_4 1 +#define M9_6_5 2 +#define M9_6_6 3 +#define M9_6_7 4 + +#define M9_7_0 7 +#define M9_7_1 8 +#define M9_7_2 0 +#define M9_7_3 1 +#define M9_7_4 2 +#define M9_7_5 3 +#define M9_7_6 4 +#define M9_7_7 5 + +#define M9_8_0 8 +#define M9_8_1 0 +#define M9_8_2 1 +#define M9_8_3 2 +#define M9_8_4 3 +#define M9_8_5 4 +#define M9_8_6 5 +#define M9_8_7 6 + +#define M9_9_0 0 +#define M9_9_1 1 +#define M9_9_2 2 +#define M9_9_3 3 +#define M9_9_4 4 +#define M9_9_5 5 +#define M9_9_6 6 +#define M9_9_7 7 + +#define M9_10_0 1 +#define M9_10_1 2 +#define M9_10_2 3 +#define M9_10_3 4 +#define M9_10_4 5 +#define M9_10_5 6 +#define M9_10_6 7 +#define M9_10_7 8 + +#define M9_11_0 2 +#define M9_11_1 3 +#define M9_11_2 4 +#define M9_11_3 5 +#define M9_11_4 6 +#define M9_11_5 7 +#define M9_11_6 8 +#define M9_11_7 0 + +#define M9_12_0 3 +#define M9_12_1 4 +#define M9_12_2 5 +#define M9_12_3 6 +#define M9_12_4 7 +#define M9_12_5 8 +#define M9_12_6 0 +#define M9_12_7 1 + +#define M9_13_0 4 +#define M9_13_1 5 +#define M9_13_2 6 +#define M9_13_3 7 +#define M9_13_4 8 +#define M9_13_5 0 +#define M9_13_6 1 +#define M9_13_7 2 + +#define M9_14_0 5 +#define M9_14_1 6 +#define M9_14_2 7 +#define M9_14_3 8 +#define M9_14_4 0 +#define M9_14_5 1 +#define M9_14_6 2 +#define M9_14_7 3 + +#define M9_15_0 6 +#define M9_15_1 7 +#define M9_15_2 8 +#define M9_15_3 0 +#define M9_15_4 1 +#define M9_15_5 2 +#define M9_15_6 3 +#define M9_15_7 4 + +#define M9_16_0 7 +#define M9_16_1 8 +#define M9_16_2 0 +#define M9_16_3 1 +#define M9_16_4 2 +#define M9_16_5 3 +#define M9_16_6 4 +#define M9_16_7 5 + +#define M9_17_0 8 +#define M9_17_1 0 +#define M9_17_2 1 +#define M9_17_3 2 +#define M9_17_4 3 +#define M9_17_5 4 +#define M9_17_6 5 +#define M9_17_7 6 + +#define M9_18_0 0 +#define M9_18_1 1 +#define M9_18_2 2 +#define M9_18_3 3 +#define M9_18_4 4 +#define M9_18_5 5 +#define M9_18_6 6 +#define M9_18_7 7 + +/* + * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). 
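+ * For instance, M3_4_1 expands to 2, since (4 + 1) mod 3 = 2; SKBT uses
+ * these constants to select among the three tweak words. (Editor's note.)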
+ */ + +#define M3_0_0 0 +#define M3_0_1 1 +#define M3_1_0 1 +#define M3_1_1 2 +#define M3_2_0 2 +#define M3_2_1 0 +#define M3_3_0 0 +#define M3_3_1 1 +#define M3_4_0 1 +#define M3_4_1 2 +#define M3_5_0 2 +#define M3_5_1 0 +#define M3_6_0 0 +#define M3_6_1 1 +#define M3_7_0 1 +#define M3_7_1 2 +#define M3_8_0 2 +#define M3_8_1 0 +#define M3_9_0 0 +#define M3_9_1 1 +#define M3_10_0 1 +#define M3_10_1 2 +#define M3_11_0 2 +#define M3_11_1 0 +#define M3_12_0 0 +#define M3_12_1 1 +#define M3_13_0 1 +#define M3_13_1 2 +#define M3_14_0 2 +#define M3_14_1 0 +#define M3_15_0 0 +#define M3_15_1 1 +#define M3_16_0 1 +#define M3_16_1 2 +#define M3_17_0 2 +#define M3_17_1 0 +#define M3_18_0 0 +#define M3_18_1 1 + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#if 0 +/* obsolete */ +#define SKSI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M5_, s), _), i)) +#define SKST(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) +#endif + +#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) +#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) + +#if 0 +/* obsolete */ +#define TFSMALL_KINIT(k0, k1, k2, k3, k4, t0, t1, t2) do { \ + k4 = (k0 ^ k1) ^ (k2 ^ k3) ^ SPH_C64(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ + } while (0) +#endif + +#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) do { \ + k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ + ^ SPH_C64(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_ADDKEY(w0, w1, w2, w3, k, t, s) do { \ + w0 = SPH_T64(w0 + SKSI(k, s, 0)); \ + w1 = SPH_T64(w1 + SKSI(k, s, 1) + SKST(t, s, 0)); \ + w2 = SPH_T64(w2 + SKSI(k, s, 2) + SKST(t, s, 1)); \ + w3 = SPH_T64(w3 + SKSI(k, s, 3) + (sph_u64)s); \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define TFBIG_ADDKEY(s, tt0, tt1) do { \ + p0 = SPH_T64(p0 + h[s + 0]); \ + p1 = SPH_T64(p1 + h[s + 1]); \ + p2 = SPH_T64(p2 + h[s + 2]); \ + p3 = SPH_T64(p3 + h[s + 3]); \ + p4 = SPH_T64(p4 + h[s + 4]); \ + p5 = SPH_T64(p5 + h[s + 5] + tt0); \ + p6 = SPH_T64(p6 + h[s + 6] + tt1); \ + p7 = SPH_T64(p7 + h[s + 7] + (sph_u64)s); \ + } while (0) + +#else + +#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) do { \ + w0 = SPH_T64(w0 + SKBI(k, s, 0)); \ + w1 = SPH_T64(w1 + SKBI(k, s, 1)); \ + w2 = SPH_T64(w2 + SKBI(k, s, 2)); \ + w3 = SPH_T64(w3 + SKBI(k, s, 3)); \ + w4 = SPH_T64(w4 + SKBI(k, s, 4)); \ + w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)s); \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define TFSMALL_MIX(x0, x1, rc) do { \ + x0 = SPH_T64(x0 + x1); \ + x1 = SPH_ROTL64(x1, rc) ^ x0; \ + } while (0) +#endif + +#define TFBIG_MIX(x0, x1, rc) do { \ + x0 = SPH_T64(x0 + x1); \ + x1 = SPH_ROTL64(x1, rc) ^ x0; \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_MIX4(w0, w1, w2, w3, rc0, rc1) do { \ + TFSMALL_MIX(w0, w1, rc0); \ + TFSMALL_MIX(w2, w3, rc1); \ + } while (0) +#endif + +#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ + TFBIG_MIX(w0, w1, rc0); \ + TFBIG_MIX(w2, w3, rc1); \ + TFBIG_MIX(w4, w5, rc2); \ + TFBIG_MIX(w6, w7, rc3); \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_4e(s) do { \ + TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \ + TFSMALL_MIX4(p0, p1, p2, p3, 14, 16); \ + TFSMALL_MIX4(p0, p3, p2, p1, 52, 57); \ + TFSMALL_MIX4(p0, p1, p2, p3, 23, 40); \ + TFSMALL_MIX4(p0, p3, p2, p1, 5, 37); \ + } while (0) + +#define TFSMALL_4o(s) do { \ + TFSMALL_ADDKEY(p0, p1, 
p2, p3, h, t, s); \ + TFSMALL_MIX4(p0, p1, p2, p3, 25, 33); \ + TFSMALL_MIX4(p0, p3, p2, p1, 46, 12); \ + TFSMALL_MIX4(p0, p1, p2, p3, 58, 22); \ + TFSMALL_MIX4(p0, p3, p2, p1, 32, 32); \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define TFBIG_4e(s) do { \ + TFBIG_ADDKEY(s, t0, t1); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) + +#define TFBIG_4o(s) do { \ + TFBIG_ADDKEY(s, t1, t2); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) + +#else + +#define TFBIG_4e(s) do { \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) + +#define TFBIG_4o(s) do { \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define UBI_SMALL(etype, extra) do { \ + sph_u64 h4, t0, t1, t2; \ + sph_u64 m0 = sph_dec64le(buf + 0); \ + sph_u64 m1 = sph_dec64le(buf + 8); \ + sph_u64 m2 = sph_dec64le(buf + 16); \ + sph_u64 m3 = sph_dec64le(buf + 24); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + t0 = SPH_T64(bcount << 5) + (sph_u64)(extra); \ + t1 = (bcount >> 59) + ((sph_u64)(etype) << 55); \ + TFSMALL_KINIT(h0, h1, h2, h3, h4, t0, t1, t2); \ + TFSMALL_4e(0); \ + TFSMALL_4o(1); \ + TFSMALL_4e(2); \ + TFSMALL_4o(3); \ + TFSMALL_4e(4); \ + TFSMALL_4o(5); \ + TFSMALL_4e(6); \ + TFSMALL_4o(7); \ + TFSMALL_4e(8); \ + TFSMALL_4o(9); \ + TFSMALL_4e(10); \ + TFSMALL_4o(11); \ + TFSMALL_4e(12); \ + TFSMALL_4o(13); \ + TFSMALL_4e(14); \ + TFSMALL_4o(15); \ + TFSMALL_4e(16); \ + TFSMALL_4o(17); \ + TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, 18); \ + h0 = m0 ^ p0; \ + h1 = m1 ^ p1; \ + h2 = m2 ^ p2; \ + h3 = m3 ^ p3; \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define UBI_BIG(etype, extra) do { \ + sph_u64 t0, t1, t2; \ + unsigned u; \ + sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ + sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ + sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ + sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ + sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ + sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ + sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ + sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + sph_u64 p4 = m4; \ + sph_u64 p5 = m5; \ + sph_u64 p6 = m6; \ + sph_u64 p7 = m7; \ + t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ + t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + TFBIG_KINIT(h[0], h[1], h[2], h[3], h[4], h[5], \ + h[6], h[7], h[8], t0, t1, t2); \ + for (u = 0; u <= 15; u += 3) { \ + h[u + 9] = h[u + 0]; \ + h[u + 10] = h[u + 1]; \ + 
h[u + 11] = h[u + 2]; \ + } \ + for (u = 0; u < 9; u ++) { \ + sph_u64 s = u << 1; \ + sph_u64 tmp; \ + TFBIG_4e(s); \ + TFBIG_4o(s + 1); \ + tmp = t2; \ + t2 = t1; \ + t1 = t0; \ + t0 = tmp; \ + } \ + TFBIG_ADDKEY(18, t0, t1); \ + h[0] = m0 ^ p0; \ + h[1] = m1 ^ p1; \ + h[2] = m2 ^ p2; \ + h[3] = m3 ^ p3; \ + h[4] = m4 ^ p4; \ + h[5] = m5 ^ p5; \ + h[6] = m6 ^ p6; \ + h[7] = m7 ^ p7; \ + } while (0) + +#else + +#define UBI_BIG(etype, extra) do { \ + sph_u64 h8, t0, t1, t2; \ + sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ + sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ + sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ + sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ + sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ + sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ + sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ + sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + sph_u64 p4 = m4; \ + sph_u64 p5 = m5; \ + sph_u64 p6 = m6; \ + sph_u64 p7 = m7; \ + t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ + t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \ + TFBIG_4e(0); \ + TFBIG_4o(1); \ + TFBIG_4e(2); \ + TFBIG_4o(3); \ + TFBIG_4e(4); \ + TFBIG_4o(5); \ + TFBIG_4e(6); \ + TFBIG_4o(7); \ + TFBIG_4e(8); \ + TFBIG_4o(9); \ + TFBIG_4e(10); \ + TFBIG_4o(11); \ + TFBIG_4e(12); \ + TFBIG_4o(13); \ + TFBIG_4e(14); \ + TFBIG_4o(15); \ + TFBIG_4e(16); \ + TFBIG_4o(17); \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \ + h0 = m0 ^ p0; \ + h1 = m1 ^ p1; \ + h2 = m2 ^ p2; \ + h3 = m3 ^ p3; \ + h4 = m4 ^ p4; \ + h5 = m5 ^ p5; \ + h6 = m6 ^ p6; \ + h7 = m7 ^ p7; \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define DECL_STATE_SMALL \ + sph_u64 h0, h1, h2, h3; \ + sph_u64 bcount; + +#define READ_STATE_SMALL(sc) do { \ + h0 = (sc)->h0; \ + h1 = (sc)->h1; \ + h2 = (sc)->h2; \ + h3 = (sc)->h3; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + (sc)->h0 = h0; \ + (sc)->h1 = h1; \ + (sc)->h2 = h2; \ + (sc)->h3 = h3; \ + sc->bcount = bcount; \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define DECL_STATE_BIG \ + sph_u64 h[27]; \ + sph_u64 bcount; + +#define READ_STATE_BIG(sc) do { \ + h[0] = (sc)->h0; \ + h[1] = (sc)->h1; \ + h[2] = (sc)->h2; \ + h[3] = (sc)->h3; \ + h[4] = (sc)->h4; \ + h[5] = (sc)->h5; \ + h[6] = (sc)->h6; \ + h[7] = (sc)->h7; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->h0 = h[0]; \ + (sc)->h1 = h[1]; \ + (sc)->h2 = h[2]; \ + (sc)->h3 = h[3]; \ + (sc)->h4 = h[4]; \ + (sc)->h5 = h[5]; \ + (sc)->h6 = h[6]; \ + (sc)->h7 = h[7]; \ + sc->bcount = bcount; \ + } while (0) + +#else + +#define DECL_STATE_BIG \ + sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; \ + sph_u64 bcount; + +#define READ_STATE_BIG(sc) do { \ + h0 = (sc)->h0; \ + h1 = (sc)->h1; \ + h2 = (sc)->h2; \ + h3 = (sc)->h3; \ + h4 = (sc)->h4; \ + h5 = (sc)->h5; \ + h6 = (sc)->h6; \ + h7 = (sc)->h7; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->h0 = h0; \ + (sc)->h1 = h1; \ + (sc)->h2 = h2; \ + (sc)->h3 = h3; \ + (sc)->h4 = h4; \ + (sc)->h5 = h5; \ + (sc)->h6 = h6; \ + (sc)->h7 = h7; \ + sc->bcount = bcount; \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +static void +skein_small_init(sph_skein_small_context *sc, const sph_u64 *iv) +{ + sc->h0 = iv[0]; + sc->h1 = iv[1]; + sc->h2 = iv[2]; + sc->h3 = iv[3]; + sc->bcount = 0; + sc->ptr = 0; +} +#endif + +static void 
+skein_big_init(sph_skein_big_context *sc, const sph_u64 *iv) +{ + sc->h0 = iv[0]; + sc->h1 = iv[1]; + sc->h2 = iv[2]; + sc->h3 = iv[3]; + sc->h4 = iv[4]; + sc->h5 = iv[5]; + sc->h6 = iv[6]; + sc->h7 = iv[7]; + sc->bcount = 0; + sc->ptr = 0; +} + +#if 0 +/* obsolete */ +static void +skein_small_core(sph_skein_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr, clen; + unsigned first; + DECL_STATE_SMALL + + buf = sc->buf; + ptr = sc->ptr; + clen = (sizeof sc->buf) - ptr; + if (len <= clen) { + memcpy(buf + ptr, data, len); + sc->ptr = ptr + len; + return; + } + if (clen != 0) { + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + } + +#if SPH_SMALL_FOOTPRINT_SKEIN + + READ_STATE_SMALL(sc); + first = (bcount == 0) << 7; + for (;;) { + bcount ++; + UBI_SMALL(96 + first, 0); + if (len <= sizeof sc->buf) + break; + first = 0; + memcpy(buf, data, sizeof sc->buf); + data = (const unsigned char *)data + sizeof sc->buf; + len -= sizeof sc->buf; + } + WRITE_STATE_SMALL(sc); + sc->ptr = len; + memcpy(buf, data, len); + +#else + + /* + * Unrolling the loop yields a slight performance boost, while + * keeping the code size around 24 kB on 32-bit x86. + */ + READ_STATE_SMALL(sc); + first = (bcount == 0) << 7; + for (;;) { + bcount ++; + UBI_SMALL(96 + first, 0); + if (len <= sizeof sc->buf) + break; + buf = (unsigned char *)data; + bcount ++; + UBI_SMALL(96, 0); + if (len <= 2 * sizeof sc->buf) { + data = buf + sizeof sc->buf; + len -= sizeof sc->buf; + break; + } + buf += sizeof sc->buf; + data = buf + sizeof sc->buf; + first = 0; + len -= 2 * sizeof sc->buf; + } + WRITE_STATE_SMALL(sc); + sc->ptr = len; + memcpy(sc->buf, data, len); + +#endif +} +#endif + +static void +skein_big_core(sph_skein_big_context *sc, const void *data, size_t len) +{ + /* + * The Skein "final bit" in the tweak is troublesome here, + * because if the input has a length which is a multiple of the + * block size (512 bits) then that bit must be set for the + * final block, which is full of message bits (padding in + * Skein can be reduced to no extra bit at all). However, this + * function cannot know whether it processes the last chunks of + * the message or not. Hence we may keep a full block of buffered + * data (64 bytes).
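+	 * In other words, a full buffer is compressed only once at least
+	 * one more input byte has arrived, so the last block always reaches
+	 * the close() routine with its "final" tweak bit still settable.
+	 * (Editor's note.)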
+ */ + unsigned char *buf; + size_t ptr; + unsigned first; + DECL_STATE_BIG + + buf = sc->buf; + ptr = sc->ptr; + if (len <= (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE_BIG(sc); + first = (bcount == 0) << 7; + do { + size_t clen; + + if (ptr == sizeof sc->buf) { + bcount ++; + UBI_BIG(96 + first, 0); + first = 0; + ptr = 0; + } + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + } while (len > 0); + WRITE_STATE_BIG(sc); + sc->ptr = ptr; +} + +#if 0 +/* obsolete */ +static void +skein_small_close(sph_skein_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_len) +{ + unsigned char *buf; + size_t ptr; + unsigned et; + int i; + DECL_STATE_SMALL + + if (n != 0) { + unsigned z; + unsigned char x; + + z = 0x80 >> n; + x = ((ub & -z) | z) & 0xFF; + skein_small_core(sc, &x, 1); + } + + buf = sc->buf; + ptr = sc->ptr; + READ_STATE_SMALL(sc); + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + et = 352 + ((bcount == 0) << 7) + (n != 0); + for (i = 0; i < 2; i ++) { + UBI_SMALL(et, ptr); + if (i == 0) { + memset(buf, 0, sizeof sc->buf); + bcount = 0; + et = 510; + ptr = 8; + } + } + + sph_enc64le_aligned(buf + 0, h0); + sph_enc64le_aligned(buf + 8, h1); + sph_enc64le_aligned(buf + 16, h2); + sph_enc64le_aligned(buf + 24, h3); + memcpy(dst, buf, out_len); +} +#endif + +static void +skein_big_close(sph_skein_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_len) +{ + unsigned char *buf; + size_t ptr; + unsigned et; + int i; +#if SPH_SMALL_FOOTPRINT_SKEIN + size_t u; +#endif + DECL_STATE_BIG + + /* + * Add bit padding if necessary. + */ + if (n != 0) { + unsigned z; + unsigned char x; + + z = 0x80 >> n; + x = ((ub & -z) | z) & 0xFF; + skein_big_core(sc, &x, 1); + } + + buf = sc->buf; + ptr = sc->ptr; + + /* + * At that point, if ptr == 0, then the message was empty; + * otherwise, there is between 1 and 64 bytes (inclusive) which + * are yet to be processed. Either way, we complete the buffer + * to a full block with zeros (the Skein specification mandates + * that an empty message is padded so that there is at least + * one block to process). + * + * Once this block has been processed, we do it again, with + * a block full of zeros, for the output (that block contains + * the encoding of "0", over 8 bytes, then padded with zeros). + */ + READ_STATE_BIG(sc); + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + et = 352 + ((bcount == 0) << 7) + (n != 0); + for (i = 0; i < 2; i ++) { + UBI_BIG(et, ptr); + if (i == 0) { + memset(buf, 0, sizeof sc->buf); + bcount = 0; + et = 510; + ptr = 8; + } + } + +#if SPH_SMALL_FOOTPRINT_SKEIN + + /* + * We use a temporary buffer because we must support the case + * where output size is not a multiple of 64 (namely, a 224-bit + * output). 
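+	 * (Editor's note: 28 output bytes cover h[0..2] and only half of
+	 * h[3], so the state words cannot all be stored directly into dst.)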
+ */ + for (u = 0; u < out_len; u += 8) + sph_enc64le_aligned(buf + u, h[u >> 3]); + memcpy(dst, buf, out_len); + +#else + + sph_enc64le_aligned(buf + 0, h0); + sph_enc64le_aligned(buf + 8, h1); + sph_enc64le_aligned(buf + 16, h2); + sph_enc64le_aligned(buf + 24, h3); + sph_enc64le_aligned(buf + 32, h4); + sph_enc64le_aligned(buf + 40, h5); + sph_enc64le_aligned(buf + 48, h6); + sph_enc64le_aligned(buf + 56, h7); + memcpy(dst, buf, out_len); + +#endif +} + +#if 0 +/* obsolete */ +static const sph_u64 IV224[] = { + SPH_C64(0xC6098A8C9AE5EA0B), SPH_C64(0x876D568608C5191C), + SPH_C64(0x99CB88D7D7F53884), SPH_C64(0x384BDDB1AEDDB5DE) +}; + +static const sph_u64 IV256[] = { + SPH_C64(0xFC9DA860D048B449), SPH_C64(0x2FCA66479FA7D833), + SPH_C64(0xB33BC3896656840F), SPH_C64(0x6A54E920FDE8DA69) +}; +#endif + +static const sph_u64 IV224[] = { + SPH_C64(0xCCD0616248677224), SPH_C64(0xCBA65CF3A92339EF), + SPH_C64(0x8CCD69D652FF4B64), SPH_C64(0x398AED7B3AB890B4), + SPH_C64(0x0F59D1B1457D2BD0), SPH_C64(0x6776FE6575D4EB3D), + SPH_C64(0x99FBC70E997413E9), SPH_C64(0x9E2CFCCFE1C41EF7) +}; + +static const sph_u64 IV256[] = { + SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), + SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), + SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), + SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) +}; + +static const sph_u64 IV384[] = { + SPH_C64(0xA3F6C6BF3A75EF5F), SPH_C64(0xB0FEF9CCFD84FAA4), + SPH_C64(0x9D77DD663D770CFE), SPH_C64(0xD798CBF3B468FDDA), + SPH_C64(0x1BC4A6668A0E4465), SPH_C64(0x7ED7D434E5807407), + SPH_C64(0x548FC1ACD4EC44D6), SPH_C64(0x266E17546AA18FF8) +}; + +static const sph_u64 IV512[] = { + SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), + SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), + SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), + SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) +}; + +#if 0 +/* obsolete */ +/* see sph_skein.h */ +void +sph_skein224_init(void *cc) +{ + skein_small_init(cc, IV224); +} + +/* see sph_skein.h */ +void +sph_skein224(void *cc, const void *data, size_t len) +{ + skein_small_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein224_close(void *cc, void *dst) +{ + sph_skein224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_small_close(cc, ub, n, dst, 28); + sph_skein224_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein256_init(void *cc) +{ + skein_small_init(cc, IV256); +} + +/* see sph_skein.h */ +void +sph_skein256(void *cc, const void *data, size_t len) +{ + skein_small_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein256_close(void *cc, void *dst) +{ + sph_skein256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_small_close(cc, ub, n, dst, 32); + sph_skein256_init(cc); +} +#endif + +/* see sph_skein.h */ +void +sph_skein224_init(void *cc) +{ + skein_big_init(cc, IV224); +} + +/* see sph_skein.h */ +void +sph_skein224(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein224_close(void *cc, void *dst) +{ + sph_skein224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 28); + sph_skein224_init(cc); +} + +/* see 
sph_skein.h */ +void +sph_skein256_init(void *cc) +{ + skein_big_init(cc, IV256); +} + +/* see sph_skein.h */ +void +sph_skein256(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein256_close(void *cc, void *dst) +{ + sph_skein256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 32); + sph_skein256_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein384_init(void *cc) +{ + skein_big_init(cc, IV384); +} + +/* see sph_skein.h */ +void +sph_skein384(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein384_close(void *cc, void *dst) +{ + sph_skein384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 48); + sph_skein384_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein512_init(void *cc) +{ + skein_big_init(cc, IV512); +} + +/* see sph_skein.h */ +void +sph_skein512(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein512_close(void *cc, void *dst) +{ + sph_skein512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 64); + sph_skein512_init(cc); +} + +#endif diff --git a/sph_blake.h b/sph/sph_blake.h similarity index 100% rename from sph_blake.h rename to sph/sph_blake.h diff --git a/sph/sph_bmw.h b/sph/sph_bmw.h new file mode 100644 index 0000000..484a2a7 --- /dev/null +++ b/sph/sph_bmw.h @@ -0,0 +1,320 @@ +/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * BMW interface. BMW (aka "Blue Midnight Wish") is a family of + * functions which differ by their output size; this implementation + * defines BMW for output sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_bmw.h + * @author Thomas Pornin + */ + +#ifndef SPH_BMW_H__ +#define SPH_BMW_H__ + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for BMW-224. + */ +#define SPH_SIZE_bmw224 224 + +/** + * Output size (in bits) for BMW-256. + */ +#define SPH_SIZE_bmw256 256 + +#if SPH_64 + +/** + * Output size (in bits) for BMW-384. + */ +#define SPH_SIZE_bmw384 384 + +/** + * Output size (in bits) for BMW-512. + */ +#define SPH_SIZE_bmw512 512 + +#endif + +/** + * This structure is a context for BMW-224 and BMW-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BMW computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BMW + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 H[16]; +#if SPH_64 + sph_u64 bit_count; +#else + sph_u32 bit_count_high, bit_count_low; +#endif +#endif +} sph_bmw_small_context; + +/** + * This structure is a context for BMW-224 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_small_context sph_bmw224_context; + +/** + * This structure is a context for BMW-256 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_small_context sph_bmw256_context; + +#if SPH_64 + +/** + * This structure is a context for BMW-384 and BMW-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BMW computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BMW + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u64 H[16]; + sph_u64 bit_count; +#endif +} sph_bmw_big_context; + +/** + * This structure is a context for BMW-384 computations. It is + * identical to the common sph_bmw_big_context. + */ +typedef sph_bmw_big_context sph_bmw384_context; + +/** + * This structure is a context for BMW-512 computations. It is + * identical to the common sph_bmw_big_context. + */ +typedef sph_bmw_big_context sph_bmw512_context; + +#endif + +/** + * Initialize a BMW-224 context. This process performs no memory allocation. + * + * @param cc the BMW-224 context (pointer to a + * sph_bmw224_context) + */ +void sph_bmw224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw224(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized.
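+ * + * A minimal call sequence (illustrative sketch added by the editor, + * not part of the upstream header; "abc" stands for caller data): + * + * sph_bmw224_context cc; + * unsigned char out[28]; + * sph_bmw224_init(&cc); + * sph_bmw224(&cc, "abc", 3); + * sph_bmw224_close(&cc, out); + * + * After the last call, out holds the 28-byte digest and cc is ready + * for a fresh computation.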
+ * + * @param cc the BMW-224 context + * @param dst the destination buffer + */ +void sph_bmw224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BMW-256 context. This process performs no memory allocation. + * + * @param cc the BMW-256 context (pointer to a + * sph_bmw256_context) + */ +void sph_bmw256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw256(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-256 context + * @param dst the destination buffer + */ +void sph_bmw256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#if SPH_64 + +/** + * Initialize a BMW-384 context. This process performs no memory allocation. + * + * @param cc the BMW-384 context (pointer to a + * sph_bmw384_context) + */ +void sph_bmw384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw384(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-384 context + * @param dst the destination buffer + */ +void sph_bmw384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). 
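For example, with n == 2 and ub == 0x80, the two appended bits are 1 then 0 (bits 7 and 6 of ub).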
The context is automatically reinitialized. + * + * @param cc the BMW-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BMW-512 context. This process performs no memory allocation. + * + * @param cc the BMW-512 context (pointer to a + * sph_bmw512_context) + */ +void sph_bmw512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw512(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-512 context + * @param dst the destination buffer + */ +void sph_bmw512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#endif diff --git a/sph/sph_cubehash.h b/sph/sph_cubehash.h new file mode 100644 index 0000000..487a194 --- /dev/null +++ b/sph/sph_cubehash.h @@ -0,0 +1,292 @@ +/* $Id: sph_cubehash.h 180 2010-05-08 02:29:25Z tp $ */ +/** + * CubeHash interface. CubeHash is a family of functions which differ by + * their output size; this implementation defines CubeHash for output + * sizes 224, 256, 384 and 512 bits, with the "standard parameters" + * (CubeHash16/32 with the CubeHash specification notations). + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_cubehash.h + * @author Thomas Pornin + */ + +#ifndef SPH_CUBEHASH_H__ +#define SPH_CUBEHASH_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for CubeHash-224. + */ +#define SPH_SIZE_cubehash224 224 + +/** + * Output size (in bits) for CubeHash-256. + */ +#define SPH_SIZE_cubehash256 256 + +/** + * Output size (in bits) for CubeHash-384. + */ +#define SPH_SIZE_cubehash384 384 + +/** + * Output size (in bits) for CubeHash-512. + */ +#define SPH_SIZE_cubehash512 512 + +/** + * This structure is a context for CubeHash computations: it contains the + * intermediate values and some data from the last entered block. Once + * a CubeHash computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running CubeHash computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[32]; +#endif +} sph_cubehash_context; + +/** + * Type for a CubeHash-224 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash224_context; + +/** + * Type for a CubeHash-256 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash256_context; + +/** + * Type for a CubeHash-384 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash384_context; + +/** + * Type for a CubeHash-512 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash512_context; + +/** + * Initialize a CubeHash-224 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-224 context (pointer to a + * sph_cubehash224_context) + */ +void sph_cubehash224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash224(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-224 context + * @param dst the destination buffer + */ +void sph_cubehash224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-256 context. This process performs no memory + * allocation.
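+ * + * Because the context is a plain structure, a computation over a + * shared prefix can be forked by copying it (illustrative sketch by + * the editor; prefix/suffix stand for caller data): + * + * sph_cubehash256_context base, fork; + * sph_cubehash256_init(&base); + * sph_cubehash256(&base, prefix, prefix_len); + * fork = base; + * sph_cubehash256(&fork, suffix, suffix_len);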
+ * + * @param cc the CubeHash-256 context (pointer to a + * sph_cubehash256_context) + */ +void sph_cubehash256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash256(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-256 context + * @param dst the destination buffer + */ +void sph_cubehash256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-384 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-384 context (pointer to a + * sph_cubehash384_context) + */ +void sph_cubehash384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash384(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-384 context + * @param dst the destination buffer + */ +void sph_cubehash384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-512 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-512 context (pointer to a + * sph_cubehash512_context) + */ +void sph_cubehash512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). 
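+ * Successive calls process the concatenation of their inputs, so the + * data may be streamed in chunks of any size (editor's note).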
+ * + * @param cc the CubeHash-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash512(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-512 context + * @param dst the destination buffer + */ +void sph_cubehash512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_echo.h b/sph/sph_echo.h new file mode 100644 index 0000000..1ae1e3d --- /dev/null +++ b/sph/sph_echo.h @@ -0,0 +1,320 @@ +/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * ECHO interface. ECHO is a family of functions which differ by + * their output size; this implementation defines ECHO for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_echo.h + * @author Thomas Pornin + */ + +#ifndef SPH_ECHO_H__ +#define SPH_ECHO_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for ECHO-224. + */ +#define SPH_SIZE_echo224 224 + +/** + * Output size (in bits) for ECHO-256. + */ +#define SPH_SIZE_echo256 256 + +/** + * Output size (in bits) for ECHO-384. + */ +#define SPH_SIZE_echo384 384 + +/** + * Output size (in bits) for ECHO-512.
+ */ +#define SPH_SIZE_echo512 512 + +/** + * This structure is a context for ECHO computations: it contains the + * intermediate values and some data from the last entered block. Once + * an ECHO computation has been performed, the context can be reused for + * another computation. This specific structure is used for ECHO-224 + * and ECHO-256. + * + * The contents of this structure are private. A running ECHO computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[192]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[4][4]; +#if SPH_64 + sph_u64 Vb[4][2]; +#endif + } u; + sph_u32 C0, C1, C2, C3; +#endif +} sph_echo_small_context; + +/** + * This structure is a context for ECHO computations: it contains the + * intermediate values and some data from the last entered block. Once + * an ECHO computation has been performed, the context can be reused for + * another computation. This specific structure is used for ECHO-384 + * and ECHO-512. + * + * The contents of this structure are private. A running ECHO computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[8][4]; +#if SPH_64 + sph_u64 Vb[8][2]; +#endif + } u; + sph_u32 C0, C1, C2, C3; +#endif +} sph_echo_big_context; + +/** + * Type for a ECHO-224 context (identical to the common "small" context). + */ +typedef sph_echo_small_context sph_echo224_context; + +/** + * Type for a ECHO-256 context (identical to the common "small" context). + */ +typedef sph_echo_small_context sph_echo256_context; + +/** + * Type for a ECHO-384 context (identical to the common "big" context). + */ +typedef sph_echo_big_context sph_echo384_context; + +/** + * Type for a ECHO-512 context (identical to the common "big" context). + */ +typedef sph_echo_big_context sph_echo512_context; + +/** + * Initialize an ECHO-224 context. This process performs no memory allocation. + * + * @param cc the ECHO-224 context (pointer to a + * sph_echo224_context) + */ +void sph_echo224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo224(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-224 context + * @param dst the destination buffer + */ +void sph_echo224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
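+ * (Editor's note: the plain close() entry point is this function + * invoked with ub == 0 and n == 0, matching the pattern visible in + * the simd and skein implementations earlier in this patch.)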
+ * + * @param cc the ECHO-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-256 context. This process performs no memory allocation. + * + * @param cc the ECHO-256 context (pointer to a + * sph_echo256_context) + */ +void sph_echo256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo256(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-256 context + * @param dst the destination buffer + */ +void sph_echo256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-384 context. This process performs no memory allocation. + * + * @param cc the ECHO-384 context (pointer to a + * sph_echo384_context) + */ +void sph_echo384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo384(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-384 context + * @param dst the destination buffer + */ +void sph_echo384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-512 context. This process performs no memory allocation. + * + * @param cc the ECHO-512 context (pointer to a + * sph_echo512_context) + */ +void sph_echo512_init(void *cc); + +/** + * Process some data bytes. 
It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo512(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-512 context + * @param dst the destination buffer + */ +void sph_echo512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph_fugue.h b/sph/sph_fugue.h similarity index 100% rename from sph_fugue.h rename to sph/sph_fugue.h diff --git a/sph_groestl.h b/sph/sph_groestl.h similarity index 100% rename from sph_groestl.h rename to sph/sph_groestl.h diff --git a/sph/sph_jh.h b/sph/sph_jh.h new file mode 100644 index 0000000..0268406 --- /dev/null +++ b/sph/sph_jh.h @@ -0,0 +1,290 @@ +/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * JH interface. JH is a family of functions which differ by + * their output size; this implementation defines JH for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_jh.h + * @author Thomas Pornin + */ + +#ifndef SPH_JH_H__ +#define SPH_JH_H__ + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for JH-224. + */ +#define SPH_SIZE_jh224 224 + +/** + * Output size (in bits) for JH-256. 
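All of these headers share one streaming pattern: an _init() call, any number of update calls, then a _close() that writes the digest and leaves the context reinitialized. A minimal usage sketch for ECHO-512, assuming the matching sph_echo.c implementation is linked in (the helper name is illustrative):

#include <stddef.h>
#include "sph/sph_echo.h"

/* One-shot ECHO-512 digest of an arbitrary byte buffer. */
static void echo512_digest(const void *data, size_t len, unsigned char dst[64])
{
	sph_echo512_context ctx;

	sph_echo512_init(&ctx);
	sph_echo512(&ctx, data, len);	/* may be called repeatedly to stream */
	sph_echo512_close(&ctx, dst);	/* writes 64 bytes, reinitializes ctx */
}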
+ */ +#define SPH_SIZE_jh256 256 + +/** + * Output size (in bits) for JH-384. + */ +#define SPH_SIZE_jh384 384 + +/** + * Output size (in bits) for JH-512. + */ +#define SPH_SIZE_jh512 512 + +/** + * This structure is a context for JH computations: it contains the + * intermediate values and some data from the last entered block. Once + * a JH computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running JH computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[16]; +#endif + sph_u32 narrow[32]; + } H; +#if SPH_64 + sph_u64 block_count; +#else + sph_u32 block_count_high, block_count_low; +#endif +#endif +} sph_jh_context; + +/** + * Type for a JH-224 context (identical to the common context). + */ +typedef sph_jh_context sph_jh224_context; + +/** + * Type for a JH-256 context (identical to the common context). + */ +typedef sph_jh_context sph_jh256_context; + +/** + * Type for a JH-384 context (identical to the common context). + */ +typedef sph_jh_context sph_jh384_context; + +/** + * Type for a JH-512 context (identical to the common context). + */ +typedef sph_jh_context sph_jh512_context; + +/** + * Initialize a JH-224 context. This process performs no memory allocation. + * + * @param cc the JH-224 context (pointer to a + * sph_jh224_context) + */ +void sph_jh224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh224(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-224 context + * @param dst the destination buffer + */ +void sph_jh224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a JH-256 context. This process performs no memory allocation. + * + * @param cc the JH-256 context (pointer to a + * sph_jh256_context) + */ +void sph_jh256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh256(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-256 computation and output the result into + * the provided buffer. 
The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-256 context + * @param dst the destination buffer + */ +void sph_jh256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a JH-384 context. This process performs no memory allocation. + * + * @param cc the JH-384 context (pointer to a + * sph_jh384_context) + */ +void sph_jh384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh384(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-384 context + * @param dst the destination buffer + */ +void sph_jh384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a JH-512 context. This process performs no memory allocation. + * + * @param cc the JH-512 context (pointer to a + * sph_jh512_context) + */ +void sph_jh512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh512(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-512 context + * @param dst the destination buffer + */ +void sph_jh512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). 
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif diff --git a/sph_keccak.h b/sph/sph_keccak.h similarity index 100% rename from sph_keccak.h rename to sph/sph_keccak.h diff --git a/sph/sph_luffa.h b/sph/sph_luffa.h new file mode 100644 index 0000000..a32fd7b --- /dev/null +++ b/sph/sph_luffa.h @@ -0,0 +1,296 @@ +/* $Id: sph_luffa.h 154 2010-04-26 17:00:24Z tp $ */ +/** + * Luffa interface. Luffa is a family of functions which differ by + * their output size; this implementation defines Luffa for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_luffa.h + * @author Thomas Pornin + */ + +#ifndef SPH_LUFFA_H__ +#define SPH_LUFFA_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for Luffa-224. + */ +#define SPH_SIZE_luffa224 224 + +/** + * Output size (in bits) for Luffa-256. + */ +#define SPH_SIZE_luffa256 256 + +/** + * Output size (in bits) for Luffa-384. + */ +#define SPH_SIZE_luffa384 384 + +/** + * Output size (in bits) for Luffa-512. + */ +#define SPH_SIZE_luffa512 512 + +/** + * This structure is a context for Luffa-224 computations: it contains + * the intermediate values and some data from the last entered block. + * Once a Luffa computation has been performed, the context can be + * reused for another computation. + * + * The contents of this structure are private. A running Luffa + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[3][8]; +#endif +} sph_luffa224_context; + +/** + * This structure is a context for Luffa-256 computations. It is + * identical to sph_luffa224_context. 
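Because _close() reinitializes the context, a single context object can serve consecutive, independent computations after one _init(). A short sketch with JH-512 (the helper name is illustrative):

#include "sph/sph_jh.h"

/* Two independent digests from one context: no second _init() needed. */
static void jh512_two_digests(unsigned char d1[64], unsigned char d2[64])
{
	sph_jh512_context jc;

	sph_jh512_init(&jc);
	sph_jh512(&jc, "abc", 3);
	sph_jh512_close(&jc, d1);	/* jc is automatically reinitialized */
	sph_jh512(&jc, "def", 3);
	sph_jh512_close(&jc, d2);
}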
+ */ +typedef sph_luffa224_context sph_luffa256_context; + +/** + * This structure is a context for Luffa-384 computations. + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[4][8]; +#endif +} sph_luffa384_context; + +/** + * This structure is a context for Luffa-512 computations. + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[5][8]; +#endif +} sph_luffa512_context; + +/** + * Initialize a Luffa-224 context. This process performs no memory allocation. + * + * @param cc the Luffa-224 context (pointer to a + * sph_luffa224_context) + */ +void sph_luffa224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-224 context + * @param dst the destination buffer + */ +void sph_luffa224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-256 context. This process performs no memory allocation. + * + * @param cc the Luffa-256 context (pointer to a + * sph_luffa256_context) + */ +void sph_luffa256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-256 context + * @param dst the destination buffer + */ +void sph_luffa256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
+ * + * @param cc the Luffa-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-384 context. This process performs no memory allocation. + * + * @param cc the Luffa-384 context (pointer to a + * sph_luffa384_context) + */ +void sph_luffa384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-384 context + * @param dst the destination buffer + */ +void sph_luffa384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-512 context. This process performs no memory allocation. + * + * @param cc the Luffa-512 context (pointer to a + * sph_luffa512_context) + */ +void sph_luffa512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-512 context + * @param dst the destination buffer + */ +void sph_luffa512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
+ * + * @param cc the Luffa-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_shavite.h b/sph/sph_shavite.h new file mode 100644 index 0000000..0957e42 --- /dev/null +++ b/sph/sph_shavite.h @@ -0,0 +1,314 @@ +/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */ +/** + * SHAvite-3 interface. This code implements SHAvite-3 with the + * recommended parameters for SHA-3, with outputs of 224, 256, 384 and + * 512 bits. In the following, we call the function "SHAvite" (without + * the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit + * output". + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_shavite.h + * @author Thomas Pornin + */ + +#ifndef SPH_SHAVITE_H__ +#define SPH_SHAVITE_H__ + +#include +#include "sph_types.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/** + * Output size (in bits) for SHAvite-224. + */ +#define SPH_SIZE_shavite224 224 + +/** + * Output size (in bits) for SHAvite-256. + */ +#define SPH_SIZE_shavite256 256 + +/** + * Output size (in bits) for SHAvite-384. + */ +#define SPH_SIZE_shavite384 384 + +/** + * Output size (in bits) for SHAvite-512. + */ +#define SPH_SIZE_shavite512 512 + +/** + * This structure is a context for SHAvite-224 and SHAvite-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 h[8]; + sph_u32 count0, count1; +#endif +} sph_shavite_small_context; + +/** + * This structure is a context for SHAvite-224 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite224_context; + +/** + * This structure is a context for SHAvite-256 computations. 
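The "cloned by copying the context" remark in these headers enables a common-prefix optimization: absorb the shared prefix once, memcpy() the running state, then finish each branch separately. A sketch with Luffa-512 (the helper name is illustrative):

#include <stddef.h>
#include <string.h>
#include "sph/sph_luffa.h"

/* Hash prefix||a and prefix||b while absorbing the prefix only once. */
static void luffa512_fork(const void *prefix, size_t plen,
	const void *a, size_t alen, unsigned char da[64],
	const void *b, size_t blen, unsigned char db[64])
{
	sph_luffa512_context base, fork;

	sph_luffa512_init(&base);
	sph_luffa512(&base, prefix, plen);
	memcpy(&fork, &base, sizeof base);	/* clone the running computation */
	sph_luffa512(&base, a, alen);
	sph_luffa512_close(&base, da);
	sph_luffa512(&fork, b, blen);
	sph_luffa512_close(&fork, db);
}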
It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite256_context; + +/** + * This structure is a context for SHAvite-384 and SHAvite-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u32 h[16]; + sph_u32 count0, count1, count2, count3; +#endif +} sph_shavite_big_context; + +/** + * This structure is a context for SHAvite-384 computations. It is + * identical to the common sph_shavite_big_context. + */ +typedef sph_shavite_big_context sph_shavite384_context; + +/** + * This structure is a context for SHAvite-512 computations. It is + * identical to the common sph_shavite_big_context. + */ +typedef sph_shavite_big_context sph_shavite512_context; + +/** + * Initialize a SHAvite-224 context. This process performs no memory allocation. + * + * @param cc the SHAvite-224 context (pointer to a + * sph_shavite224_context) + */ +void sph_shavite224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-224 context + * @param dst the destination buffer + */ +void sph_shavite224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-256 context. This process performs no memory allocation. + * + * @param cc the SHAvite-256 context (pointer to a + * sph_shavite256_context) + */ +void sph_shavite256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite256(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized.
+ * + * @param cc the SHAvite-256 context + * @param dst the destination buffer + */ +void sph_shavite256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-384 context. This process performs no memory allocation. + * + * @param cc the SHAvite-384 context (pointer to a + * sph_shavite384_context) + */ +void sph_shavite384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-384 context + * @param dst the destination buffer + */ +void sph_shavite384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-512 context. This process performs no memory allocation. + * + * @param cc the SHAvite-512 context (pointer to a + * sph_shavite512_context) + */ +void sph_shavite512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite512(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-512 context + * @param dst the destination buffer + */ +void sph_shavite512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). 
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_simd.h b/sph/sph_simd.h new file mode 100644 index 0000000..92ee1e7 --- /dev/null +++ b/sph/sph_simd.h @@ -0,0 +1,309 @@ +/* $Id: sph_simd.h 154 2010-04-26 17:00:24Z tp $ */ +/** + * SIMD interface. SIMD is a family of functions which differ by + * their output size; this implementation defines SIMD for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_simd.h + * @author Thomas Pornin + */ + +#ifndef SPH_SIMD_H__ +#define SPH_SIMD_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for SIMD-224. + */ +#define SPH_SIZE_simd224 224 + +/** + * Output size (in bits) for SIMD-256. + */ +#define SPH_SIZE_simd256 256 + +/** + * Output size (in bits) for SIMD-384. + */ +#define SPH_SIZE_simd384 384 + +/** + * Output size (in bits) for SIMD-512. + */ +#define SPH_SIZE_simd512 512 + +/** + * This structure is a context for SIMD computations: it contains the + * intermediate values and some data from the last entered block. Once + * an SIMD computation has been performed, the context can be reused for + * another computation. This specific structure is used for SIMD-224 + * and SIMD-256. + * + * The contents of this structure are private. A running SIMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[16]; + sph_u32 count_low, count_high; +#endif +} sph_simd_small_context; + +/** + * This structure is a context for SIMD computations: it contains the + * intermediate values and some data from the last entered block. 
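The update functions accept any split of the input, including zero-length calls, and the digest depends only on the concatenated byte stream. A sketch feeding SHAvite-512 in fixed-size chunks (the helper name and chunking scheme are illustrative):

#include <stddef.h>
#include "sph/sph_shavite.h"

/* Chunked feeding yields the same digest as one update call.
 * `chunk` must be nonzero. */
static void shavite512_chunked(const unsigned char *data, size_t len,
	size_t chunk, unsigned char dst[64])
{
	sph_shavite512_context sc;

	sph_shavite512_init(&sc);
	sph_shavite512(&sc, data, 0);	/* zero-length call: a no-op */
	while (len > 0) {
		size_t n = len < chunk ? len : chunk;
		sph_shavite512(&sc, data, n);
		data += n;
		len -= n;
	}
	sph_shavite512_close(&sc, dst);
}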
Once + * an SIMD computation has been performed, the context can be reused for + * another computation. This specific structure is used for SIMD-384 + * and SIMD-512. + * + * The contents of this structure are private. A running SIMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[32]; + sph_u32 count_low, count_high; +#endif +} sph_simd_big_context; + +/** + * Type for a SIMD-224 context (identical to the common "small" context). + */ +typedef sph_simd_small_context sph_simd224_context; + +/** + * Type for a SIMD-256 context (identical to the common "small" context). + */ +typedef sph_simd_small_context sph_simd256_context; + +/** + * Type for a SIMD-384 context (identical to the common "big" context). + */ +typedef sph_simd_big_context sph_simd384_context; + +/** + * Type for a SIMD-512 context (identical to the common "big" context). + */ +typedef sph_simd_big_context sph_simd512_context; + +/** + * Initialize an SIMD-224 context. This process performs no memory allocation. + * + * @param cc the SIMD-224 context (pointer to a + * sph_simd224_context) + */ +void sph_simd224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-224 context + * @param dst the destination buffer + */ +void sph_simd224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SIMD-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-256 context. This process performs no memory allocation. + * + * @param cc the SIMD-256 context (pointer to a + * sph_simd256_context) + */ +void sph_simd256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd256(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the SIMD-256 context + * @param dst the destination buffer + */ +void sph_simd256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SIMD-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-384 context. This process performs no memory allocation. + * + * @param cc the SIMD-384 context (pointer to a + * sph_simd384_context) + */ +void sph_simd384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-384 context + * @param dst the destination buffer + */ +void sph_simd384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SIMD-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-512 context. This process performs no memory allocation. + * + * @param cc the SIMD-512 context (pointer to a + * sph_simd512_context) + */ +void sph_simd512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd512(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-512 context + * @param dst the destination buffer + */ +void sph_simd512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). 
The context is automatically reinitialized. + * + * @param cc the SIMD-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_skein.h b/sph/sph_skein.h new file mode 100644 index 0000000..8555984 --- /dev/null +++ b/sph/sph_skein.h @@ -0,0 +1,290 @@ +/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */ +/** + * Skein interface. The Skein specification defines three main + * functions, called Skein-256, Skein-512 and Skein-1024, which can be + * further parameterized with an output length. For the SHA-3 + * competition, Skein-512 is used for output sizes of 224, 256, 384 and + * 512 bits; this is what this code implements. Thus, we hereafter call + * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein + * specification defines as Skein-512-224, Skein-512-256, Skein-512-384 + * and Skein-512-512, respectively. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_skein.h + * @author Thomas Pornin + */ + +#ifndef SPH_SKEIN_H__ +#define SPH_SKEIN_H__ + +#include +#include "sph_types.h" + +#if SPH_64 + +/** + * Output size (in bits) for Skein-224. + */ +#define SPH_SIZE_skein224 224 + +/** + * Output size (in bits) for Skein-256. + */ +#define SPH_SIZE_skein256 256 + +/** + * Output size (in bits) for Skein-384. + */ +#define SPH_SIZE_skein384 384 + +/** + * Output size (in bits) for Skein-512. + */ +#define SPH_SIZE_skein512 512 + +/** + * This structure is a context for Skein computations (with a 384- or + * 512-bit output): it contains the intermediate values and some data + * from the last entered block. Once a Skein computation has been + * performed, the context can be reused for another computation. + * + * The contents of this structure are private. A running Skein computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). 
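The SPH_SIZE_* macros give output sizes in bits, so a _close() destination needs SPH_SIZE_*/8 bytes; sizing the buffer from the macro keeps the two in sync. A sketch with SIMD-384 (the helper name is illustrative):

#include <stddef.h>
#include "sph/sph_simd.h"

/* SPH_SIZE_simd384 is 384 bits, so the digest buffer is 48 bytes. */
static void simd384_digest(const void *data, size_t len,
	unsigned char dst[SPH_SIZE_simd384 / 8])
{
	sph_simd384_context ctx;

	sph_simd384_init(&ctx);
	sph_simd384(&ctx, data, len);
	sph_simd384_close(&ctx, dst);
}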
+ */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; + sph_u64 bcount; +#endif +} sph_skein_big_context; + +/** + * Type for a Skein-224 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein224_context; + +/** + * Type for a Skein-256 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein256_context; + +/** + * Type for a Skein-384 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein384_context; + +/** + * Type for a Skein-512 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein512_context; + +/** + * Initialize a Skein-224 context. This process performs no memory allocation. + * + * @param cc the Skein-224 context (pointer to a + * sph_skein224_context) + */ +void sph_skein224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-224 context + * @param dst the destination buffer + */ +void sph_skein224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Skein-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-256 context. This process performs no memory allocation. + * + * @param cc the Skein-256 context (pointer to a + * sph_skein256_context) + */ +void sph_skein256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-256 context + * @param dst the destination buffer + */ +void sph_skein256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). 
The context is automatically reinitialized. + * + * @param cc the Skein-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-384 context. This process performs no memory allocation. + * + * @param cc the Skein-384 context (pointer to a + * sph_skein384_context) + */ +void sph_skein384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-384 context + * @param dst the destination buffer + */ +void sph_skein384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Skein-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-512 context. This process performs no memory allocation. + * + * @param cc the Skein-512 context (pointer to a + * sph_skein512_context) + */ +void sph_skein512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-512 context + * @param dst the destination buffer + */ +void sph_skein512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Skein-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#endif diff --git a/sph_types.h b/sph/sph_types.h similarity index 100% rename from sph_types.h rename to sph/sph_types.h
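Headers like these are typically consumed by the miner as a chained construction, each stage hashing the previous stage's 64-byte digest. A purely illustrative sketch in the luffa -> cubehash -> shavite -> simd -> echo order used by X11-style chains; the actual per-algorithm ordering lives in the miner's own code, not in these headers:

#include <stddef.h>
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"

/* Chain five of the 512-bit functions; `out` doubles as the scratch buffer. */
static void chain512(const void *data, size_t len, unsigned char out[64])
{
	sph_luffa512_context lc;
	sph_cubehash512_context cc;
	sph_shavite512_context sh;
	sph_simd512_context sd;
	sph_echo512_context ec;

	sph_luffa512_init(&lc);
	sph_luffa512(&lc, data, len);
	sph_luffa512_close(&lc, out);

	sph_cubehash512_init(&cc);
	sph_cubehash512(&cc, out, 64);
	sph_cubehash512_close(&cc, out);

	sph_shavite512_init(&sh);
	sph_shavite512(&sh, out, 64);
	sph_shavite512_close(&sh, out);

	sph_simd512_init(&sd);
	sph_simd512(&sd, out, 64);
	sph_simd512_close(&sd, out);

	sph_echo512_init(&ec);
	sph_echo512(&ec, out, 64);
	sph_echo512_close(&ec, out);
}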