diff --git a/JHA/.deps/.dirstamp b/JHA/.deps/.dirstamp new file mode 100644 index 0000000..e69de29 diff --git a/JHA/.dirstamp b/JHA/.dirstamp new file mode 100644 index 0000000..e69de29 diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu new file mode 100644 index 0000000..c59e84e --- /dev/null +++ b/JHA/cuda_jha_keccak512.cu @@ -0,0 +1,572 @@ +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +__constant__ uint64_t c_State[25]; +__constant__ uint32_t c_PaddedMessage[18]; + +static __device__ uint32_t cuda_swab32(uint32_t x) +{ + return __byte_perm(x, 0, 0x0123); +} + +// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt +#if __CUDA_ARCH__ >= 350 +__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#else +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +#define U32TO64_LE(p) \ + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + +#define U64TO32_LE(p, v) \ + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint64_t c_keccak_round_constants[24]; + +static __device__ __forceinline__ void +keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ +#pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +__global__ void jackpot_keccak512_gpu_hash_88(int threads, uint32_t startNounce, uint64_t *g_hash) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + + int hashPosition = nounce - startNounce; + + // Nachricht kopieren + uint32_t message[18]; +#pragma unroll 18 + for(int i=0;i<18;i++) + message[i] = c_PaddedMessage[i]; + + // die individuelle Nounce einsetzen + message[1] = cuda_swab32(nounce); + + // State initialisieren + uint64_t keccak_gpu_state[25]; +#pragma unroll 25 + for (int i=0; i<25; i++) + keccak_gpu_state[i] = c_State[i]; + + // den Block einmal gut durchschütteln + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + // das Hash erzeugen + uint32_t hash[16]; + +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); + } + + // fertig + uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; + +#pragma unroll 16 + for(int i=0;i<16;i++) + outpHash[i] = hash[i]; + } +} + +// Setup-Funktionen +__host__ 
void jackpot_keccak512_cpu_init(int thr_id, int threads) +{ + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( c_keccak_round_constants, + host_keccak_round_constants, + sizeof(host_keccak_round_constants), + 0, cudaMemcpyHostToDevice); +} + +#define cKeccakB 1600 +#define cKeccakR 576 + +#define cKeccakR_SizeInBytes (cKeccakR / 8) +#define crypto_hash_BYTES 64 + +#if (cKeccakB == 1600) + typedef unsigned long long UINT64; + typedef UINT64 tKeccakLane; + #define cKeccakNumberOfRounds 24 +#endif + +#define cKeccakLaneSizeInBits (sizeof(tKeccakLane) * 8) + +#define ROL(a, offset) ((((tKeccakLane)a) << ((offset) % cKeccakLaneSizeInBits)) ^ (((tKeccakLane)a) >> (cKeccakLaneSizeInBits-((offset) % cKeccakLaneSizeInBits)))) +#if ((cKeccakB/25) == 8) + #define ROL_mult8(a, offset) ((tKeccakLane)a) +#else + #define ROL_mult8(a, offset) ROL(a, offset) +#endif +void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ); + +const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = +{ + (tKeccakLane)0x0000000000000001ULL, + (tKeccakLane)0x0000000000008082ULL, + (tKeccakLane)0x800000000000808aULL, + (tKeccakLane)0x8000000080008000ULL, + (tKeccakLane)0x000000000000808bULL, + (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008009ULL, + (tKeccakLane)0x000000000000008aULL, + (tKeccakLane)0x0000000000000088ULL, + (tKeccakLane)0x0000000080008009ULL, + (tKeccakLane)0x000000008000000aULL, + (tKeccakLane)0x000000008000808bULL, + (tKeccakLane)0x800000000000008bULL, + (tKeccakLane)0x8000000000008089ULL, + (tKeccakLane)0x8000000000008003ULL, + (tKeccakLane)0x8000000000008002ULL, + (tKeccakLane)0x8000000000000080ULL + #if (cKeccakB >= 400) + , (tKeccakLane)0x000000000000800aULL, + (tKeccakLane)0x800000008000000aULL + #if (cKeccakB >= 800) + , (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008080ULL + #if (cKeccakB == 1600) + , (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008008ULL + #endif + #endif + #endif +}; + +void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ) +{ + + { + while ( --laneCount >= 0 ) + { + state[laneCount] ^= in[laneCount]; + } + } + + { + tKeccakLane Aba, Abe, Abi, Abo, Abu; + tKeccakLane Aga, Age, Agi, Ago, Agu; + tKeccakLane Aka, Ake, Aki, Ako, Aku; + tKeccakLane Ama, Ame, Ami, Amo, Amu; + tKeccakLane Asa, Ase, Asi, Aso, Asu; + tKeccakLane BCa, BCe, BCi, BCo, BCu; + tKeccakLane Da, De, Di, Do, Du; + tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu; + tKeccakLane Ega, Ege, Egi, Ego, Egu; + tKeccakLane Eka, Eke, Eki, Eko, Eku; + tKeccakLane Ema, Eme, Emi, Emo, Emu; + tKeccakLane Esa, Ese, Esi, Eso, Esu; + #define round laneCount + + //copyFromState(A, state) + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( round = 0; round < cKeccakNumberOfRounds; round += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = 
BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (tKeccakLane)KeccakF_RoundConstants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL_mult8(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL_mult8(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL_mult8(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL_mult8(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= 
De;
+ BCu = ROL(Ese, 2);
+ Asa = BCa ^((~BCe)& BCi );
+ Ase = BCe ^((~BCi)& BCo );
+ Asi = BCi ^((~BCo)& BCu );
+ Aso = BCo ^((~BCu)& BCa );
+ Asu = BCu ^((~BCa)& BCe );
+ }
+
+ //copyToState(state, A)
+ state[ 0] = Aba;
+ state[ 1] = Abe;
+ state[ 2] = Abi;
+ state[ 3] = Abo;
+ state[ 4] = Abu;
+ state[ 5] = Aga;
+ state[ 6] = Age;
+ state[ 7] = Agi;
+ state[ 8] = Ago;
+ state[ 9] = Agu;
+ state[10] = Aka;
+ state[11] = Ake;
+ state[12] = Aki;
+ state[13] = Ako;
+ state[14] = Aku;
+ state[15] = Ama;
+ state[16] = Ame;
+ state[17] = Ami;
+ state[18] = Amo;
+ state[19] = Amu;
+ state[20] = Asa;
+ state[21] = Ase;
+ state[22] = Asi;
+ state[23] = Aso;
+ state[24] = Asu;
+
+ #undef round
+ }
+}
+
+__host__ void jackpot_keccak512_cpu_setBlock_88(void *pdata)
+{
+ unsigned long long inlen = 88;
+ const unsigned char *in = (const unsigned char*)pdata;
+
+ tKeccakLane state[5 * 5];
+ unsigned char temp[cKeccakR_SizeInBytes];
+
+ memset( state, 0, sizeof(state) );
+
+ for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes )
+ {
+ KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) );
+ }
+
+ // copy the state after the first round (i.e. after absorbing 72 bytes of input data)
+ // into constant memory
+ cudaMemcpyToSymbol( c_State,
+ state,
+ sizeof(state),
+ 0, cudaMemcpyHostToDevice);
+
+ // padding
+ memcpy( temp, in, (size_t)inlen );
+ temp[inlen++] = 1;
+ memset( temp+inlen, 0, cKeccakR_SizeInBytes - (size_t)inlen );
+ temp[cKeccakR_SizeInBytes-1] |= 0x80;
+
+ // copy the rest of the message and the padding into constant memory
+ cudaMemcpyToSymbol( c_PaddedMessage,
+ temp,
+ cKeccakR_SizeInBytes,
+ 0, cudaMemcpyHostToDevice);
+}
+
+__host__ void jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order)
+{
+ const int threadsperblock = 256;
+
+ // compute how many thread blocks we need
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ // size of the dynamic shared memory region
+ size_t shared_size = 0;
+
+// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+
+ jackpot_keccak512_gpu_hash_88<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
+ MyStreamSynchronize(NULL, order, thr_id);
+}
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
new file mode 100644
index 0000000..0802eb2
--- /dev/null
+++ b/JHA/jackpotcoin.cu
@@ -0,0 +1,173 @@
+
+extern "C"
+{
+#include "sph/sph_keccak.h"
+#include "sph/sph_blake.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_skein.h"
+}
+
+#include "miner.h"
+#include
+
+// from cpu-miner.c
+extern int device_map[8];
+extern bool opt_benchmark;
+
+// memory for input/output of the chained hash functions
+static uint32_t *d_hash[8];
+
+extern void jackpot_keccak512_cpu_init(int thr_id, int threads);
+extern void jackpot_keccak512_cpu_setBlock_88(void *pdata);
+extern void jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
+
+extern void quark_check_cpu_init(int thr_id, int threads);
+extern void quark_check_cpu_setTarget(const void *ptarget);
+extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
+// the original jackpothash function from a miner source file
+inline unsigned int jackpothash(void *state, const void *input)
+{
+ 
sph_blake512_context ctx_blake; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + + uint32_t hash[16]; + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, input, 88); + sph_keccak512_close(&ctx_keccak, hash); + + unsigned int round_mask = ( + (unsigned int)(((unsigned char *)input)[84]) << 0 | + (unsigned int)(((unsigned char *)input)[85]) << 8 | + (unsigned int)(((unsigned char *)input)[86]) << 16 | + (unsigned int)(((unsigned char *)input)[87]) << 24 ); + unsigned int round_max = hash[0] & round_mask; + unsigned int round; + for (round = 0; round < round_max; round++) { + switch (hash[0] & 3) { + case 0: + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, hash, 64); + sph_blake512_close(&ctx_blake, hash); + break; + case 1: + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, hash, 64); + sph_groestl512_close(&ctx_groestl, hash); + break; + case 2: + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, hash, 64); + sph_jh512_close(&ctx_jh, hash); + break; + case 3: + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash); + break; + } + } + memcpy(state, hash, 32); + + return round_max; +} + + +static int bit_population(uint32_t n){ + int c =0; + while(n){ + c += n&1; + n = n>>1; + } + return c; +} + +extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + + // TODO: entfernen für eine Release! Ist nur zum Testen! + if (opt_benchmark) { + ((uint32_t*)ptarget)[7] = 0x00000f; + ((uint32_t*)pdata)[21] = 0x07000000; // round_mask von 7 vorgeben + } + + const uint32_t Htarg = ptarget[7]; + + const int throughput = 256*4096; // 100; + + static bool init[8] = {0,0,0,0,0,0,0,0}; + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + // Konstanten kopieren, Speicher belegen + cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + jackpot_keccak512_cpu_init(thr_id, throughput); + quark_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[22]; + for (int k=0; k < 22; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + unsigned int round_mask = ( + (unsigned int)(((unsigned char *)endiandata)[84]) << 0 | + (unsigned int)(((unsigned char *)endiandata)[85]) << 8 | + (unsigned int)(((unsigned char *)endiandata)[86]) << 16 | + (unsigned int)(((unsigned char *)endiandata)[87]) << 24 ); + + // Zählen wie viele Bits in round_mask gesetzt sind + int bitcount = bit_population(round_mask); + + jackpot_keccak512_cpu_setBlock_88((void*)endiandata); + quark_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // erstes Blake512 Hash mit CUDA + jackpot_keccak512_cpu_hash_88(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + // TODO: hier fehlen jetzt natürlich noch die anderen Hashrunden. + // bei round_mask=7 haben wir eine 1:8 Chance, dass das Hash dennoch + // die Kriterien erfüllt wenn hash[0] & round_mask zufällig 0 ist. 
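+ // (Illustrative sketch, not part of this patch: once GPU kernels for
+ // blake/groestl/jh/skein exist, the per-round dispatch would mirror the
+ // CPU jackpothash() above. The kernel names below are hypothetical.)
+ //
+ // for (unsigned int round = 0; round < round_max; round++) {
+ //     switch (hash[0] & 3) {
+ //     case 0: quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); break;
+ //     case 1: quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); break;
+ //     case 2: quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); break;
+ //     case 3: quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); break;
+ //     }
+ // }
+ //
+ // Note that round_max and hash[0] differ per nonce, so a faithful GPU port
+ // needs per-thread branching inside one kernel rather than this uniform loop.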
+ + // Scan nach Gewinner Hashes auf der GPU + uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + + // diese jackpothash Funktion gibt die Zahl der zusätzlichen Runden zurück + unsigned int rounds = jackpothash(vhash64, endiandata); + + // wir akzeptieren nur solche Hashes wo ausschliesslich Keccak verwendet wurde + if (rounds == 0) { + if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { + + pdata[19] = foundNonce; + *hashes_done = (foundNonce - first_nonce + 1) / (1 << bitcount); + return 1; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); + } + } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = (pdata[19] - first_nonce + 1) / (1 << bitcount); + return 0; +} diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu new file mode 100644 index 0000000..fb85a24 --- /dev/null +++ b/cuda_myriadgroestl.cu @@ -0,0 +1,622 @@ +// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice + +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// it's unfortunate that this is a compile time constant. +#define MAXWELL_OR_FERMI 0 + +// aus cpu-miner.c +extern int device_map[8]; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; + +// diese Struktur wird in der Init Funktion angefordert +static cudaDeviceProp props; + +// globaler Speicher für alle HeftyHashes aller Threads +__constant__ uint32_t pTarget[8]; // Single GPU +extern uint32_t *d_resultNonce[8]; + +__constant__ uint32_t myriadgroestl_gpu_msg[32]; + +// muss expandiert werden +__constant__ uint32_t myr_sha256_gpu_constantTable[64]; +__constant__ uint32_t myr_sha256_gpu_hashTable[8]; + +uint32_t myr_sha256_cpu_hashTable[] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; +uint32_t myr_sha256_cpu_constantTable[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else + // Kepler (Compute 3.5) + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#endif +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 
11) ^ ROTR32(x, 25)) +#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) +#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + +__device__ void myriadgroestl_gpu_sha256(uint32_t *message) +{ + uint32_t W1[16]; + uint32_t W2[16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + + // pre +#pragma unroll 8 + for (int k=0; k < 8; k++) + { + regs[k] = myr_sha256_gpu_hashTable[k]; + hash[k] = regs[k]; + } + +#pragma unroll 16 + for(int k=0;k<16;k++) + W1[k] = SWAB32(message[k]); + +// Progress W1 +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +// Progress W2...W3 +#pragma unroll 3 + for(int k=0;k<3;k++) + { +#pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; +#pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + +#pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Rundenfunktion +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +#pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + ///// + ///// Zweite Runde (wegen Msg-Padding) + ///// +#pragma unroll 8 + for(int k=0;k<8;k++) + regs[k] = hash[k]; + + W1[0] = SWAB32(0x80); +#pragma unroll 14 + for(int k=1;k<15;k++) + W1[k] = 0; + W1[15] = 512; + +// Progress W1 +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +// Progress W2...W3 +#pragma unroll 3 + for(int k=0;k<3;k++) + { +#pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; +#pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + +#pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Rundenfunktion +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +#pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + //// FERTIG + +#pragma unroll 8 + for(int k=0;k<8;k++) + message[k] = SWAB32(hash[k]); +} + +#define SPH_C32(x) 
((uint32_t)(x ## U))
+#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+
+#define PC32up(j, r) ((uint32_t)((j) + (r)))
+#define PC32dn(j, r) 0
+#define QC32up(j, r) 0xFFFFFFFF
+#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
+
+#define B32_0(x) __byte_perm(x, 0, 0x4440)
+//((x) & 0xFF)
+#define B32_1(x) __byte_perm(x, 0, 0x4441)
+//(((x) >> 8) & 0xFF)
+#define B32_2(x) __byte_perm(x, 0, 0x4442)
+//(((x) >> 16) & 0xFF)
+#define B32_3(x) __byte_perm(x, 0, 0x4443)
+//((x) >> 24)
+
+#if MAXWELL_OR_FERMI
+#define USE_SHARED 1
+// Maxwell and Fermi cards get the best speed with SHARED access it seems.
+#if USE_SHARED
+#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
+#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
+#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
+#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
+#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
+#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
+#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
+#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
+#else
+#define T0up(x) tex1Dfetch(t0up1, x)
+#define T0dn(x) tex1Dfetch(t0dn1, x)
+#define T1up(x) tex1Dfetch(t1up1, x)
+#define T1dn(x) tex1Dfetch(t1dn1, x)
+#define T2up(x) tex1Dfetch(t2up1, x)
+#define T2dn(x) tex1Dfetch(t2dn1, x)
+#define T3up(x) tex1Dfetch(t3up1, x)
+#define T3dn(x) tex1Dfetch(t3dn1, x)
+#endif
+#else
+#define USE_SHARED 1
+// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
+#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
+#define T0dn(x) tex1Dfetch(t0dn1, x)
+#define T1up(x) tex1Dfetch(t1up1, x)
+#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
+#define T2up(x) tex1Dfetch(t2up1, x)
+#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
+#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
+#define T3dn(x) tex1Dfetch(t3dn1, x)
+#endif
+
+texture<unsigned int, 1, cudaReadModeElementType> t0up1;
+texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
+texture<unsigned int, 1, cudaReadModeElementType> t1up1;
+texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
+texture<unsigned int, 1, cudaReadModeElementType> t2up1;
+texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
+texture<unsigned int, 1, cudaReadModeElementType> t3up1;
+texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
+
+extern uint32_t T0up_cpu[];
+extern uint32_t T0dn_cpu[];
+extern uint32_t T1up_cpu[];
+extern uint32_t T1dn_cpu[];
+extern uint32_t T2up_cpu[];
+extern uint32_t T2dn_cpu[];
+extern uint32_t T3up_cpu[];
+extern uint32_t T3dn_cpu[];
+
+#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+
+__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs)
+{
+ uint32_t t[32];
+
+//#pragma unroll 14
+ for(int r=0;r<14;r++)
+ {
+ switch(r)
+ {
+ case 0:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
+ case 1:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
+ case 2:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
+ case 3:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
+ case 4:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
+ case 5:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
+ case 6:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
+ case 7:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
+ case 8:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
+ case 9:
+#pragma unroll 16
+ for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
+ case 10:
+#pragma unroll 16
+ for(int 
k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break; + case 11: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break; + case 12: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break; + case 13: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break; + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); + uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); + uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); + uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); + + t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ + T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); + + t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ + T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs) +{ +//#pragma unroll 14 + for(int r=0;r<14;r++) + { + uint32_t t[32]; + + switch(r) + { + case 0: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; + case 1: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; + case 2: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; + case 3: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break; + case 4: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break; + case 5: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break; + case 6: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break; + case 7: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break; + case 8: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break; + case 9: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break; + case 10: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break; + case 11: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break; + case 12: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break; + case 13: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break; + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); + uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); + uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); + uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); + + t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ 
T2up( t10_2 ) ^ T3up( t22_3 ) ^ + T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); + + t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ + T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__global__ void +myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) +{ +#if USE_SHARED + extern __shared__ char mixtabs[]; + + if (threadIdx.x < 256) + { + *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); + *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); + *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); + *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); + *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); + *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); + } + + __syncthreads(); +#endif + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // GROESTL + uint32_t message[32]; + uint32_t state[32]; + +#pragma unroll 32 + for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k]; + + uint32_t nounce = startNounce + thread; + message[19] = SWAB32(nounce); + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] = message[u]; + state[31] ^= 0x20000; + + // Perm +#if USE_SHARED + myriadgroestl_perm_P(state, mixtabs); + state[31] ^= 0x20000; + myriadgroestl_perm_Q(message, mixtabs); +#else + myriadgroestl_perm_P(state, NULL); + state[31] ^= 0x20000; + myriadgroestl_perm_Q(message, NULL); +#endif +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + +#pragma unroll 32 + for(int u=0;u<32;u++) message[u] = state[u]; + +#if USE_SHARED + myriadgroestl_perm_P(message, mixtabs); +#else + myriadgroestl_perm_P(message, NULL); +#endif + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + + uint32_t out_state[16]; +#pragma unroll 16 + for(int u=0;u<16;u++) out_state[u] = state[u+16]; + myriadgroestl_gpu_sha256(out_state); + + int i, position = -1; + bool rc = true; + +#pragma unroll 8 + for (i = 7; i >= 0; i--) { + if (out_state[i] > pTarget[i]) { + if(position < i) { + position = i; + rc = false; + } + } + if (out_state[i] < pTarget[i]) { + if(position < i) { + position = i; + rc = true; + } + } + } + + if(rc == true) + if(resNounce[0] > nounce) + resNounce[0] = nounce; + } +} + +#define texDef(texname, texmem, texsource, texsize) \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + +// Setup-Funktionen +__host__ void myriadgroestl_cpu_init(int thr_id, int threads) +{ + cudaSetDevice(device_map[thr_id]); + + cudaMemcpyToSymbol( myr_sha256_gpu_hashTable, + myr_sha256_cpu_hashTable, + sizeof(uint32_t) * 8 ); + + cudaMemcpyToSymbol( myr_sha256_gpu_constantTable, + myr_sha256_cpu_constantTable, + sizeof(uint32_t) * 64 ); + + cudaGetDeviceProperties(&props, device_map[thr_id]); + + // Texturen mit obigem Makro initialisieren + texDef(t0up1, d_T0up, T0up_cpu, 
sizeof(uint32_t)*256);
+ texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
+ texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
+ texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
+ texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
+ texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
+ texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
+ texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+
+ // allocate memory for the winning nonce
+ cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
+}
+
+__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
+{
+ // expand and set the message
+ uint32_t msgBlock[32];
+
+ memset(msgBlock, 0, sizeof(uint32_t) * 32);
+ memcpy(&msgBlock[0], data, 80);
+
+ // extend the message to a full message block (padding);
+ // our message is 80 bytes long
+ msgBlock[20] = 0x80;
+ msgBlock[31] = 0x01000000;
+
+ // groestl512 needs no CPU code for this (the single round is
+ // executed on the GPU)
+
+ // set the block header (the correct nonce and the Hefty hash are still missing in there)
+ cudaMemcpyToSymbol( myriadgroestl_gpu_msg,
+ msgBlock,
+ 128);
+
+ cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+ cudaMemcpyToSymbol( pTarget,
+ pTargetIn,
+ sizeof(uint32_t) * 8 );
+}
+
+__host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
+{
+ // Compute 3.x and 5.x devices run best with 768 threads,
+ // all others with 512 threads.
+ int threadsperblock = (props.major >= 3) ? 768 : 512;
+
+ // compute how many thread blocks we need
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ // size of the dynamic shared memory region
+#if USE_SHARED
+ size_t shared_size = 8 * 256 * sizeof(uint32_t);
+#else
+ size_t shared_size = 0;
+#endif
+
+// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+ //fprintf(stderr, "ThrID: %d\n", thr_id);
+ cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+ myriadgroestl_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+
+ // strategic sleep command to lower the CPU load
+ MyStreamSynchronize(NULL, 0, thr_id);
+
+ cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+}
diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp
new file mode 100644
index 0000000..4b2a231
--- /dev/null
+++ b/myriadgroestl.cpp
@@ -0,0 +1,106 @@
+#include "uint256.h"
+#include "sph/sph_groestl.h"
+
+#include "cpuminer-config.h"
+#include "miner.h"
+
+#include
+#include
+#include <openssl/sha.h> /* assumed: SHA256_CTX below requires this header */
+
+extern bool opt_benchmark;
+
+void myriadgroestl_cpu_init(int thr_id, int threads);
+void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
+void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
+
+#define SWAP32(x) \
+ ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
+ (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+static void myriadhash(void *state, const void *input)
+{
+ sph_groestl512_context ctx_groestl;
+
+ uint32_t hashA[16], hashB[16];
+
+ sph_groestl512_init(&ctx_groestl);
+ sph_groestl512 (&ctx_groestl, input, 80);
+ sph_groestl512_close(&ctx_groestl, hashA);
+
+ SHA256_CTX sha256;
+ SHA256_Init(&sha256);
+ SHA256_Update(&sha256,(unsigned char *)hashA, 64);
+ SHA256_Final((unsigned char *)hashB, &sha256);
+ memcpy(state, hashB, 32);
+}
+
+extern "C" 
int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t start_nonce = pdata[19]++; + const uint32_t throughPut = 128 * 1024; +// const uint32_t throughPut = 1; + uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); + + // TODO: entfernen für eine Release! Ist nur zum Testen! + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + const uint32_t Htarg = ptarget[7]; + + // init + static bool init[8] = { false, false, false, false, false, false, false, false }; + if(!init[thr_id]) + { +#if BIG_DEBUG +#else + myriadgroestl_cpu_init(thr_id, throughPut); +#endif + init[thr_id] = true; + } + + uint32_t endiandata[32]; + for (int kk=0; kk < 32; kk++) + be32enc(&endiandata[kk], pdata[kk]); + + // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) + myriadgroestl_cpu_setBlock(thr_id, endiandata, (void*)ptarget); + + do { + // GPU + uint32_t foundNounce = 0xFFFFFFFF; + + myriadgroestl_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); + + if(foundNounce < 0xffffffff) + { + uint32_t tmpHash[8]; + endiandata[19] = SWAP32(foundNounce); + myriadhash(tmpHash, endiandata); + if (tmpHash[7] <= Htarg && + fulltest(tmpHash, ptarget)) { + pdata[19] = foundNounce; + *hashes_done = foundNounce - start_nonce; + free(outputHash); + return true; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); + } + + foundNounce = 0xffffffff; + } + + if (pdata[19] + throughPut < pdata[19]) + pdata[19] = max_nonce; + else pdata[19] += throughPut; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - start_nonce; + free(outputHash); + return 0; +} + diff --git a/quark/.deps/.dirstamp b/quark/.deps/.dirstamp new file mode 100644 index 0000000..e69de29 diff --git a/quark/.dirstamp b/quark/.dirstamp new file mode 100644 index 0000000..e69de29 diff --git a/quark/cuda_quark_checkhash.cu b/quark/cuda_quark_checkhash.cu new file mode 100644 index 0000000..b80da04 --- /dev/null +++ b/quark/cuda_quark_checkhash.cu @@ -0,0 +1,107 @@ +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// das Hash Target gegen das wir testen sollen +__constant__ uint32_t pTarget[8]; + +uint32_t *d_resNounce[8]; +uint32_t *h_resNounce[8]; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +__global__ void quark_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread);
+
+ int hashPosition = nounce - startNounce;
+ uint32_t *inpHash = &g_hash[16 * hashPosition];
+
+ uint32_t hash[8];
+#pragma unroll 8
+ for (int i=0; i < 8; i++)
+ hash[i] = inpHash[i];
+
+ // compare the hash word-for-word against the target
+ int i, position = -1;
+ bool rc = true;
+
+#pragma unroll 8
+ for (i = 7; i >= 0; i--) {
+ if (hash[i] > pTarget[i]) {
+ if(position < i) {
+ position = i;
+ rc = false;
+ }
+ }
+ if (hash[i] < pTarget[i]) {
+ if(position < i) {
+ position = i;
+ rc = true;
+ }
+ }
+ }
+
+ if(rc == true)
+ if(resNounce[0] > nounce)
+ resNounce[0] = nounce;
+ }
+}
+
+// setup functions
+__host__ void quark_check_cpu_init(int thr_id, int threads)
+{
+ cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
+ cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
+}
+
+// set the target difficulty
+__host__ void quark_check_cpu_setTarget(const void *ptarget)
+{
+ // the target for the computation on the GPU
+ cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+}
+
+__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+{
+ uint32_t result = 0xffffffff;
+ cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
+
+ const int threadsperblock = 256;
+
+ // compute how many thread blocks we need
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ // size of the dynamic shared memory region
+ size_t shared_size = 0;
+
+// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+
+ quark_check_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+
+ // strategic sleep command to lower the CPU load
+ MyStreamSynchronize(NULL, order, thr_id);
+
+ // copy the result to the host (into page-locked memory, which is faster)
+ cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+ // cudaMemcpy() is asynchronous!
+ cudaThreadSynchronize();
+ result = *h_resNounce[thr_id];
+
+ return result;
+}
diff --git a/sph/aes_helper.c b/sph/aes_helper.c
new file mode 100644
index 0000000..75b7cc6
--- /dev/null
+++ b/sph/aes_helper.c
@@ -0,0 +1,392 @@
+/* $Id: aes_helper.c 220 2010-06-09 09:21:50Z tp $ */
+/*
+ * AES tables. This file is not meant to be compiled by itself; it
+ * is included by some hash function implementations. It contains
+ * the precomputed tables and helper macros for evaluating an AES
+ * round, optionally with a final XOR with a subkey.
+ *
+ * By default, this file defines the tables and macros for little-endian
+ * processing (i.e. it is assumed that the input bytes have been read
+ * from memory and assembled with the little-endian convention). If
+ * the 'AES_BIG_ENDIAN' macro is defined (to a non-zero integer value)
+ * when this file is included, then the tables and macros for big-endian
+ * processing are defined instead. The big-endian tables and macros have
+ * names distinct from the little-endian tables and macros, hence it is
+ * possible to have both simultaneously, by including this file twice
+ * (with and without the AES_BIG_ENDIAN macro). 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include "sph_types.h" +#ifdef __cplusplus +extern "C"{ +#endif +#if AES_BIG_ENDIAN + +#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) + +#define AES0 AES0_BE +#define AES1 AES1_BE +#define AES2 AES2_BE +#define AES3 AES3_BE + +#define AES_ROUND_BE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \ + (Y0) = AES0[((X0) >> 24) & 0xFF] \ + ^ AES1[((X1) >> 16) & 0xFF] \ + ^ AES2[((X2) >> 8) & 0xFF] \ + ^ AES3[(X3) & 0xFF] ^ (K0); \ + (Y1) = AES0[((X1) >> 24) & 0xFF] \ + ^ AES1[((X2) >> 16) & 0xFF] \ + ^ AES2[((X3) >> 8) & 0xFF] \ + ^ AES3[(X0) & 0xFF] ^ (K1); \ + (Y2) = AES0[((X2) >> 24) & 0xFF] \ + ^ AES1[((X3) >> 16) & 0xFF] \ + ^ AES2[((X0) >> 8) & 0xFF] \ + ^ AES3[(X1) & 0xFF] ^ (K2); \ + (Y3) = AES0[((X3) >> 24) & 0xFF] \ + ^ AES1[((X0) >> 16) & 0xFF] \ + ^ AES2[((X1) >> 8) & 0xFF] \ + ^ AES3[(X2) & 0xFF] ^ (K3); \ + } while (0) + +#define AES_ROUND_NOKEY_BE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + AES_ROUND_BE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) + +#else + +#define AESx(x) SPH_C32(x) +#define AES0 AES0_LE +#define AES1 AES1_LE +#define AES2 AES2_LE +#define AES3 AES3_LE + +#define AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \ + (Y0) = AES0[(X0) & 0xFF] \ + ^ AES1[((X1) >> 8) & 0xFF] \ + ^ AES2[((X2) >> 16) & 0xFF] \ + ^ AES3[((X3) >> 24) & 0xFF] ^ (K0); \ + (Y1) = AES0[(X1) & 0xFF] \ + ^ AES1[((X2) >> 8) & 0xFF] \ + ^ AES2[((X3) >> 16) & 0xFF] \ + ^ AES3[((X0) >> 24) & 0xFF] ^ (K1); \ + (Y2) = AES0[(X2) & 0xFF] \ + ^ AES1[((X3) >> 8) & 0xFF] \ + ^ AES2[((X0) >> 16) & 0xFF] \ + ^ AES3[((X1) >> 24) & 0xFF] ^ (K2); \ + (Y3) = AES0[(X3) & 0xFF] \ + ^ AES1[((X0) >> 8) & 0xFF] \ + ^ AES2[((X1) >> 16) & 0xFF] \ + ^ AES3[((X2) >> 24) & 0xFF] ^ (K3); \ + } while (0) + +#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) + +#endif + +/* + * The AES*[] tables allow us to perform a fast evaluation of an AES + * round; table AESi[] combines SubBytes for a byte at row i, and + * MixColumns for the column where that byte goes after 
ShiftRows. + */ + +static const sph_u32 AES0[256] = { + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), + AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), 
AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +static const sph_u32 AES1[256] = { + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), 
AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +static const sph_u32 AES2[256] = { + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + 
AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + 
AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +static const sph_u32 AES3[256] = { + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), + AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), 
AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + +#ifdef __cplusplus +} +#endif diff --git a/blake.c b/sph/blake.c similarity index 100% rename from blake.c rename to sph/blake.c diff --git a/sph/bmw.c b/sph/bmw.c new file mode 100644 index 0000000..718191d --- /dev/null +++ b/sph/bmw.c @@ -0,0 +1,957 @@ +/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * BMW implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> +#include <limits.h> + +#include "sph_bmw.h" + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW +#define SPH_SMALL_FOOTPRINT_BMW 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 IV224[] = { + SPH_C32(0x00010203), SPH_C32(0x04050607), + SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F), + SPH_C32(0x10111213), SPH_C32(0x14151617), + SPH_C32(0x18191A1B), SPH_C32(0x1C1D1E1F), + SPH_C32(0x20212223), SPH_C32(0x24252627), + SPH_C32(0x28292A2B), SPH_C32(0x2C2D2E2F), + SPH_C32(0x30313233), SPH_C32(0x34353637), + SPH_C32(0x38393A3B), SPH_C32(0x3C3D3E3F) +}; + +static const sph_u32 IV256[] = { + SPH_C32(0x40414243), SPH_C32(0x44454647), + SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F), + SPH_C32(0x50515253), SPH_C32(0x54555657), + SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F), + SPH_C32(0x60616263), SPH_C32(0x64656667), + SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F), + SPH_C32(0x70717273), SPH_C32(0x74757677), + SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F) +}; + +#if SPH_64 + +static const sph_u64 IV384[] = { + SPH_C64(0x0001020304050607), SPH_C64(0x08090A0B0C0D0E0F), + SPH_C64(0x1011121314151617), SPH_C64(0x18191A1B1C1D1E1F), + SPH_C64(0x2021222324252627), SPH_C64(0x28292A2B2C2D2E2F), + SPH_C64(0x3031323334353637), SPH_C64(0x38393A3B3C3D3E3F), + SPH_C64(0x4041424344454647), SPH_C64(0x48494A4B4C4D4E4F), + SPH_C64(0x5051525354555657), SPH_C64(0x58595A5B5C5D5E5F), + SPH_C64(0x6061626364656667), SPH_C64(0x68696A6B6C6D6E6F), + SPH_C64(0x7071727374757677), SPH_C64(0x78797A7B7C7D7E7F) +}; + +static const sph_u64 IV512[] = { + SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), + SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), + SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF), + SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF), + SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF), + SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF), + SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF), + SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) +}; + +#endif + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#define LPAR ( + +#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 +#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 +#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 +#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 +#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 +#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 +#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 +#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 +#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 +#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 +#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + +#define M16_16 0, 1, 3, 4, 7, 10, 11 +#define M16_17 1, 2, 4, 5, 8, 11, 12
+#define M16_18 2, 3, 5, 6, 9, 12, 13 +#define M16_19 3, 4, 6, 7, 10, 13, 14 +#define M16_20 4, 5, 7, 8, 11, 14, 15 +#define M16_21 5, 6, 8, 9, 12, 15, 16 +#define M16_22 6, 7, 9, 10, 13, 0, 1 +#define M16_23 7, 8, 10, 11, 14, 1, 2 +#define M16_24 8, 9, 11, 12, 15, 2, 3 +#define M16_25 9, 10, 12, 13, 0, 3, 4 +#define M16_26 10, 11, 13, 14, 1, 4, 5 +#define M16_27 11, 12, 14, 15, 2, 5, 6 +#define M16_28 12, 13, 15, 16, 3, 6, 7 +#define M16_29 13, 14, 0, 1, 4, 7, 8 +#define M16_30 14, 15, 1, 2, 5, 8, 9 +#define M16_31 15, 16, 2, 3, 6, 9, 10 + +#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \ + ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19)) +#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \ + ^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23)) +#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \ + ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25)) +#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \ + ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29)) +#define ss4(x) (((x) >> 1) ^ (x)) +#define ss5(x) (((x) >> 2) ^ (x)) +#define rs1(x) SPH_ROTL32(x, 3) +#define rs2(x) SPH_ROTL32(x, 7) +#define rs3(x) SPH_ROTL32(x, 13) +#define rs4(x) SPH_ROTL32(x, 16) +#define rs5(x) SPH_ROTL32(x, 19) +#define rs6(x) SPH_ROTL32(x, 23) +#define rs7(x) SPH_ROTL32(x, 27) + +#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555)) + +#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ + (SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \ + - SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m)) + +#define expand1s_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \ + + ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \ + + ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \ + + ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \ + + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand1s(qf, mf, hf, i16) \ + expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand1s_(qf, mf, hf, i16, ix, iy) \ + expand1s_inner LPAR qf, mf, hf, i16, ix, iy) + +#define expand2s_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \ + + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \ + + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \ + + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \ + + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand2s(qf, mf, hf, i16) \ + expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand2s_(qf, mf, hf, i16, ix, iy) \ + expand2s_inner LPAR qf, mf, hf, i16, ix, iy) + +#if SPH_64 + +#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \ + ^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37)) +#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \ + ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43)) +#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \ + ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53)) +#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \ + ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59)) +#define sb4(x) (((x) >> 1) ^ (x)) +#define sb5(x) (((x) >> 2) ^ (x)) +#define rb1(x) SPH_ROTL64(x, 5) +#define rb2(x) SPH_ROTL64(x, 11) +#define rb3(x) SPH_ROTL64(x, 27) +#define rb4(x) SPH_ROTL64(x, 32) +#define rb5(x) SPH_ROTL64(x, 37) +#define rb6(x) SPH_ROTL64(x, 43) +#define rb7(x) SPH_ROTL64(x, 53) + +#define Kb(j) SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555)) + +#if 
SPH_SMALL_FOOTPRINT_BMW + +static const sph_u64 Kb_tab[] = { + Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23), + Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31) +}; + +#define rol_off(mf, j, off) \ + SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1) + +#define add_elt_b(mf, hf, j) \ + (SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \ + - rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15)) + +#define expand1b(qf, mf, hf, i) \ + SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \ + + sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \ + + sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \ + + sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \ + + sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \ + + sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \ + + sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \ + + sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \ + + add_elt_b(mf, hf, (i) - 16)) + +#define expand2b(qf, mf, hf, i) \ + SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \ + + qf((i) - 14) + rb2(qf((i) - 13)) \ + + qf((i) - 12) + rb3(qf((i) - 11)) \ + + qf((i) - 10) + rb4(qf((i) - 9)) \ + + qf((i) - 8) + rb5(qf((i) - 7)) \ + + qf((i) - 6) + rb6(qf((i) - 5)) \ + + qf((i) - 4) + rb7(qf((i) - 3)) \ + + sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \ + + add_elt_b(mf, hf, (i) - 16)) + +#else + +#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ + (SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \ + - SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m)) + +#define expand1b_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \ + + sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \ + + sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \ + + sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \ + + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand1b(qf, mf, hf, i16) \ + expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand1b_(qf, mf, hf, i16, ix, iy) \ + expand1b_inner LPAR qf, mf, hf, i16, ix, iy) + +#define expand2b_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \ + + qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \ + + qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \ + + qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \ + + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand2b(qf, mf, hf, i16) \ + expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand2b_(qf, mf, hf, i16, ix, iy) \ + expand2b_inner LPAR qf, mf, hf, i16, ix, iy) + +#endif + +#endif + +#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \ + tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \ + op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4))) + +#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) +#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) +#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15) +#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13) +#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14) +#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15) +#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13) +#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14) +#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15) +#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14) +#define Ws10 MAKE_W(SPH_T32, 8, 
-, 1, -, 4, -, 7, +, 15) +#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9) +#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10) +#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11) +#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12) +#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13) + +#if SPH_SMALL_FOOTPRINT_BMW + +#define MAKE_Qas do { \ + unsigned u; \ + sph_u32 Ws[16]; \ + Ws[ 0] = Ws0; \ + Ws[ 1] = Ws1; \ + Ws[ 2] = Ws2; \ + Ws[ 3] = Ws3; \ + Ws[ 4] = Ws4; \ + Ws[ 5] = Ws5; \ + Ws[ 6] = Ws6; \ + Ws[ 7] = Ws7; \ + Ws[ 8] = Ws8; \ + Ws[ 9] = Ws9; \ + Ws[10] = Ws10; \ + Ws[11] = Ws11; \ + Ws[12] = Ws12; \ + Ws[13] = Ws13; \ + Ws[14] = Ws14; \ + Ws[15] = Ws15; \ + for (u = 0; u < 15; u += 5) { \ + qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \ + qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \ + qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \ + qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \ + qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \ + } \ + qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \ + } while (0) + +#define MAKE_Qbs do { \ + qt[16] = expand1s(Qs, M, H, 16); \ + qt[17] = expand1s(Qs, M, H, 17); \ + qt[18] = expand2s(Qs, M, H, 18); \ + qt[19] = expand2s(Qs, M, H, 19); \ + qt[20] = expand2s(Qs, M, H, 20); \ + qt[21] = expand2s(Qs, M, H, 21); \ + qt[22] = expand2s(Qs, M, H, 22); \ + qt[23] = expand2s(Qs, M, H, 23); \ + qt[24] = expand2s(Qs, M, H, 24); \ + qt[25] = expand2s(Qs, M, H, 25); \ + qt[26] = expand2s(Qs, M, H, 26); \ + qt[27] = expand2s(Qs, M, H, 27); \ + qt[28] = expand2s(Qs, M, H, 28); \ + qt[29] = expand2s(Qs, M, H, 29); \ + qt[30] = expand2s(Qs, M, H, 30); \ + qt[31] = expand2s(Qs, M, H, 31); \ + } while (0) + +#else + +#define MAKE_Qas do { \ + qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \ + qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \ + qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \ + qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \ + qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \ + qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \ + qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \ + qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \ + qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \ + qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \ + qt[10] = SPH_T32(ss0(Ws10) + H(11)); \ + qt[11] = SPH_T32(ss1(Ws11) + H(12)); \ + qt[12] = SPH_T32(ss2(Ws12) + H(13)); \ + qt[13] = SPH_T32(ss3(Ws13) + H(14)); \ + qt[14] = SPH_T32(ss4(Ws14) + H(15)); \ + qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \ + } while (0) + +#define MAKE_Qbs do { \ + qt[16] = expand1s(Qs, M, H, 16); \ + qt[17] = expand1s(Qs, M, H, 17); \ + qt[18] = expand2s(Qs, M, H, 18); \ + qt[19] = expand2s(Qs, M, H, 19); \ + qt[20] = expand2s(Qs, M, H, 20); \ + qt[21] = expand2s(Qs, M, H, 21); \ + qt[22] = expand2s(Qs, M, H, 22); \ + qt[23] = expand2s(Qs, M, H, 23); \ + qt[24] = expand2s(Qs, M, H, 24); \ + qt[25] = expand2s(Qs, M, H, 25); \ + qt[26] = expand2s(Qs, M, H, 26); \ + qt[27] = expand2s(Qs, M, H, 27); \ + qt[28] = expand2s(Qs, M, H, 28); \ + qt[29] = expand2s(Qs, M, H, 29); \ + qt[30] = expand2s(Qs, M, H, 30); \ + qt[31] = expand2s(Qs, M, H, 31); \ + } while (0) + +#endif + +#define MAKE_Qs do { \ + MAKE_Qas; \ + MAKE_Qbs; \ + } while (0) + +#define Qs(j) (qt[j]) + +#if SPH_64 + +#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14) +#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15) +#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15) +#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13) +#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14) +#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15) +#define Wb6 MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, 
+, 13) +#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14) +#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15) +#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14) +#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15) +#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9) +#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10) +#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11) +#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12) +#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13) + +#if SPH_SMALL_FOOTPRINT_BMW + +#define MAKE_Qab do { \ + unsigned u; \ + sph_u64 Wb[16]; \ + Wb[ 0] = Wb0; \ + Wb[ 1] = Wb1; \ + Wb[ 2] = Wb2; \ + Wb[ 3] = Wb3; \ + Wb[ 4] = Wb4; \ + Wb[ 5] = Wb5; \ + Wb[ 6] = Wb6; \ + Wb[ 7] = Wb7; \ + Wb[ 8] = Wb8; \ + Wb[ 9] = Wb9; \ + Wb[10] = Wb10; \ + Wb[11] = Wb11; \ + Wb[12] = Wb12; \ + Wb[13] = Wb13; \ + Wb[14] = Wb14; \ + Wb[15] = Wb15; \ + for (u = 0; u < 15; u += 5) { \ + qt[u + 0] = SPH_T64(sb0(Wb[u + 0]) + H(u + 1)); \ + qt[u + 1] = SPH_T64(sb1(Wb[u + 1]) + H(u + 2)); \ + qt[u + 2] = SPH_T64(sb2(Wb[u + 2]) + H(u + 3)); \ + qt[u + 3] = SPH_T64(sb3(Wb[u + 3]) + H(u + 4)); \ + qt[u + 4] = SPH_T64(sb4(Wb[u + 4]) + H(u + 5)); \ + } \ + qt[15] = SPH_T64(sb0(Wb[15]) + H(0)); \ + } while (0) + +#define MAKE_Qbb do { \ + unsigned u; \ + for (u = 16; u < 18; u ++) \ + qt[u] = expand1b(Qb, M, H, u); \ + for (u = 18; u < 32; u ++) \ + qt[u] = expand2b(Qb, M, H, u); \ + } while (0) + +#else + +#define MAKE_Qab do { \ + qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \ + qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \ + qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \ + qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \ + qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \ + qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \ + qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \ + qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \ + qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \ + qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \ + qt[10] = SPH_T64(sb0(Wb10) + H(11)); \ + qt[11] = SPH_T64(sb1(Wb11) + H(12)); \ + qt[12] = SPH_T64(sb2(Wb12) + H(13)); \ + qt[13] = SPH_T64(sb3(Wb13) + H(14)); \ + qt[14] = SPH_T64(sb4(Wb14) + H(15)); \ + qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \ + } while (0) + +#define MAKE_Qbb do { \ + qt[16] = expand1b(Qb, M, H, 16); \ + qt[17] = expand1b(Qb, M, H, 17); \ + qt[18] = expand2b(Qb, M, H, 18); \ + qt[19] = expand2b(Qb, M, H, 19); \ + qt[20] = expand2b(Qb, M, H, 20); \ + qt[21] = expand2b(Qb, M, H, 21); \ + qt[22] = expand2b(Qb, M, H, 22); \ + qt[23] = expand2b(Qb, M, H, 23); \ + qt[24] = expand2b(Qb, M, H, 24); \ + qt[25] = expand2b(Qb, M, H, 25); \ + qt[26] = expand2b(Qb, M, H, 26); \ + qt[27] = expand2b(Qb, M, H, 27); \ + qt[28] = expand2b(Qb, M, H, 28); \ + qt[29] = expand2b(Qb, M, H, 29); \ + qt[30] = expand2b(Qb, M, H, 30); \ + qt[31] = expand2b(Qb, M, H, 31); \ + } while (0) + +#endif + +#define MAKE_Qb do { \ + MAKE_Qab; \ + MAKE_Qbb; \ + } while (0) + +#define Qb(j) (qt[j]) + +#endif + +#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \ + type qt[32], xl, xh; \ + mkQ; \ + xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \ + ^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \ + xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \ + ^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \ + dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \ + + (xl ^ qf(24) ^ qf( 0))); \ + dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \ + + (xl ^ qf(25) ^ qf( 1))); \ + dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \ + + (xl ^ qf(26) ^ qf( 2))); \ + dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \ + + (xl ^ qf(27) ^ qf( 3))); \ + dhf( 4) = tt(((xh >> 3) ^ (qf(20) 
<< 0) ^ mf( 4)) \ + + (xl ^ qf(28) ^ qf( 4))); \ + dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \ + + (xl ^ qf(29) ^ qf( 5))); \ + dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \ + + (xl ^ qf(30) ^ qf( 6))); \ + dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \ + + (xl ^ qf(31) ^ qf( 7))); \ + dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \ + + ((xl << 8) ^ qf(23) ^ qf( 8))); \ + dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \ + + ((xl >> 6) ^ qf(16) ^ qf( 9))); \ + dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \ + + ((xl << 6) ^ qf(17) ^ qf(10))); \ + dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \ + + ((xl << 4) ^ qf(18) ^ qf(11))); \ + dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \ + + ((xl >> 3) ^ qf(19) ^ qf(12))); \ + dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \ + + ((xl >> 4) ^ qf(20) ^ qf(13))); \ + dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \ + + ((xl >> 7) ^ qf(21) ^ qf(14))); \ + dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \ + + ((xl >> 2) ^ qf(22) ^ qf(15))); \ + } while (0) + +#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH) + +#if SPH_64 + +#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH) + +#endif + +static void +compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16]) +{ +#if SPH_LITTLE_FAST +#define M(x) sph_dec32le_aligned(data + 4 * (x)) +#else + sph_u32 mv[16]; + + mv[ 0] = sph_dec32le_aligned(data + 0); + mv[ 1] = sph_dec32le_aligned(data + 4); + mv[ 2] = sph_dec32le_aligned(data + 8); + mv[ 3] = sph_dec32le_aligned(data + 12); + mv[ 4] = sph_dec32le_aligned(data + 16); + mv[ 5] = sph_dec32le_aligned(data + 20); + mv[ 6] = sph_dec32le_aligned(data + 24); + mv[ 7] = sph_dec32le_aligned(data + 28); + mv[ 8] = sph_dec32le_aligned(data + 32); + mv[ 9] = sph_dec32le_aligned(data + 36); + mv[10] = sph_dec32le_aligned(data + 40); + mv[11] = sph_dec32le_aligned(data + 44); + mv[12] = sph_dec32le_aligned(data + 48); + mv[13] = sph_dec32le_aligned(data + 52); + mv[14] = sph_dec32le_aligned(data + 56); + mv[15] = sph_dec32le_aligned(data + 60); +#define M(x) (mv[x]) +#endif +#define H(x) (h[x]) +#define dH(x) (dh[x]) + + FOLDs; + +#undef M +#undef H +#undef dH +} + +static const sph_u32 final_s[16] = { + SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2), + SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5), + SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8), + SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab), + SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae), + SPH_C32(0xaaaaaaaf) +}; + +static void +bmw32_init(sph_bmw_small_context *sc, const sph_u32 *iv) +{ + memcpy(sc->H, iv, sizeof sc->H); + sc->ptr = 0; +#if SPH_64 + sc->bit_count = 0; +#else + sc->bit_count_high = 0; + sc->bit_count_low = 0; +#endif +} + +static void +bmw32(sph_bmw_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + sph_u32 htmp[16]; + sph_u32 *h1, *h2; +#if !SPH_64 + sph_u32 tmp; +#endif + +#if SPH_64 + sc->bit_count += (sph_u64)len << 3; +#else + tmp = sc->bit_count_low; + sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3)); + if (sc->bit_count_low < tmp) + sc->bit_count_high ++; + sc->bit_count_high += len >> 29; +#endif + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + 
len -= clen; + ptr += clen; + if (ptr == sizeof sc->buf) { + sph_u32 *ht; + + compress_small(buf, h1, h2); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +static void +bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr, u, v; + unsigned z; + sph_u32 h1[16], h2[16], *h; + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + h = sc->H; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + compress_small(buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr); +#if SPH_64 + sph_enc64le_aligned(buf + (sizeof sc->buf) - 8, + SPH_T64(sc->bit_count + n)); +#else + sph_enc32le_aligned(buf + (sizeof sc->buf) - 8, + sc->bit_count_low + n); + sph_enc32le_aligned(buf + (sizeof sc->buf) - 4, + SPH_T32(sc->bit_count_high)); +#endif + compress_small(buf, h, h2); + for (u = 0; u < 16; u ++) + sph_enc32le_aligned(buf + 4 * u, h2[u]); + compress_small(buf, final_s, h1); + out = dst; + for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) + sph_enc32le(out + 4 * u, h1[v]); +} + +#if SPH_64 + +static void +compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) +{ +#if SPH_LITTLE_FAST +#define M(x) sph_dec64le_aligned(data + 8 * (x)) +#else + sph_u64 mv[16]; + + mv[ 0] = sph_dec64le_aligned(data + 0); + mv[ 1] = sph_dec64le_aligned(data + 8); + mv[ 2] = sph_dec64le_aligned(data + 16); + mv[ 3] = sph_dec64le_aligned(data + 24); + mv[ 4] = sph_dec64le_aligned(data + 32); + mv[ 5] = sph_dec64le_aligned(data + 40); + mv[ 6] = sph_dec64le_aligned(data + 48); + mv[ 7] = sph_dec64le_aligned(data + 56); + mv[ 8] = sph_dec64le_aligned(data + 64); + mv[ 9] = sph_dec64le_aligned(data + 72); + mv[10] = sph_dec64le_aligned(data + 80); + mv[11] = sph_dec64le_aligned(data + 88); + mv[12] = sph_dec64le_aligned(data + 96); + mv[13] = sph_dec64le_aligned(data + 104); + mv[14] = sph_dec64le_aligned(data + 112); + mv[15] = sph_dec64le_aligned(data + 120); +#define M(x) (mv[x]) +#endif +#define H(x) (h[x]) +#define dH(x) (dh[x]) + + FOLDb; + +#undef M +#undef H +#undef dH +} + +static const sph_u64 final_b[16] = { + SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1), + SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3), + SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5), + SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7), + SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9), + SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab), + SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad), + SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf) +}; + +static void +bmw64_init(sph_bmw_big_context *sc, const sph_u64 *iv) +{ + memcpy(sc->H, iv, sizeof sc->H); + sc->ptr = 0; + sc->bit_count = 0; +} + +static void +bmw64(sph_bmw_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + sph_u64 htmp[16]; + sph_u64 *h1, *h2; + + sc->bit_count += (sph_u64)len << 3; + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + ptr += clen; + if (ptr == sizeof sc->buf) { + sph_u64 *ht; + + compress_big(buf, h1, h2); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr 
= ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +static void +bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w64) +{ + unsigned char *buf, *out; + size_t ptr, u, v; + unsigned z; + sph_u64 h1[16], h2[16], *h; + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + h = sc->H; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + compress_big(buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr); + sph_enc64le_aligned(buf + (sizeof sc->buf) - 8, + SPH_T64(sc->bit_count + n)); + compress_big(buf, h, h2); + for (u = 0; u < 16; u ++) + sph_enc64le_aligned(buf + 8 * u, h2[u]); + compress_big(buf, final_b, h1); + out = dst; + for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) + sph_enc64le(out + 8 * u, h1[v]); +} + +#endif + +/* see sph_bmw.h */ +void +sph_bmw224_init(void *cc) +{ + bmw32_init(cc, IV224); +} + +/* see sph_bmw.h */ +void +sph_bmw224(void *cc, const void *data, size_t len) +{ + bmw32(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw224_close(void *cc, void *dst) +{ + sph_bmw224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw32_close(cc, ub, n, dst, 7); + sph_bmw224_init(cc); +} + +/* see sph_bmw.h */ +void +sph_bmw256_init(void *cc) +{ + bmw32_init(cc, IV256); +} + +/* see sph_bmw.h */ +void +sph_bmw256(void *cc, const void *data, size_t len) +{ + bmw32(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw256_close(void *cc, void *dst) +{ + sph_bmw256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw32_close(cc, ub, n, dst, 8); + sph_bmw256_init(cc); +} + +#if SPH_64 + +/* see sph_bmw.h */ +void +sph_bmw384_init(void *cc) +{ + bmw64_init(cc, IV384); +} + +/* see sph_bmw.h */ +void +sph_bmw384(void *cc, const void *data, size_t len) +{ + bmw64(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw384_close(void *cc, void *dst) +{ + sph_bmw384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw64_close(cc, ub, n, dst, 6); + sph_bmw384_init(cc); +} + +/* see sph_bmw.h */ +void +sph_bmw512_init(void *cc) +{ + bmw64_init(cc, IV512); +} + +/* see sph_bmw.h */ +void +sph_bmw512(void *cc, const void *data, size_t len) +{ + bmw64(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw512_close(void *cc, void *dst) +{ + sph_bmw512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw64_close(cc, ub, n, dst, 8); + sph_bmw512_init(cc); +} + +#endif diff --git a/sph/cubehash.c b/sph/cubehash.c new file mode 100644 index 0000000..9322fe1 --- /dev/null +++ b/sph/cubehash.c @@ -0,0 +1,723 @@ +/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * CubeHash implementation. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> +#include <limits.h> + +#include "sph_cubehash.h" +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_CUBEHASH +#define SPH_SMALL_FOOTPRINT_CUBEHASH 1 +#endif + +/* + * Some tests were conducted on an Intel Core2 Q6600 (32-bit and 64-bit + * mode), a PowerPC G3, and a MIPS-compatible CPU (Broadcom BCM3302). + * It appears that the optimal settings are: + * -- full unroll, no state copy on the "big" systems (x86, PowerPC) + * -- unroll to 4 or 8, state copy on the "small" system (MIPS) + */ + +#if SPH_SMALL_FOOTPRINT_CUBEHASH + +#if !defined SPH_CUBEHASH_UNROLL +#define SPH_CUBEHASH_UNROLL 4 +#endif +#if !defined SPH_CUBEHASH_NOCOPY +#define SPH_CUBEHASH_NOCOPY 1 +#endif + +#else + +#if !defined SPH_CUBEHASH_UNROLL +#define SPH_CUBEHASH_UNROLL 0 +#endif +#if !defined SPH_CUBEHASH_NOCOPY +#define SPH_CUBEHASH_NOCOPY 0 +#endif + +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 IV224[] = { + SPH_C32(0xB0FC8217), SPH_C32(0x1BEE1A90), SPH_C32(0x829E1A22), + SPH_C32(0x6362C342), SPH_C32(0x24D91C30), SPH_C32(0x03A7AA24), + SPH_C32(0xA63721C8), SPH_C32(0x85B0E2EF), SPH_C32(0xF35D13F3), + SPH_C32(0x41DA807D), SPH_C32(0x21A70CA6), SPH_C32(0x1F4E9774), + SPH_C32(0xB3E1C932), SPH_C32(0xEB0A79A8), SPH_C32(0xCDDAAA66), + SPH_C32(0xE2F6ECAA), SPH_C32(0x0A713362), SPH_C32(0xAA3080E0), + SPH_C32(0xD8F23A32), SPH_C32(0xCEF15E28), SPH_C32(0xDB086314), + SPH_C32(0x7F709DF7), SPH_C32(0xACD228A4), SPH_C32(0x704D6ECE), + SPH_C32(0xAA3EC95F), SPH_C32(0xE387C214), SPH_C32(0x3A6445FF), + SPH_C32(0x9CAB81C3), SPH_C32(0xC73D4B98), SPH_C32(0xD277AEBE), + SPH_C32(0xFD20151C), SPH_C32(0x00CB573E) +}; + +static const sph_u32 IV256[] = { + SPH_C32(0xEA2BD4B4), SPH_C32(0xCCD6F29F), SPH_C32(0x63117E71), + SPH_C32(0x35481EAE), SPH_C32(0x22512D5B), SPH_C32(0xE5D94E63), + SPH_C32(0x7E624131), SPH_C32(0xF4CC12BE), SPH_C32(0xC2D0B696), + SPH_C32(0x42AF2070), SPH_C32(0xD0720C35), SPH_C32(0x3361DA8C), + SPH_C32(0x28CCECA4), SPH_C32(0x8EF8AD83), SPH_C32(0x4680AC00), + SPH_C32(0x40E5FBAB), SPH_C32(0xD89041C3), SPH_C32(0x6107FBD5), + SPH_C32(0x6C859D41), SPH_C32(0xF0B26679), SPH_C32(0x09392549), + SPH_C32(0x5FA25603),
SPH_C32(0x65C892FD), SPH_C32(0x93CB6285), + SPH_C32(0x2AF2B5AE), SPH_C32(0x9E4B4E60), SPH_C32(0x774ABFDD), + SPH_C32(0x85254725), SPH_C32(0x15815AEB), SPH_C32(0x4AB6AAD6), + SPH_C32(0x9CDAF8AF), SPH_C32(0xD6032C0A) +}; + +static const sph_u32 IV384[] = { + SPH_C32(0xE623087E), SPH_C32(0x04C00C87), SPH_C32(0x5EF46453), + SPH_C32(0x69524B13), SPH_C32(0x1A05C7A9), SPH_C32(0x3528DF88), + SPH_C32(0x6BDD01B5), SPH_C32(0x5057B792), SPH_C32(0x6AA7A922), + SPH_C32(0x649C7EEE), SPH_C32(0xF426309F), SPH_C32(0xCB629052), + SPH_C32(0xFC8E20ED), SPH_C32(0xB3482BAB), SPH_C32(0xF89E5E7E), + SPH_C32(0xD83D4DE4), SPH_C32(0x44BFC10D), SPH_C32(0x5FC1E63D), + SPH_C32(0x2104E6CB), SPH_C32(0x17958F7F), SPH_C32(0xDBEAEF70), + SPH_C32(0xB4B97E1E), SPH_C32(0x32C195F6), SPH_C32(0x6184A8E4), + SPH_C32(0x796C2543), SPH_C32(0x23DE176D), SPH_C32(0xD33BBAEC), + SPH_C32(0x0C12E5D2), SPH_C32(0x4EB95A7B), SPH_C32(0x2D18BA01), + SPH_C32(0x04EE475F), SPH_C32(0x1FC5F22E) +}; + +static const sph_u32 IV512[] = { + SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B), + SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C), + SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787), + SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537), + SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33), + SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485), + SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159), + SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9), + SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456), + SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1), + SPH_C32(0x7795D246), SPH_C32(0xD43E3B44) +}; + +#define T32 SPH_T32 +#define ROTL32 SPH_ROTL32 + +#if SPH_CUBEHASH_NOCOPY + +#define DECL_STATE +#define READ_STATE(cc) +#define WRITE_STATE(cc) + +#define x0 ((sc)->state[ 0]) +#define x1 ((sc)->state[ 1]) +#define x2 ((sc)->state[ 2]) +#define x3 ((sc)->state[ 3]) +#define x4 ((sc)->state[ 4]) +#define x5 ((sc)->state[ 5]) +#define x6 ((sc)->state[ 6]) +#define x7 ((sc)->state[ 7]) +#define x8 ((sc)->state[ 8]) +#define x9 ((sc)->state[ 9]) +#define xa ((sc)->state[10]) +#define xb ((sc)->state[11]) +#define xc ((sc)->state[12]) +#define xd ((sc)->state[13]) +#define xe ((sc)->state[14]) +#define xf ((sc)->state[15]) +#define xg ((sc)->state[16]) +#define xh ((sc)->state[17]) +#define xi ((sc)->state[18]) +#define xj ((sc)->state[19]) +#define xk ((sc)->state[20]) +#define xl ((sc)->state[21]) +#define xm ((sc)->state[22]) +#define xn ((sc)->state[23]) +#define xo ((sc)->state[24]) +#define xp ((sc)->state[25]) +#define xq ((sc)->state[26]) +#define xr ((sc)->state[27]) +#define xs ((sc)->state[28]) +#define xt ((sc)->state[29]) +#define xu ((sc)->state[30]) +#define xv ((sc)->state[31]) + +#else + +#define DECL_STATE \ + sph_u32 x0, x1, x2, x3, x4, x5, x6, x7; \ + sph_u32 x8, x9, xa, xb, xc, xd, xe, xf; \ + sph_u32 xg, xh, xi, xj, xk, xl, xm, xn; \ + sph_u32 xo, xp, xq, xr, xs, xt, xu, xv; + +#define READ_STATE(cc) do { \ + x0 = (cc)->state[ 0]; \ + x1 = (cc)->state[ 1]; \ + x2 = (cc)->state[ 2]; \ + x3 = (cc)->state[ 3]; \ + x4 = (cc)->state[ 4]; \ + x5 = (cc)->state[ 5]; \ + x6 = (cc)->state[ 6]; \ + x7 = (cc)->state[ 7]; \ + x8 = (cc)->state[ 8]; \ + x9 = (cc)->state[ 9]; \ + xa = (cc)->state[10]; \ + xb = (cc)->state[11]; \ + xc = (cc)->state[12]; \ + xd = (cc)->state[13]; \ + xe = (cc)->state[14]; \ + xf = (cc)->state[15]; \ + xg = (cc)->state[16]; \ + xh = (cc)->state[17]; \ + xi = (cc)->state[18]; \ + xj = (cc)->state[19]; \ 
+ xk = (cc)->state[20]; \ + xl = (cc)->state[21]; \ + xm = (cc)->state[22]; \ + xn = (cc)->state[23]; \ + xo = (cc)->state[24]; \ + xp = (cc)->state[25]; \ + xq = (cc)->state[26]; \ + xr = (cc)->state[27]; \ + xs = (cc)->state[28]; \ + xt = (cc)->state[29]; \ + xu = (cc)->state[30]; \ + xv = (cc)->state[31]; \ + } while (0) + +#define WRITE_STATE(cc) do { \ + (cc)->state[ 0] = x0; \ + (cc)->state[ 1] = x1; \ + (cc)->state[ 2] = x2; \ + (cc)->state[ 3] = x3; \ + (cc)->state[ 4] = x4; \ + (cc)->state[ 5] = x5; \ + (cc)->state[ 6] = x6; \ + (cc)->state[ 7] = x7; \ + (cc)->state[ 8] = x8; \ + (cc)->state[ 9] = x9; \ + (cc)->state[10] = xa; \ + (cc)->state[11] = xb; \ + (cc)->state[12] = xc; \ + (cc)->state[13] = xd; \ + (cc)->state[14] = xe; \ + (cc)->state[15] = xf; \ + (cc)->state[16] = xg; \ + (cc)->state[17] = xh; \ + (cc)->state[18] = xi; \ + (cc)->state[19] = xj; \ + (cc)->state[20] = xk; \ + (cc)->state[21] = xl; \ + (cc)->state[22] = xm; \ + (cc)->state[23] = xn; \ + (cc)->state[24] = xo; \ + (cc)->state[25] = xp; \ + (cc)->state[26] = xq; \ + (cc)->state[27] = xr; \ + (cc)->state[28] = xs; \ + (cc)->state[29] = xt; \ + (cc)->state[30] = xu; \ + (cc)->state[31] = xv; \ + } while (0) + +#endif + +#define INPUT_BLOCK do { \ + x0 ^= sph_dec32le_aligned(buf + 0); \ + x1 ^= sph_dec32le_aligned(buf + 4); \ + x2 ^= sph_dec32le_aligned(buf + 8); \ + x3 ^= sph_dec32le_aligned(buf + 12); \ + x4 ^= sph_dec32le_aligned(buf + 16); \ + x5 ^= sph_dec32le_aligned(buf + 20); \ + x6 ^= sph_dec32le_aligned(buf + 24); \ + x7 ^= sph_dec32le_aligned(buf + 28); \ + } while (0) + +#define ROUND_EVEN do { \ + xg = T32(x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = T32(x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = T32(x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = T32(x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = T32(x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = T32(x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = T32(x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = T32(x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = T32(x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = T32(x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = T32(xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = T32(xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = T32(xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = T32(xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = T32(xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = T32(xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = T32(x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = T32(x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = T32(xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = T32(xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = T32(xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = T32(xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = T32(xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = T32(xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = T32(x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = T32(x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = T32(x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = T32(x3 + xp); \ + x3 = ROTL32(x3, 11); \ + xu = T32(x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = T32(x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = T32(x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = T32(x7 + xt); \ + x7 = ROTL32(x7, 11); \ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ 
+ x2 ^= xs; \ + x3 ^= xt; \ + } while (0) + +#define ROUND_ODD do { \ + xj = T32(xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = T32(xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = T32(xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = T32(xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = T32(x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = T32(x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = T32(xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = T32(xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = T32(x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = T32(x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = T32(x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = T32(x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = T32(x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = T32(x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = T32(x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = T32(x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = T32(x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = T32(x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = T32(x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = T32(x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = T32(x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = T32(x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = T32(x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = T32(x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = T32(xc + xp); \ + xc = ROTL32(xc, 11); \ + xo = T32(xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = T32(xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = T32(xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = T32(x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = T32(x9 + xs); \ + x9 = ROTL32(x9, 11); \ + xv = T32(xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = T32(xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; \ + } while (0) + +/* + * There is no need to unroll all 16 rounds. The word-swapping permutation + * is an involution, so we need to unroll an even number of rounds. On + * "big" systems, unrolling 4 rounds yields about 97% of the speed + * achieved with full unrolling; and it keeps the code more compact + * for small architectures. 
+ */ + +#if SPH_CUBEHASH_UNROLL == 2 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#elif SPH_CUBEHASH_UNROLL == 4 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 4; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#elif SPH_CUBEHASH_UNROLL == 8 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 2; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#else + +#define SIXTEEN_ROUNDS do { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } while (0) + +#endif + +static void +cubehash_init(sph_cubehash_context *sc, const sph_u32 *iv) +{ + memcpy(sc->state, iv, sizeof sc->state); + sc->ptr = 0; +} + +static void +cubehash_core(sph_cubehash_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INPUT_BLOCK; + SIXTEEN_ROUNDS; + ptr = 0; + } + } + WRITE_STATE(sc); + sc->ptr = ptr; +} + +static void +cubehash_close(sph_cubehash_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE(sc); + INPUT_BLOCK; + for (i = 0; i < 11; i ++) { + SIXTEEN_ROUNDS; + if (i == 0) + xv ^= SPH_C32(1); + } + WRITE_STATE(sc); + out = dst; + for (z = 0; z < out_size_w32; z ++) + sph_enc32le(out + (z << 2), sc->state[z]); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_init(void *cc) +{ + cubehash_init(cc, IV224); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_close(void *cc, void *dst) +{ + sph_cubehash224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 7); + sph_cubehash224_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_init(void *cc) +{ + cubehash_init(cc, IV256); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_close(void *cc, void *dst) +{ + sph_cubehash256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 8); + sph_cubehash256_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384_init(void *cc) +{ + cubehash_init(cc, IV384); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384(void *cc, const void 
*data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash384_close(void *cc, void *dst)
+{
+	sph_cubehash384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 12);
+	sph_cubehash384_init(cc);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash512_init(void *cc)
+{
+	cubehash_init(cc, IV512);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash512(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash512_close(void *cc, void *dst)
+{
+	sph_cubehash512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h */
+void
+sph_cubehash512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 16);
+	sph_cubehash512_init(cc);
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/sph/echo.c b/sph/echo.c
new file mode 100644
index 0000000..667e3f3
--- /dev/null
+++ b/sph/echo.c
@@ -0,0 +1,1031 @@
+/* $Id: echo.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * ECHO implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_echo.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_ECHO
+#define SPH_SMALL_FOOTPRINT_ECHO 1
+#endif
+
+/*
+ * Some measures tend to show that the 64-bit implementation offers
+ * better performance only on "true" 64-bit architectures, those which
+ * have actual 64-bit registers.
+ */
+#if !defined SPH_ECHO_64 && SPH_64_TRUE
+#define SPH_ECHO_64 1
+#endif
+
+/*
+ * We can use a 64-bit implementation only if a 64-bit type is available.
+ */ +#if !SPH_64 +#undef SPH_ECHO_64 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define T32 SPH_T32 +#define C32 SPH_C32 +#if SPH_64 +#define C64 SPH_C64 +#endif + +#define AES_BIG_ENDIAN 0 +#include "aes_helper.c" + +#if SPH_ECHO_64 + +#define DECL_STATE_SMALL \ + sph_u64 W[16][2]; + +#define DECL_STATE_BIG \ + sph_u64 W[16][2]; + +#define INPUT_BLOCK_SMALL(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vb, 8 * sizeof(sph_u64)); \ + for (u = 0; u < 12; u ++) { \ + W[u + 4][0] = sph_dec64le_aligned( \ + sc->buf + 16 * u); \ + W[u + 4][1] = sph_dec64le_aligned( \ + sc->buf + 16 * u + 8); \ + } \ + } while (0) + +#define INPUT_BLOCK_BIG(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vb, 16 * sizeof(sph_u64)); \ + for (u = 0; u < 8; u ++) { \ + W[u + 8][0] = sph_dec64le_aligned( \ + sc->buf + 16 * u); \ + W[u + 8][1] = sph_dec64le_aligned( \ + sc->buf + 16 * u + 8); \ + } \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +aes_2rounds_all(sph_u64 W[16][2], + sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3) +{ + int n; + sph_u32 K0 = *pK0; + sph_u32 K1 = *pK1; + sph_u32 K2 = *pK2; + sph_u32 K3 = *pK3; + + for (n = 0; n < 16; n ++) { + sph_u64 Wl = W[n][0]; + sph_u64 Wh = W[n][1]; + sph_u32 X0 = (sph_u32)Wl; + sph_u32 X1 = (sph_u32)(Wl >> 32); + sph_u32 X2 = (sph_u32)Wh; + sph_u32 X3 = (sph_u32)(Wh >> 32); + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + W[n][0] = (sph_u64)X0 | ((sph_u64)X1 << 32); + W[n][1] = (sph_u64)X2 | ((sph_u64)X3 << 32); + if ((K0 = T32(K0 + 1)) == 0) { + if ((K1 = T32(K1 + 1)) == 0) + if ((K2 = T32(K2 + 1)) == 0) + K3 = T32(K3 + 1); + } + } + *pK0 = K0; + *pK1 = K1; + *pK2 = K2; + *pK3 = K3; +} + +#define BIG_SUB_WORDS do { \ + aes_2rounds_all(W, &K0, &K1, &K2, &K3); \ + } while (0) + +#else + +#define AES_2ROUNDS(X) do { \ + sph_u32 X0 = (sph_u32)(X[0]); \ + sph_u32 X1 = (sph_u32)(X[0] >> 32); \ + sph_u32 X2 = (sph_u32)(X[1]); \ + sph_u32 X3 = (sph_u32)(X[1] >> 32); \ + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); \ + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); \ + X[0] = (sph_u64)X0 | ((sph_u64)X1 << 32); \ + X[1] = (sph_u64)X2 | ((sph_u64)X3 << 32); \ + if ((K0 = T32(K0 + 1)) == 0) { \ + if ((K1 = T32(K1 + 1)) == 0) \ + if ((K2 = T32(K2 + 1)) == 0) \ + K3 = T32(K3 + 1); \ + } \ + } while (0) + +#define BIG_SUB_WORDS do { \ + AES_2ROUNDS(W[ 0]); \ + AES_2ROUNDS(W[ 1]); \ + AES_2ROUNDS(W[ 2]); \ + AES_2ROUNDS(W[ 3]); \ + AES_2ROUNDS(W[ 4]); \ + AES_2ROUNDS(W[ 5]); \ + AES_2ROUNDS(W[ 6]); \ + AES_2ROUNDS(W[ 7]); \ + AES_2ROUNDS(W[ 8]); \ + AES_2ROUNDS(W[ 9]); \ + AES_2ROUNDS(W[10]); \ + AES_2ROUNDS(W[11]); \ + AES_2ROUNDS(W[12]); \ + AES_2ROUNDS(W[13]); \ + AES_2ROUNDS(W[14]); \ + AES_2ROUNDS(W[15]); \ + } while (0) + +#endif + +#define SHIFT_ROW1(a, b, c, d) do { \ + sph_u64 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[b][0]; \ + W[b][0] = W[c][0]; \ + W[c][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[b][1]; \ + W[b][1] = W[c][1]; \ + W[c][1] = W[d][1]; \ + W[d][1] = tmp; \ + } while (0) + +#define SHIFT_ROW2(a, b, c, d) do { \ + sph_u64 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[c][0]; \ + W[c][0] = tmp; \ + tmp = W[b][0]; \ + W[b][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[c][1]; \ + W[c][1] = tmp; \ + tmp = W[b][1]; \ + W[b][1] = W[d][1]; \ + W[d][1] = tmp; \ + } while (0) + +#define SHIFT_ROW3(a, b, c, d) SHIFT_ROW1(d, c, b, a) + 
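For reference: the nested-if increment inside AES_2ROUNDS above steps the AES round key, which in this implementation is simply a 128-bit counter split across the four 32-bit words K0..K3 (K0 least significant); a carry propagates into a higher word only when the word below it wraps to zero. The same cascade reappears in the INCR_COUNTER macro further down, where C0..C3 count message bits. A minimal standalone sketch of that carry chain, assuming only that T32 truncates to 32 bits as SPH_T32 does (illustrative, not part of the patch itself):

#include <stdint.h>
#include <stdio.h>

#define T32(x) ((uint32_t)((x) & 0xFFFFFFFFu))

/* 128-bit counter held as four 32-bit words, K[0] least significant.
 * Mirrors the carry chain in AES_2ROUNDS: a higher word is touched
 * only when the word below it has just wrapped to zero. */
static void incr128(uint32_t K[4])
{
	if ((K[0] = T32(K[0] + 1)) == 0) {
		if ((K[1] = T32(K[1] + 1)) == 0)
			if ((K[2] = T32(K[2] + 1)) == 0)
				K[3] = T32(K[3] + 1);
	}
}

int main(void)
{
	uint32_t K[4] = { 0xFFFFFFFFu, 0xFFFFFFFFu, 0, 0 };

	incr128(K); /* carry ripples through K[0] and K[1] into K[2] */
	printf("%08x %08x %08x %08x\n", K[3], K[2], K[1], K[0]);
	/* prints: 00000000 00000001 00000000 00000000 */
	return 0;
}
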
+#define BIG_SHIFT_ROWS do { \ + SHIFT_ROW1(1, 5, 9, 13); \ + SHIFT_ROW2(2, 6, 10, 14); \ + SHIFT_ROW3(3, 7, 11, 15); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +mix_column(sph_u64 W[16][2], int ia, int ib, int ic, int id) +{ + int n; + + for (n = 0; n < 2; n ++) { + sph_u64 a = W[ia][n]; + sph_u64 b = W[ib][n]; + sph_u64 c = W[ic][n]; + sph_u64 d = W[id][n]; + sph_u64 ab = a ^ b; + sph_u64 bc = b ^ c; + sph_u64 cd = c ^ d; + sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U + ^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); + sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U + ^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); + sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U + ^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); + W[ia][n] = abx ^ bc ^ d; + W[ib][n] = bcx ^ a ^ cd; + W[ic][n] = cdx ^ ab ^ d; + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; + } +} + +#define MIX_COLUMN(a, b, c, d) mix_column(W, a, b, c, d) + +#else + +#define MIX_COLUMN1(ia, ib, ic, id, n) do { \ + sph_u64 a = W[ia][n]; \ + sph_u64 b = W[ib][n]; \ + sph_u64 c = W[ic][n]; \ + sph_u64 d = W[id][n]; \ + sph_u64 ab = a ^ b; \ + sph_u64 bc = b ^ c; \ + sph_u64 cd = c ^ d; \ + sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + W[ia][n] = abx ^ bc ^ d; \ + W[ib][n] = bcx ^ a ^ cd; \ + W[ic][n] = cdx ^ ab ^ d; \ + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \ + } while (0) + +#define MIX_COLUMN(a, b, c, d) do { \ + MIX_COLUMN1(a, b, c, d, 0); \ + MIX_COLUMN1(a, b, c, d, 1); \ + } while (0) + +#endif + +#define BIG_MIX_COLUMNS do { \ + MIX_COLUMN(0, 1, 2, 3); \ + MIX_COLUMN(4, 5, 6, 7); \ + MIX_COLUMN(8, 9, 10, 11); \ + MIX_COLUMN(12, 13, 14, 15); \ + } while (0) + +#define BIG_ROUND do { \ + BIG_SUB_WORDS; \ + BIG_SHIFT_ROWS; \ + BIG_MIX_COLUMNS; \ + } while (0) + +#define FINAL_SMALL do { \ + unsigned u; \ + sph_u64 *VV = &sc->u.Vb[0][0]; \ + sph_u64 *WW = &W[0][0]; \ + for (u = 0; u < 8; u ++) { \ + VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \ + ^ sph_dec64le_aligned(sc->buf + (u * 8) + 64) \ + ^ sph_dec64le_aligned(sc->buf + (u * 8) + 128) \ + ^ WW[u] ^ WW[u + 8] \ + ^ WW[u + 16] ^ WW[u + 24]; \ + } \ + } while (0) + +#define FINAL_BIG do { \ + unsigned u; \ + sph_u64 *VV = &sc->u.Vb[0][0]; \ + sph_u64 *WW = &W[0][0]; \ + for (u = 0; u < 16; u ++) { \ + VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \ + ^ WW[u] ^ WW[u + 16]; \ + } \ + } while (0) + +#define COMPRESS_SMALL(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_SMALL(sc); \ + for (u = 0; u < 8; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_SMALL; \ + } while (0) + +#define COMPRESS_BIG(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_BIG(sc); \ + for (u = 0; u < 10; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_BIG; \ + } while (0) + +#else + +#define DECL_STATE_SMALL \ + sph_u32 W[16][4]; + +#define DECL_STATE_BIG \ + sph_u32 W[16][4]; + +#define INPUT_BLOCK_SMALL(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vs, 16 * sizeof(sph_u32)); \ + for (u = 0; u < 12; u ++) { \ + W[u + 4][0] = sph_dec32le_aligned( \ + sc->buf + 16 * u); \ + W[u + 4][1] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 4); \ + W[u + 4][2] = 
sph_dec32le_aligned( \ + sc->buf + 16 * u + 8); \ + W[u + 4][3] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 12); \ + } \ + } while (0) + +#define INPUT_BLOCK_BIG(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vs, 32 * sizeof(sph_u32)); \ + for (u = 0; u < 8; u ++) { \ + W[u + 8][0] = sph_dec32le_aligned( \ + sc->buf + 16 * u); \ + W[u + 8][1] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 4); \ + W[u + 8][2] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 8); \ + W[u + 8][3] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 12); \ + } \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +aes_2rounds_all(sph_u32 W[16][4], + sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3) +{ + int n; + sph_u32 K0 = *pK0; + sph_u32 K1 = *pK1; + sph_u32 K2 = *pK2; + sph_u32 K3 = *pK3; + + for (n = 0; n < 16; n ++) { + sph_u32 *X = W[n]; + sph_u32 Y0, Y1, Y2, Y3; + AES_ROUND_LE(X[0], X[1], X[2], X[3], + K0, K1, K2, K3, Y0, Y1, Y2, Y3); + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); + if ((K0 = T32(K0 + 1)) == 0) { + if ((K1 = T32(K1 + 1)) == 0) + if ((K2 = T32(K2 + 1)) == 0) + K3 = T32(K3 + 1); + } + } + *pK0 = K0; + *pK1 = K1; + *pK2 = K2; + *pK3 = K3; +} + +#define BIG_SUB_WORDS do { \ + aes_2rounds_all(W, &K0, &K1, &K2, &K3); \ + } while (0) + +#else + +#define AES_2ROUNDS(X) do { \ + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X[0], X[1], X[2], X[3], \ + K0, K1, K2, K3, Y0, Y1, Y2, Y3); \ + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); \ + if ((K0 = T32(K0 + 1)) == 0) { \ + if ((K1 = T32(K1 + 1)) == 0) \ + if ((K2 = T32(K2 + 1)) == 0) \ + K3 = T32(K3 + 1); \ + } \ + } while (0) + +#define BIG_SUB_WORDS do { \ + AES_2ROUNDS(W[ 0]); \ + AES_2ROUNDS(W[ 1]); \ + AES_2ROUNDS(W[ 2]); \ + AES_2ROUNDS(W[ 3]); \ + AES_2ROUNDS(W[ 4]); \ + AES_2ROUNDS(W[ 5]); \ + AES_2ROUNDS(W[ 6]); \ + AES_2ROUNDS(W[ 7]); \ + AES_2ROUNDS(W[ 8]); \ + AES_2ROUNDS(W[ 9]); \ + AES_2ROUNDS(W[10]); \ + AES_2ROUNDS(W[11]); \ + AES_2ROUNDS(W[12]); \ + AES_2ROUNDS(W[13]); \ + AES_2ROUNDS(W[14]); \ + AES_2ROUNDS(W[15]); \ + } while (0) + +#endif + +#define SHIFT_ROW1(a, b, c, d) do { \ + sph_u32 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[b][0]; \ + W[b][0] = W[c][0]; \ + W[c][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[b][1]; \ + W[b][1] = W[c][1]; \ + W[c][1] = W[d][1]; \ + W[d][1] = tmp; \ + tmp = W[a][2]; \ + W[a][2] = W[b][2]; \ + W[b][2] = W[c][2]; \ + W[c][2] = W[d][2]; \ + W[d][2] = tmp; \ + tmp = W[a][3]; \ + W[a][3] = W[b][3]; \ + W[b][3] = W[c][3]; \ + W[c][3] = W[d][3]; \ + W[d][3] = tmp; \ + } while (0) + +#define SHIFT_ROW2(a, b, c, d) do { \ + sph_u32 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[c][0]; \ + W[c][0] = tmp; \ + tmp = W[b][0]; \ + W[b][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[c][1]; \ + W[c][1] = tmp; \ + tmp = W[b][1]; \ + W[b][1] = W[d][1]; \ + W[d][1] = tmp; \ + tmp = W[a][2]; \ + W[a][2] = W[c][2]; \ + W[c][2] = tmp; \ + tmp = W[b][2]; \ + W[b][2] = W[d][2]; \ + W[d][2] = tmp; \ + tmp = W[a][3]; \ + W[a][3] = W[c][3]; \ + W[c][3] = tmp; \ + tmp = W[b][3]; \ + W[b][3] = W[d][3]; \ + W[d][3] = tmp; \ + } while (0) + +#define SHIFT_ROW3(a, b, c, d) SHIFT_ROW1(d, c, b, a) + +#define BIG_SHIFT_ROWS do { \ + SHIFT_ROW1(1, 5, 9, 13); \ + SHIFT_ROW2(2, 6, 10, 14); \ + SHIFT_ROW3(3, 7, 11, 15); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +mix_column(sph_u32 W[16][4], int ia, int ib, int ic, int id) +{ + int n; + + for (n = 0; n < 4; n ++) { + sph_u32 a = W[ia][n]; + sph_u32 b = W[ib][n]; + sph_u32 c = W[ic][n]; + sph_u32 d = 
W[id][n]; + sph_u32 ab = a ^ b; + sph_u32 bc = b ^ c; + sph_u32 cd = c ^ d; + sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U + ^ ((ab & C32(0x7F7F7F7F)) << 1); + sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U + ^ ((bc & C32(0x7F7F7F7F)) << 1); + sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U + ^ ((cd & C32(0x7F7F7F7F)) << 1); + W[ia][n] = abx ^ bc ^ d; + W[ib][n] = bcx ^ a ^ cd; + W[ic][n] = cdx ^ ab ^ d; + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; + } +} + +#define MIX_COLUMN(a, b, c, d) mix_column(W, a, b, c, d) + +#else + +#define MIX_COLUMN1(ia, ib, ic, id, n) do { \ + sph_u32 a = W[ia][n]; \ + sph_u32 b = W[ib][n]; \ + sph_u32 c = W[ic][n]; \ + sph_u32 d = W[id][n]; \ + sph_u32 ab = a ^ b; \ + sph_u32 bc = b ^ c; \ + sph_u32 cd = c ^ d; \ + sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U \ + ^ ((ab & C32(0x7F7F7F7F)) << 1); \ + sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U \ + ^ ((bc & C32(0x7F7F7F7F)) << 1); \ + sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U \ + ^ ((cd & C32(0x7F7F7F7F)) << 1); \ + W[ia][n] = abx ^ bc ^ d; \ + W[ib][n] = bcx ^ a ^ cd; \ + W[ic][n] = cdx ^ ab ^ d; \ + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \ + } while (0) + +#define MIX_COLUMN(a, b, c, d) do { \ + MIX_COLUMN1(a, b, c, d, 0); \ + MIX_COLUMN1(a, b, c, d, 1); \ + MIX_COLUMN1(a, b, c, d, 2); \ + MIX_COLUMN1(a, b, c, d, 3); \ + } while (0) + +#endif + +#define BIG_MIX_COLUMNS do { \ + MIX_COLUMN(0, 1, 2, 3); \ + MIX_COLUMN(4, 5, 6, 7); \ + MIX_COLUMN(8, 9, 10, 11); \ + MIX_COLUMN(12, 13, 14, 15); \ + } while (0) + +#define BIG_ROUND do { \ + BIG_SUB_WORDS; \ + BIG_SHIFT_ROWS; \ + BIG_MIX_COLUMNS; \ + } while (0) + +#define FINAL_SMALL do { \ + unsigned u; \ + sph_u32 *VV = &sc->u.Vs[0][0]; \ + sph_u32 *WW = &W[0][0]; \ + for (u = 0; u < 16; u ++) { \ + VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \ + ^ sph_dec32le_aligned(sc->buf + (u * 4) + 64) \ + ^ sph_dec32le_aligned(sc->buf + (u * 4) + 128) \ + ^ WW[u] ^ WW[u + 16] \ + ^ WW[u + 32] ^ WW[u + 48]; \ + } \ + } while (0) + +#define FINAL_BIG do { \ + unsigned u; \ + sph_u32 *VV = &sc->u.Vs[0][0]; \ + sph_u32 *WW = &W[0][0]; \ + for (u = 0; u < 32; u ++) { \ + VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \ + ^ WW[u] ^ WW[u + 32]; \ + } \ + } while (0) + +#define COMPRESS_SMALL(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_SMALL(sc); \ + for (u = 0; u < 8; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_SMALL; \ + } while (0) + +#define COMPRESS_BIG(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_BIG(sc); \ + for (u = 0; u < 10; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_BIG; \ + } while (0) + +#endif + +#define INCR_COUNTER(sc, val) do { \ + sc->C0 = T32(sc->C0 + (sph_u32)(val)); \ + if (sc->C0 < (sph_u32)(val)) { \ + if ((sc->C1 = T32(sc->C1 + 1)) == 0) \ + if ((sc->C2 = T32(sc->C2 + 1)) == 0) \ + sc->C3 = T32(sc->C3 + 1); \ + } \ + } while (0) + +static void +echo_small_init(sph_echo_small_context *sc, unsigned out_len) +{ +#if SPH_ECHO_64 + sc->u.Vb[0][0] = (sph_u64)out_len; + sc->u.Vb[0][1] = 0; + sc->u.Vb[1][0] = (sph_u64)out_len; + sc->u.Vb[1][1] = 0; + sc->u.Vb[2][0] = (sph_u64)out_len; + sc->u.Vb[2][1] = 0; + sc->u.Vb[3][0] = (sph_u64)out_len; + sc->u.Vb[3][1] = 0; +#else + sc->u.Vs[0][0] = (sph_u32)out_len; + sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0; + sc->u.Vs[1][0] = (sph_u32)out_len; + sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0; + 
sc->u.Vs[2][0] = (sph_u32)out_len; + sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0; + sc->u.Vs[3][0] = (sph_u32)out_len; + sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0; +#endif + sc->ptr = 0; + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; +} + +static void +echo_big_init(sph_echo_big_context *sc, unsigned out_len) +{ +#if SPH_ECHO_64 + sc->u.Vb[0][0] = (sph_u64)out_len; + sc->u.Vb[0][1] = 0; + sc->u.Vb[1][0] = (sph_u64)out_len; + sc->u.Vb[1][1] = 0; + sc->u.Vb[2][0] = (sph_u64)out_len; + sc->u.Vb[2][1] = 0; + sc->u.Vb[3][0] = (sph_u64)out_len; + sc->u.Vb[3][1] = 0; + sc->u.Vb[4][0] = (sph_u64)out_len; + sc->u.Vb[4][1] = 0; + sc->u.Vb[5][0] = (sph_u64)out_len; + sc->u.Vb[5][1] = 0; + sc->u.Vb[6][0] = (sph_u64)out_len; + sc->u.Vb[6][1] = 0; + sc->u.Vb[7][0] = (sph_u64)out_len; + sc->u.Vb[7][1] = 0; +#else + sc->u.Vs[0][0] = (sph_u32)out_len; + sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0; + sc->u.Vs[1][0] = (sph_u32)out_len; + sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0; + sc->u.Vs[2][0] = (sph_u32)out_len; + sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0; + sc->u.Vs[3][0] = (sph_u32)out_len; + sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0; + sc->u.Vs[4][0] = (sph_u32)out_len; + sc->u.Vs[4][1] = sc->u.Vs[4][2] = sc->u.Vs[4][3] = 0; + sc->u.Vs[5][0] = (sph_u32)out_len; + sc->u.Vs[5][1] = sc->u.Vs[5][2] = sc->u.Vs[5][3] = 0; + sc->u.Vs[6][0] = (sph_u32)out_len; + sc->u.Vs[6][1] = sc->u.Vs[6][2] = sc->u.Vs[6][3] = 0; + sc->u.Vs[7][0] = (sph_u32)out_len; + sc->u.Vs[7][1] = sc->u.Vs[7][2] = sc->u.Vs[7][3] = 0; +#endif + sc->ptr = 0; + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; +} + +static void +echo_small_compress(sph_echo_small_context *sc) +{ + DECL_STATE_SMALL + + COMPRESS_SMALL(sc); +} + +static void +echo_big_compress(sph_echo_big_context *sc) +{ + DECL_STATE_BIG + + COMPRESS_BIG(sc); +} + +static void +echo_small_core(sph_echo_small_context *sc, + const unsigned char *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INCR_COUNTER(sc, 1536); + echo_small_compress(sc); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +echo_big_core(sph_echo_big_context *sc, + const unsigned char *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INCR_COUNTER(sc, 1024); + echo_big_compress(sc); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +echo_small_close(sph_echo_small_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf; + size_t ptr; + unsigned z; + unsigned elen; + union { + unsigned char tmp[32]; + sph_u32 dummy; +#if SPH_ECHO_64 + sph_u64 dummy2; +#endif + } u; +#if SPH_ECHO_64 + sph_u64 *VV; +#else + sph_u32 *VV; +#endif + unsigned k; + + buf = sc->buf; + ptr = sc->ptr; + elen = ((unsigned)ptr << 3) + n; + INCR_COUNTER(sc, elen); + sph_enc32le_aligned(u.tmp, sc->C0); + 
sph_enc32le_aligned(u.tmp + 4, sc->C1); + sph_enc32le_aligned(u.tmp + 8, sc->C2); + sph_enc32le_aligned(u.tmp + 12, sc->C3); + /* + * If elen is zero, then this block actually contains no message + * bit, only the first padding bit. + */ + if (elen == 0) { + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + } + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + if (ptr > ((sizeof sc->buf) - 18)) { + echo_small_compress(sc); + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + memset(buf, 0, sizeof sc->buf); + } + sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); + memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); + echo_small_compress(sc); +#if SPH_ECHO_64 + for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) + sph_enc64le_aligned(u.tmp + (k << 3), VV[k]); +#else + for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++) + sph_enc32le_aligned(u.tmp + (k << 2), VV[k]); +#endif + memcpy(dst, u.tmp, out_size_w32 << 2); + echo_small_init(sc, out_size_w32 << 5); +} + +static void +echo_big_close(sph_echo_big_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf; + size_t ptr; + unsigned z; + unsigned elen; + union { + unsigned char tmp[64]; + sph_u32 dummy; +#if SPH_ECHO_64 + sph_u64 dummy2; +#endif + } u; +#if SPH_ECHO_64 + sph_u64 *VV; +#else + sph_u32 *VV; +#endif + unsigned k; + + buf = sc->buf; + ptr = sc->ptr; + elen = ((unsigned)ptr << 3) + n; + INCR_COUNTER(sc, elen); + sph_enc32le_aligned(u.tmp, sc->C0); + sph_enc32le_aligned(u.tmp + 4, sc->C1); + sph_enc32le_aligned(u.tmp + 8, sc->C2); + sph_enc32le_aligned(u.tmp + 12, sc->C3); + /* + * If elen is zero, then this block actually contains no message + * bit, only the first padding bit. + */ + if (elen == 0) { + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + } + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + if (ptr > ((sizeof sc->buf) - 18)) { + echo_big_compress(sc); + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + memset(buf, 0, sizeof sc->buf); + } + sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); + memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); + echo_big_compress(sc); +#if SPH_ECHO_64 + for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) + sph_enc64le_aligned(u.tmp + (k << 3), VV[k]); +#else + for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++) + sph_enc32le_aligned(u.tmp + (k << 2), VV[k]); +#endif + memcpy(dst, u.tmp, out_size_w32 << 2); + echo_big_init(sc, out_size_w32 << 5); +} + +/* see sph_echo.h */ +void +sph_echo224_init(void *cc) +{ + echo_small_init(cc, 224); +} + +/* see sph_echo.h */ +void +sph_echo224(void *cc, const void *data, size_t len) +{ + echo_small_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo224_close(void *cc, void *dst) +{ + echo_small_close(cc, 0, 0, dst, 7); +} + +/* see sph_echo.h */ +void +sph_echo224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_small_close(cc, ub, n, dst, 7); +} + +/* see sph_echo.h */ +void +sph_echo256_init(void *cc) +{ + echo_small_init(cc, 256); +} + +/* see sph_echo.h */ +void +sph_echo256(void *cc, const void *data, size_t len) +{ + echo_small_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo256_close(void *cc, void *dst) +{ + echo_small_close(cc, 0, 0, dst, 8); +} + +/* see sph_echo.h */ +void +sph_echo256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_small_close(cc, ub, n, dst, 8); +} + +/* see 
sph_echo.h */
+void
+sph_echo384_init(void *cc)
+{
+	echo_big_init(cc, 384);
+}
+
+/* see sph_echo.h */
+void
+sph_echo384(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h */
+void
+sph_echo384_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 12);
+}
+
+/* see sph_echo.h */
+void
+sph_echo384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 12);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512_init(void *cc)
+{
+	echo_big_init(cc, 512);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 16);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 16);
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/fugue.c b/sph/fugue.c
similarity index 100%
rename from fugue.c
rename to sph/fugue.c
diff --git a/groestl.c b/sph/groestl.c
similarity index 100%
rename from groestl.c
rename to sph/groestl.c
diff --git a/sph/jh.c b/sph/jh.c
new file mode 100644
index 0000000..4e26617
--- /dev/null
+++ b/sph/jh.c
@@ -0,0 +1,1107 @@
+/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
+/*
+ * JH implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_jh.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
+#define SPH_SMALL_FOOTPRINT_JH 1
+#endif
+
+#if !defined SPH_JH_64 && SPH_64_TRUE
+#define SPH_JH_64 1
+#endif
+
+#if !SPH_64
+#undef SPH_JH_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal bitslice representation may use either big-endian or
+ * little-endian (true bitslice operations do not care about the bit
+ * ordering, and the bit-swapping linear operations in JH happen to
+ * be invariant through endianness-swapping). The constants must be
+ * defined according to the chosen endianness; we use some
+ * byte-swapping macros for that.
+ */ + +#if SPH_LITTLE_ENDIAN + +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le + +#if SPH_64 +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le +#endif + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#if SPH_64 +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be +#endif + +#endif + +#define Sb(x0, x1, x2, x3, c) do { \ + x3 = ~x3; \ + x0 ^= (c) & ~x2; \ + tmp = (c) ^ (x0 & x1); \ + x0 ^= x2 & x3; \ + x3 ^= ~x1 & x2; \ + x1 ^= x0 & x2; \ + x2 ^= x0 & ~x3; \ + x0 ^= x1 | x3; \ + x3 ^= x1 & x2; \ + x1 ^= tmp & x0; \ + x2 ^= tmp; \ + } while (0) + +#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + x4 ^= x1; \ + x5 ^= x2; \ + x6 ^= x3 ^ x0; \ + x7 ^= x0; \ + x0 ^= x5; \ + x1 ^= x6; \ + x2 ^= x7 ^ x4; \ + x3 ^= x4; \ + } while (0) + +#if SPH_JH_64 + +static const sph_u64 C[] = { + C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557), + C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40), + C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a), + C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231), + C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410), + C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc), + C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0), + C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3), + C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce), + C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23), + C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8), + C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197), + C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95), + C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214), + C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80), + C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4), + C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989), + C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36), + C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7), + C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f), + C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727), + C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b), + C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e), + C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062), + C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984), + C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5), + C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2), + C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f), + C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465), + C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a), + C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1), + C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf), + C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48), + C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0), + C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134), + C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a), + C64e(0x88401d63a06cf615), 
C64e(0x47c1444b8752afff), + C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6), + C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae), + C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567), + C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a), + C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518), + C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446), + C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e), + C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee), + C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001), + C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779), + C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83), + C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a), + C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef), + C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d), + C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65), + C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a), + C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c), + C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d), + C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71), + C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc), + C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0), + C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c), + C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f), + C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751), + C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad), + C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56), + C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6), + C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a), + C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163), + C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826), + C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f), + C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30), + C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a), + C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3), + C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505), + C64e(0xb17681d913326cce), C64e(0x3c175284f805a262), + C64e(0xf42bcbb378471547), C64e(0xff46548223936a48), + C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e), + C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e), + C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd), + C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7), + C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be), + C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de), + C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9), + C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a), + C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b), + C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2) +}; + +#define Ceven_hi(r) (C[((r) << 2) + 0]) +#define Ceven_lo(r) (C[((r) << 2) + 1]) +#define Codd_hi(r) (C[((r) << 2) + 2]) +#define Codd_lo(r) (C[((r) << 2) + 3]) + +#define S(x0, x1, x2, x3, cb, r) do { \ + Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \ + Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \ + } while (0) + +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \ + x4 ## h, x5 ## h, x6 ## h, x7 ## h); \ + Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \ + x4 ## l, x5 ## l, x6 ## l, x7 ## l); \ + } while (0) + +#define Wz(x, c, n) do { \ + sph_u64 t = (x ## h & (c)) << (n); \ + x ## h = ((x ## h >> (n)) & (c)) | t; \ + t = (x ## l & (c)) << (n); \ + x ## l = ((x ## l >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1) +#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2) +#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8) 
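The Wz macros here (continued by W4..W6 just below) implement JH's bit-permutation layer: Wz(x, c, n) swaps every n-bit group selected by the mask c with the group next to it, so W0 exchanges adjacent bits, W1 adjacent bit pairs, W3 adjacent bytes, and so on. Each Wz is an involution. A small self-contained check of one 64-bit lane (an illustrative sketch, not part of the patch itself):

#include <stdint.h>
#include <stdio.h>

/* One 64-bit lane of the Wz macro: swap every n-bit group selected
 * by mask c with the group immediately above it. */
static uint64_t wz(uint64_t x, uint64_t c, int n)
{
	return ((x >> n) & c) | ((x & c) << n);
}

int main(void)
{
	const uint64_t M0 = 0x5555555555555555ULL; /* W0: adjacent bits  */
	const uint64_t M3 = 0x00FF00FF00FF00FFULL; /* W3: adjacent bytes */
	uint64_t x = 0x0123456789abcdefULL;
	uint64_t y = wz(x, M0, 1);

	printf("W0(x)     = %016llx\n", (unsigned long long)y);
	/* Involution: applying the same swap twice restores x. */
	printf("W0(W0(x)) = %016llx\n", (unsigned long long)wz(y, M0, 1));
	/* Byte swap: 0x0123456789abcdef becomes 0x23016745ab89efcd. */
	printf("W3(x)     = %016llx\n", (unsigned long long)wz(x, M3, 8));
	return 0;
}
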
+#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16) +#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32) +#define W6(x) do { \ + sph_u64 t = x ## h; \ + x ## h = x ## l; \ + x ## l = t; \ + } while (0) + +#define DECL_STATE \ + sph_u64 h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \ + sph_u64 h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \ + sph_u64 tmp; + +#define READ_STATE(state) do { \ + h0h = (state)->H.wide[ 0]; \ + h0l = (state)->H.wide[ 1]; \ + h1h = (state)->H.wide[ 2]; \ + h1l = (state)->H.wide[ 3]; \ + h2h = (state)->H.wide[ 4]; \ + h2l = (state)->H.wide[ 5]; \ + h3h = (state)->H.wide[ 6]; \ + h3l = (state)->H.wide[ 7]; \ + h4h = (state)->H.wide[ 8]; \ + h4l = (state)->H.wide[ 9]; \ + h5h = (state)->H.wide[10]; \ + h5l = (state)->H.wide[11]; \ + h6h = (state)->H.wide[12]; \ + h6l = (state)->H.wide[13]; \ + h7h = (state)->H.wide[14]; \ + h7l = (state)->H.wide[15]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->H.wide[ 0] = h0h; \ + (state)->H.wide[ 1] = h0l; \ + (state)->H.wide[ 2] = h1h; \ + (state)->H.wide[ 3] = h1l; \ + (state)->H.wide[ 4] = h2h; \ + (state)->H.wide[ 5] = h2l; \ + (state)->H.wide[ 6] = h3h; \ + (state)->H.wide[ 7] = h3l; \ + (state)->H.wide[ 8] = h4h; \ + (state)->H.wide[ 9] = h4l; \ + (state)->H.wide[10] = h5h; \ + (state)->H.wide[11] = h5l; \ + (state)->H.wide[12] = h6h; \ + (state)->H.wide[13] = h6l; \ + (state)->H.wide[14] = h7h; \ + (state)->H.wide[15] = h7l; \ + } while (0) + +#define INPUT_BUF1 \ + sph_u64 m0h = dec64e_aligned(buf + 0); \ + sph_u64 m0l = dec64e_aligned(buf + 8); \ + sph_u64 m1h = dec64e_aligned(buf + 16); \ + sph_u64 m1l = dec64e_aligned(buf + 24); \ + sph_u64 m2h = dec64e_aligned(buf + 32); \ + sph_u64 m2l = dec64e_aligned(buf + 40); \ + sph_u64 m3h = dec64e_aligned(buf + 48); \ + sph_u64 m3l = dec64e_aligned(buf + 56); \ + h0h ^= m0h; \ + h0l ^= m0l; \ + h1h ^= m1h; \ + h1l ^= m1l; \ + h2h ^= m2h; \ + h2l ^= m2l; \ + h3h ^= m3h; \ + h3l ^= m3l; + +#define INPUT_BUF2 \ + h4h ^= m0h; \ + h4l ^= m0l; \ + h5h ^= m1h; \ + h5l ^= m1l; \ + h6h ^= m2h; \ + h6l ^= m2l; \ + h7h ^= m3h; \ + h7l ^= m3l; + +static const sph_u64 IV224[] = { + C64e(0x2dfedd62f99a98ac), C64e(0xae7cacd619d634e7), + C64e(0xa4831005bc301216), C64e(0xb86038c6c9661494), + C64e(0x66d9899f2580706f), C64e(0xce9ea31b1d9b1adc), + C64e(0x11e8325f7b366e10), C64e(0xf994857f02fa06c1), + C64e(0x1b4f1b5cd8c840b3), C64e(0x97f6a17f6e738099), + C64e(0xdcdf93a5adeaa3d3), C64e(0xa431e8dec9539a68), + C64e(0x22b4a98aec86a1e4), C64e(0xd574ac959ce56cf0), + C64e(0x15960deab5ab2bbf), C64e(0x9611dcf0dd64ea6e) +}; + +static const sph_u64 IV256[] = { + C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1), + C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03), + C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477), + C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8), + C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262), + C64e(0x277695f776248f94), C64e(0x87d5b6574780296c), + C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f), + C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769) +}; + +static const sph_u64 IV384[] = { + C64e(0x481e3bc6d813398a), C64e(0x6d3b5e894ade879b), + C64e(0x63faea68d480ad2e), C64e(0x332ccb21480f8267), + C64e(0x98aec84d9082b928), C64e(0xd455ea3041114249), + C64e(0x36f555b2924847ec), C64e(0xc7250a93baf43ce1), + C64e(0x569b7f8a27db454c), C64e(0x9efcbd496397af0e), + C64e(0x589fc27d26aa80cd), C64e(0x80c08b8c9deb2eda), + C64e(0x8a7981e8f8d5373a), C64e(0xf43967adddd17a71), + C64e(0xa9b4d3bda475d394), C64e(0x976c3fba9842737f) +}; + +static const sph_u64 IV512[] = { + 
C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543), + C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361), + C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80), + C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7), + C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a), + C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199), + C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156), + C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b) +}; + +#else + +static const sph_u32 C[] = { + C32e(0x72d5dea2), C32e(0xdf15f867), C32e(0x7b84150a), + C32e(0xb7231557), C32e(0x81abd690), C32e(0x4d5a87f6), + C32e(0x4e9f4fc5), C32e(0xc3d12b40), C32e(0xea983ae0), + C32e(0x5c45fa9c), C32e(0x03c5d299), C32e(0x66b2999a), + C32e(0x660296b4), C32e(0xf2bb538a), C32e(0xb556141a), + C32e(0x88dba231), C32e(0x03a35a5c), C32e(0x9a190edb), + C32e(0x403fb20a), C32e(0x87c14410), C32e(0x1c051980), + C32e(0x849e951d), C32e(0x6f33ebad), C32e(0x5ee7cddc), + C32e(0x10ba1392), C32e(0x02bf6b41), C32e(0xdc786515), + C32e(0xf7bb27d0), C32e(0x0a2c8139), C32e(0x37aa7850), + C32e(0x3f1abfd2), C32e(0x410091d3), C32e(0x422d5a0d), + C32e(0xf6cc7e90), C32e(0xdd629f9c), C32e(0x92c097ce), + C32e(0x185ca70b), C32e(0xc72b44ac), C32e(0xd1df65d6), + C32e(0x63c6fc23), C32e(0x976e6c03), C32e(0x9ee0b81a), + C32e(0x2105457e), C32e(0x446ceca8), C32e(0xeef103bb), + C32e(0x5d8e61fa), C32e(0xfd9697b2), C32e(0x94838197), + C32e(0x4a8e8537), C32e(0xdb03302f), C32e(0x2a678d2d), + C32e(0xfb9f6a95), C32e(0x8afe7381), C32e(0xf8b8696c), + C32e(0x8ac77246), C32e(0xc07f4214), C32e(0xc5f4158f), + C32e(0xbdc75ec4), C32e(0x75446fa7), C32e(0x8f11bb80), + C32e(0x52de75b7), C32e(0xaee488bc), C32e(0x82b8001e), + C32e(0x98a6a3f4), C32e(0x8ef48f33), C32e(0xa9a36315), + C32e(0xaa5f5624), C32e(0xd5b7f989), C32e(0xb6f1ed20), + C32e(0x7c5ae0fd), C32e(0x36cae95a), C32e(0x06422c36), + C32e(0xce293543), C32e(0x4efe983d), C32e(0x533af974), + C32e(0x739a4ba7), C32e(0xd0f51f59), C32e(0x6f4e8186), + C32e(0x0e9dad81), C32e(0xafd85a9f), C32e(0xa7050667), + C32e(0xee34626a), C32e(0x8b0b28be), C32e(0x6eb91727), + C32e(0x47740726), C32e(0xc680103f), C32e(0xe0a07e6f), + C32e(0xc67e487b), C32e(0x0d550aa5), C32e(0x4af8a4c0), + C32e(0x91e3e79f), C32e(0x978ef19e), C32e(0x86767281), + C32e(0x50608dd4), C32e(0x7e9e5a41), C32e(0xf3e5b062), + C32e(0xfc9f1fec), C32e(0x4054207a), C32e(0xe3e41a00), + C32e(0xcef4c984), C32e(0x4fd794f5), C32e(0x9dfa95d8), + C32e(0x552e7e11), C32e(0x24c354a5), C32e(0x5bdf7228), + C32e(0xbdfe6e28), C32e(0x78f57fe2), C32e(0x0fa5c4b2), + C32e(0x05897cef), C32e(0xee49d32e), C32e(0x447e9385), + C32e(0xeb28597f), C32e(0x705f6937), C32e(0xb324314a), + C32e(0x5e8628f1), C32e(0x1dd6e465), C32e(0xc71b7704), + C32e(0x51b920e7), C32e(0x74fe43e8), C32e(0x23d4878a), + C32e(0x7d29e8a3), C32e(0x927694f2), C32e(0xddcb7a09), + C32e(0x9b30d9c1), C32e(0x1d1b30fb), C32e(0x5bdc1be0), + C32e(0xda24494f), C32e(0xf29c82bf), C32e(0xa4e7ba31), + C32e(0xb470bfff), C32e(0x0d324405), C32e(0xdef8bc48), + C32e(0x3baefc32), C32e(0x53bbd339), C32e(0x459fc3c1), + C32e(0xe0298ba0), C32e(0xe5c905fd), C32e(0xf7ae090f), + C32e(0x94703412), C32e(0x4290f134), C32e(0xa271b701), + C32e(0xe344ed95), C32e(0xe93b8e36), C32e(0x4f2f984a), + C32e(0x88401d63), C32e(0xa06cf615), C32e(0x47c1444b), + C32e(0x8752afff), C32e(0x7ebb4af1), C32e(0xe20ac630), + C32e(0x4670b6c5), C32e(0xcc6e8ce6), C32e(0xa4d5a456), + C32e(0xbd4fca00), C32e(0xda9d844b), C32e(0xc83e18ae), + C32e(0x7357ce45), C32e(0x3064d1ad), C32e(0xe8a6ce68), + C32e(0x145c2567), C32e(0xa3da8cf2), C32e(0xcb0ee116), + C32e(0x33e90658), C32e(0x9a94999a), C32e(0x1f60b220), 
+ C32e(0xc26f847b), C32e(0xd1ceac7f), C32e(0xa0d18518), + C32e(0x32595ba1), C32e(0x8ddd19d3), C32e(0x509a1cc0), + C32e(0xaaa5b446), C32e(0x9f3d6367), C32e(0xe4046bba), + C32e(0xf6ca19ab), C32e(0x0b56ee7e), C32e(0x1fb179ea), + C32e(0xa9282174), C32e(0xe9bdf735), C32e(0x3b3651ee), + C32e(0x1d57ac5a), C32e(0x7550d376), C32e(0x3a46c2fe), + C32e(0xa37d7001), C32e(0xf735c1af), C32e(0x98a4d842), + C32e(0x78edec20), C32e(0x9e6b6779), C32e(0x41836315), + C32e(0xea3adba8), C32e(0xfac33b4d), C32e(0x32832c83), + C32e(0xa7403b1f), C32e(0x1c2747f3), C32e(0x5940f034), + C32e(0xb72d769a), C32e(0xe73e4e6c), C32e(0xd2214ffd), + C32e(0xb8fd8d39), C32e(0xdc5759ef), C32e(0x8d9b0c49), + C32e(0x2b49ebda), C32e(0x5ba2d749), C32e(0x68f3700d), + C32e(0x7d3baed0), C32e(0x7a8d5584), C32e(0xf5a5e9f0), + C32e(0xe4f88e65), C32e(0xa0b8a2f4), C32e(0x36103b53), + C32e(0x0ca8079e), C32e(0x753eec5a), C32e(0x91689492), + C32e(0x56e8884f), C32e(0x5bb05c55), C32e(0xf8babc4c), + C32e(0xe3bb3b99), C32e(0xf387947b), C32e(0x75daf4d6), + C32e(0x726b1c5d), C32e(0x64aeac28), C32e(0xdc34b36d), + C32e(0x6c34a550), C32e(0xb828db71), C32e(0xf861e2f2), + C32e(0x108d512a), C32e(0xe3db6433), C32e(0x59dd75fc), + C32e(0x1cacbcf1), C32e(0x43ce3fa2), C32e(0x67bbd13c), + C32e(0x02e843b0), C32e(0x330a5bca), C32e(0x8829a175), + C32e(0x7f34194d), C32e(0xb416535c), C32e(0x923b94c3), + C32e(0x0e794d1e), C32e(0x797475d7), C32e(0xb6eeaf3f), + C32e(0xeaa8d4f7), C32e(0xbe1a3921), C32e(0x5cf47e09), + C32e(0x4c232751), C32e(0x26a32453), C32e(0xba323cd2), + C32e(0x44a3174a), C32e(0x6da6d5ad), C32e(0xb51d3ea6), + C32e(0xaff2c908), C32e(0x83593d98), C32e(0x916b3c56), + C32e(0x4cf87ca1), C32e(0x7286604d), C32e(0x46e23ecc), + C32e(0x086ec7f6), C32e(0x2f9833b3), C32e(0xb1bc765e), + C32e(0x2bd666a5), C32e(0xefc4e62a), C32e(0x06f4b6e8), + C32e(0xbec1d436), C32e(0x74ee8215), C32e(0xbcef2163), + C32e(0xfdc14e0d), C32e(0xf453c969), C32e(0xa77d5ac4), + C32e(0x06585826), C32e(0x7ec11416), C32e(0x06e0fa16), + C32e(0x7e90af3d), C32e(0x28639d3f), C32e(0xd2c9f2e3), + C32e(0x009bd20c), C32e(0x5faace30), C32e(0xb7d40c30), + C32e(0x742a5116), C32e(0xf2e03298), C32e(0x0deb30d8), + C32e(0xe3cef89a), C32e(0x4bc59e7b), C32e(0xb5f17992), + C32e(0xff51e66e), C32e(0x048668d3), C32e(0x9b234d57), + C32e(0xe6966731), C32e(0xcce6a6f3), C32e(0x170a7505), + C32e(0xb17681d9), C32e(0x13326cce), C32e(0x3c175284), + C32e(0xf805a262), C32e(0xf42bcbb3), C32e(0x78471547), + C32e(0xff465482), C32e(0x23936a48), C32e(0x38df5807), + C32e(0x4e5e6565), C32e(0xf2fc7c89), C32e(0xfc86508e), + C32e(0x31702e44), C32e(0xd00bca86), C32e(0xf04009a2), + C32e(0x3078474e), C32e(0x65a0ee39), C32e(0xd1f73883), + C32e(0xf75ee937), C32e(0xe42c3abd), C32e(0x2197b226), + C32e(0x0113f86f), C32e(0xa344edd1), C32e(0xef9fdee7), + C32e(0x8ba0df15), C32e(0x762592d9), C32e(0x3c85f7f6), + C32e(0x12dc42be), C32e(0xd8a7ec7c), C32e(0xab27b07e), + C32e(0x538d7dda), C32e(0xaa3ea8de), C32e(0xaa25ce93), + C32e(0xbd0269d8), C32e(0x5af643fd), C32e(0x1a7308f9), + C32e(0xc05fefda), C32e(0x174a19a5), C32e(0x974d6633), + C32e(0x4cfd216a), C32e(0x35b49831), C32e(0xdb411570), + C32e(0xea1e0fbb), C32e(0xedcd549b), C32e(0x9ad063a1), + C32e(0x51974072), C32e(0xf6759dbf), C32e(0x91476fe2) +}; + +#define Ceven_w3(r) (C[((r) << 3) + 0]) +#define Ceven_w2(r) (C[((r) << 3) + 1]) +#define Ceven_w1(r) (C[((r) << 3) + 2]) +#define Ceven_w0(r) (C[((r) << 3) + 3]) +#define Codd_w3(r) (C[((r) << 3) + 4]) +#define Codd_w2(r) (C[((r) << 3) + 5]) +#define Codd_w1(r) (C[((r) << 3) + 6]) +#define Codd_w0(r) (C[((r) << 3) + 7]) + +#define S(x0, x1, x2, x3, cb, 
r) do { \ + Sb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, cb ## w3(r)); \ + Sb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, cb ## w2(r)); \ + Sb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, cb ## w1(r)); \ + Sb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, cb ## w0(r)); \ + } while (0) + +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, \ + x4 ## 3, x5 ## 3, x6 ## 3, x7 ## 3); \ + Lb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, \ + x4 ## 2, x5 ## 2, x6 ## 2, x7 ## 2); \ + Lb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, \ + x4 ## 1, x5 ## 1, x6 ## 1, x7 ## 1); \ + Lb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, \ + x4 ## 0, x5 ## 0, x6 ## 0, x7 ## 0); \ + } while (0) + +#define Wz(x, c, n) do { \ + sph_u32 t = (x ## 3 & (c)) << (n); \ + x ## 3 = ((x ## 3 >> (n)) & (c)) | t; \ + t = (x ## 2 & (c)) << (n); \ + x ## 2 = ((x ## 2 >> (n)) & (c)) | t; \ + t = (x ## 1 & (c)) << (n); \ + x ## 1 = ((x ## 1 >> (n)) & (c)) | t; \ + t = (x ## 0 & (c)) << (n); \ + x ## 0 = ((x ## 0 >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C32(0x55555555), 1) +#define W1(x) Wz(x, SPH_C32(0x33333333), 2) +#define W2(x) Wz(x, SPH_C32(0x0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C32(0x00FF00FF), 8) +#define W4(x) Wz(x, SPH_C32(0x0000FFFF), 16) +#define W5(x) do { \ + sph_u32 t = x ## 3; \ + x ## 3 = x ## 2; \ + x ## 2 = t; \ + t = x ## 1; \ + x ## 1 = x ## 0; \ + x ## 0 = t; \ + } while (0) +#define W6(x) do { \ + sph_u32 t = x ## 3; \ + x ## 3 = x ## 1; \ + x ## 1 = t; \ + t = x ## 2; \ + x ## 2 = x ## 0; \ + x ## 0 = t; \ + } while (0) + +#define DECL_STATE \ + sph_u32 h03, h02, h01, h00, h13, h12, h11, h10; \ + sph_u32 h23, h22, h21, h20, h33, h32, h31, h30; \ + sph_u32 h43, h42, h41, h40, h53, h52, h51, h50; \ + sph_u32 h63, h62, h61, h60, h73, h72, h71, h70; \ + sph_u32 tmp; + +#define READ_STATE(state) do { \ + h03 = (state)->H.narrow[ 0]; \ + h02 = (state)->H.narrow[ 1]; \ + h01 = (state)->H.narrow[ 2]; \ + h00 = (state)->H.narrow[ 3]; \ + h13 = (state)->H.narrow[ 4]; \ + h12 = (state)->H.narrow[ 5]; \ + h11 = (state)->H.narrow[ 6]; \ + h10 = (state)->H.narrow[ 7]; \ + h23 = (state)->H.narrow[ 8]; \ + h22 = (state)->H.narrow[ 9]; \ + h21 = (state)->H.narrow[10]; \ + h20 = (state)->H.narrow[11]; \ + h33 = (state)->H.narrow[12]; \ + h32 = (state)->H.narrow[13]; \ + h31 = (state)->H.narrow[14]; \ + h30 = (state)->H.narrow[15]; \ + h43 = (state)->H.narrow[16]; \ + h42 = (state)->H.narrow[17]; \ + h41 = (state)->H.narrow[18]; \ + h40 = (state)->H.narrow[19]; \ + h53 = (state)->H.narrow[20]; \ + h52 = (state)->H.narrow[21]; \ + h51 = (state)->H.narrow[22]; \ + h50 = (state)->H.narrow[23]; \ + h63 = (state)->H.narrow[24]; \ + h62 = (state)->H.narrow[25]; \ + h61 = (state)->H.narrow[26]; \ + h60 = (state)->H.narrow[27]; \ + h73 = (state)->H.narrow[28]; \ + h72 = (state)->H.narrow[29]; \ + h71 = (state)->H.narrow[30]; \ + h70 = (state)->H.narrow[31]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->H.narrow[ 0] = h03; \ + (state)->H.narrow[ 1] = h02; \ + (state)->H.narrow[ 2] = h01; \ + (state)->H.narrow[ 3] = h00; \ + (state)->H.narrow[ 4] = h13; \ + (state)->H.narrow[ 5] = h12; \ + (state)->H.narrow[ 6] = h11; \ + (state)->H.narrow[ 7] = h10; \ + (state)->H.narrow[ 8] = h23; \ + (state)->H.narrow[ 9] = h22; \ + (state)->H.narrow[10] = h21; \ + (state)->H.narrow[11] = h20; \ + (state)->H.narrow[12] = h33; \ + (state)->H.narrow[13] = h32; \ + (state)->H.narrow[14] = h31; \ + (state)->H.narrow[15] = h30; \ + (state)->H.narrow[16] = h43; \ + (state)->H.narrow[17] = h42; \ + (state)->H.narrow[18] = h41; \ + 
(state)->H.narrow[19] = h40; \ + (state)->H.narrow[20] = h53; \ + (state)->H.narrow[21] = h52; \ + (state)->H.narrow[22] = h51; \ + (state)->H.narrow[23] = h50; \ + (state)->H.narrow[24] = h63; \ + (state)->H.narrow[25] = h62; \ + (state)->H.narrow[26] = h61; \ + (state)->H.narrow[27] = h60; \ + (state)->H.narrow[28] = h73; \ + (state)->H.narrow[29] = h72; \ + (state)->H.narrow[30] = h71; \ + (state)->H.narrow[31] = h70; \ + } while (0) + +#define INPUT_BUF1 \ + sph_u32 m03 = dec32e_aligned(buf + 0); \ + sph_u32 m02 = dec32e_aligned(buf + 4); \ + sph_u32 m01 = dec32e_aligned(buf + 8); \ + sph_u32 m00 = dec32e_aligned(buf + 12); \ + sph_u32 m13 = dec32e_aligned(buf + 16); \ + sph_u32 m12 = dec32e_aligned(buf + 20); \ + sph_u32 m11 = dec32e_aligned(buf + 24); \ + sph_u32 m10 = dec32e_aligned(buf + 28); \ + sph_u32 m23 = dec32e_aligned(buf + 32); \ + sph_u32 m22 = dec32e_aligned(buf + 36); \ + sph_u32 m21 = dec32e_aligned(buf + 40); \ + sph_u32 m20 = dec32e_aligned(buf + 44); \ + sph_u32 m33 = dec32e_aligned(buf + 48); \ + sph_u32 m32 = dec32e_aligned(buf + 52); \ + sph_u32 m31 = dec32e_aligned(buf + 56); \ + sph_u32 m30 = dec32e_aligned(buf + 60); \ + h03 ^= m03; \ + h02 ^= m02; \ + h01 ^= m01; \ + h00 ^= m00; \ + h13 ^= m13; \ + h12 ^= m12; \ + h11 ^= m11; \ + h10 ^= m10; \ + h23 ^= m23; \ + h22 ^= m22; \ + h21 ^= m21; \ + h20 ^= m20; \ + h33 ^= m33; \ + h32 ^= m32; \ + h31 ^= m31; \ + h30 ^= m30; + +#define INPUT_BUF2 \ + h43 ^= m03; \ + h42 ^= m02; \ + h41 ^= m01; \ + h40 ^= m00; \ + h53 ^= m13; \ + h52 ^= m12; \ + h51 ^= m11; \ + h50 ^= m10; \ + h63 ^= m23; \ + h62 ^= m22; \ + h61 ^= m21; \ + h60 ^= m20; \ + h73 ^= m33; \ + h72 ^= m32; \ + h71 ^= m31; \ + h70 ^= m30; + +static const sph_u32 IV224[] = { + C32e(0x2dfedd62), C32e(0xf99a98ac), C32e(0xae7cacd6), C32e(0x19d634e7), + C32e(0xa4831005), C32e(0xbc301216), C32e(0xb86038c6), C32e(0xc9661494), + C32e(0x66d9899f), C32e(0x2580706f), C32e(0xce9ea31b), C32e(0x1d9b1adc), + C32e(0x11e8325f), C32e(0x7b366e10), C32e(0xf994857f), C32e(0x02fa06c1), + C32e(0x1b4f1b5c), C32e(0xd8c840b3), C32e(0x97f6a17f), C32e(0x6e738099), + C32e(0xdcdf93a5), C32e(0xadeaa3d3), C32e(0xa431e8de), C32e(0xc9539a68), + C32e(0x22b4a98a), C32e(0xec86a1e4), C32e(0xd574ac95), C32e(0x9ce56cf0), + C32e(0x15960dea), C32e(0xb5ab2bbf), C32e(0x9611dcf0), C32e(0xdd64ea6e) +}; + +static const sph_u32 IV256[] = { + C32e(0xeb98a341), C32e(0x2c20d3eb), C32e(0x92cdbe7b), C32e(0x9cb245c1), + C32e(0x1c935191), C32e(0x60d4c7fa), C32e(0x260082d6), C32e(0x7e508a03), + C32e(0xa4239e26), C32e(0x7726b945), C32e(0xe0fb1a48), C32e(0xd41a9477), + C32e(0xcdb5ab26), C32e(0x026b177a), C32e(0x56f02442), C32e(0x0fff2fa8), + C32e(0x71a39689), C32e(0x7f2e4d75), C32e(0x1d144908), C32e(0xf77de262), + C32e(0x277695f7), C32e(0x76248f94), C32e(0x87d5b657), C32e(0x4780296c), + C32e(0x5c5e272d), C32e(0xac8e0d6c), C32e(0x518450c6), C32e(0x57057a0f), + C32e(0x7be4d367), C32e(0x702412ea), C32e(0x89e3ab13), C32e(0xd31cd769) +}; + +static const sph_u32 IV384[] = { + C32e(0x481e3bc6), C32e(0xd813398a), C32e(0x6d3b5e89), C32e(0x4ade879b), + C32e(0x63faea68), C32e(0xd480ad2e), C32e(0x332ccb21), C32e(0x480f8267), + C32e(0x98aec84d), C32e(0x9082b928), C32e(0xd455ea30), C32e(0x41114249), + C32e(0x36f555b2), C32e(0x924847ec), C32e(0xc7250a93), C32e(0xbaf43ce1), + C32e(0x569b7f8a), C32e(0x27db454c), C32e(0x9efcbd49), C32e(0x6397af0e), + C32e(0x589fc27d), C32e(0x26aa80cd), C32e(0x80c08b8c), C32e(0x9deb2eda), + C32e(0x8a7981e8), C32e(0xf8d5373a), C32e(0xf43967ad), C32e(0xddd17a71), + C32e(0xa9b4d3bd), C32e(0xa475d394), 
C32e(0x976c3fba), C32e(0x9842737f) +}; + +static const sph_u32 IV512[] = { + C32e(0x6fd14b96), C32e(0x3e00aa17), C32e(0x636a2e05), C32e(0x7a15d543), + C32e(0x8a225e8d), C32e(0x0c97ef0b), C32e(0xe9341259), C32e(0xf2b3c361), + C32e(0x891da0c1), C32e(0x536f801e), C32e(0x2aa9056b), C32e(0xea2b6d80), + C32e(0x588eccdb), C32e(0x2075baa6), C32e(0xa90f3a76), C32e(0xbaf83bf7), + C32e(0x0169e605), C32e(0x41e34a69), C32e(0x46b58a8e), C32e(0x2e6fe65a), + C32e(0x1047a7d0), C32e(0xc1843c24), C32e(0x3b6e71b1), C32e(0x2d5ac199), + C32e(0xcf57f6ec), C32e(0x9db1f856), C32e(0xa706887c), C32e(0x5716b156), + C32e(0xe3c2fcdf), C32e(0xe68517fb), C32e(0x545a4678), C32e(0xcc8cdd4b) +}; + +#endif + +#define SL(ro) SLu(r + ro, ro) + +#define SLu(r, ro) do { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + W ## ro(h1); \ + W ## ro(h3); \ + W ## ro(h5); \ + W ## ro(h7); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_JH + +#if SPH_JH_64 + +/* + * The "small footprint" 64-bit version just uses a partially unrolled + * loop. + */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#else + +#define E8 do { \ + unsigned r, g; \ + for (r = g = 0; r < 42; r ++) { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + switch (g) { \ + case 0: \ + W0(h1); \ + W0(h3); \ + W0(h5); \ + W0(h7); \ + break; \ + case 1: \ + W1(h1); \ + W1(h3); \ + W1(h5); \ + W1(h7); \ + break; \ + case 2: \ + W2(h1); \ + W2(h3); \ + W2(h5); \ + W2(h7); \ + break; \ + case 3: \ + W3(h1); \ + W3(h3); \ + W3(h5); \ + W3(h7); \ + break; \ + case 4: \ + W4(h1); \ + W4(h3); \ + W4(h5); \ + W4(h7); \ + break; \ + case 5: \ + W5(h1); \ + W5(h3); \ + W5(h5); \ + W5(h7); \ + break; \ + case 6: \ + W6(h1); \ + W6(h3); \ + W6(h5); \ + W6(h7); \ + break; \ + } \ + if (++ g == 7) \ + g = 0; \ + } \ + } while (0) + +#endif + +#else + +#if SPH_JH_64 + +/* + * On a "true 64-bit" architecture, we can unroll at will. + */ + +#define E8 do { \ + SLu( 0, 0); \ + SLu( 1, 1); \ + SLu( 2, 2); \ + SLu( 3, 3); \ + SLu( 4, 4); \ + SLu( 5, 5); \ + SLu( 6, 6); \ + SLu( 7, 0); \ + SLu( 8, 1); \ + SLu( 9, 2); \ + SLu(10, 3); \ + SLu(11, 4); \ + SLu(12, 5); \ + SLu(13, 6); \ + SLu(14, 0); \ + SLu(15, 1); \ + SLu(16, 2); \ + SLu(17, 3); \ + SLu(18, 4); \ + SLu(19, 5); \ + SLu(20, 6); \ + SLu(21, 0); \ + SLu(22, 1); \ + SLu(23, 2); \ + SLu(24, 3); \ + SLu(25, 4); \ + SLu(26, 5); \ + SLu(27, 6); \ + SLu(28, 0); \ + SLu(29, 1); \ + SLu(30, 2); \ + SLu(31, 3); \ + SLu(32, 4); \ + SLu(33, 5); \ + SLu(34, 6); \ + SLu(35, 0); \ + SLu(36, 1); \ + SLu(37, 2); \ + SLu(38, 3); \ + SLu(39, 4); \ + SLu(40, 5); \ + SLu(41, 6); \ + } while (0) + +#else + +/* + * We are not aiming at a small footprint, but we are still using a + * 32-bit implementation. Full loop unrolling would smash the L1 + * cache on some "big" architectures (32 kB L1 cache). 
+ */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#endif + +#endif + +static void +jh_init(sph_jh_context *sc, const void *iv) +{ + sc->ptr = 0; +#if SPH_JH_64 + memcpy(sc->H.wide, iv, sizeof sc->H.wide); +#else + memcpy(sc->H.narrow, iv, sizeof sc->H.narrow); +#endif +#if SPH_64 + sc->block_count = 0; +#else + sc->block_count_high = 0; + sc->block_count_low = 0; +#endif +} + +static void +jh_core(sph_jh_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INPUT_BUF1; + E8; + INPUT_BUF2; +#if SPH_64 + sc->block_count ++; +#else + if ((sc->block_count_low = SPH_T32( + sc->block_count_low + 1)) == 0) + sc->block_count_high ++; +#endif + ptr = 0; + } + } + WRITE_STATE(sc); + sc->ptr = ptr; +} + +static void +jh_close(sph_jh_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32, const void *iv) +{ + unsigned z; + unsigned char buf[128]; + size_t numz, u; +#if SPH_64 + sph_u64 l0, l1; +#else + sph_u32 l0, l1, l2, l3; +#endif + + z = 0x80 >> n; + buf[0] = ((ub & -z) | z) & 0xFF; + if (sc->ptr == 0 && n == 0) { + numz = 47; + } else { + numz = 111 - sc->ptr; + } + memset(buf + 1, 0, numz); +#if SPH_64 + l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3) + n; + l1 = SPH_T64(sc->block_count >> 55); + sph_enc64be(buf + numz + 1, l1); + sph_enc64be(buf + numz + 9, l0); +#else + l0 = SPH_T32(sc->block_count_low << 9) + (sc->ptr << 3) + n; + l1 = SPH_T32(sc->block_count_low >> 23) + + SPH_T32(sc->block_count_high << 9); + l2 = SPH_T32(sc->block_count_high >> 23); + l3 = 0; + sph_enc32be(buf + numz + 1, l3); + sph_enc32be(buf + numz + 5, l2); + sph_enc32be(buf + numz + 9, l1); + sph_enc32be(buf + numz + 13, l0); +#endif + jh_core(sc, buf, numz + 17); +#if SPH_JH_64 + for (u = 0; u < 8; u ++) + enc64e(buf + (u << 3), sc->H.wide[u + 8]); +#else + for (u = 0; u < 16; u ++) + enc32e(buf + (u << 2), sc->H.narrow[u + 16]); +#endif + memcpy(dst, buf + ((16 - out_size_w32) << 2), out_size_w32 << 2); + jh_init(sc, iv); +} + +/* see sph_jh.h */ +void +sph_jh224_init(void *cc) +{ + jh_init(cc, IV224); +} + +/* see sph_jh.h */ +void +sph_jh224(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh224_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 7, IV224); +} + +/* see sph_jh.h */ +void +sph_jh224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 7, IV224); +} + +/* see sph_jh.h */ +void +sph_jh256_init(void *cc) +{ + jh_init(cc, IV256); +} + +/* see sph_jh.h */ +void +sph_jh256(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh256_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 8, IV256); +} + +/* see sph_jh.h */ +void +sph_jh256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 8, IV256); +} + +/* see sph_jh.h */ +void +sph_jh384_init(void *cc) +{ + jh_init(cc, IV384); +} + +/* see sph_jh.h */ +void 
+sph_jh384(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh384_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 12, IV384); +} + +/* see sph_jh.h */ +void +sph_jh384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 12, IV384); +} + +/* see sph_jh.h */ +void +sph_jh512_init(void *cc) +{ + jh_init(cc, IV512); +} + +/* see sph_jh.h */ +void +sph_jh512(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh512_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 16, IV512); +} + +/* see sph_jh.h */ +void +sph_jh512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 16, IV512); +} diff --git a/keccak.c b/sph/keccak.c similarity index 100% rename from keccak.c rename to sph/keccak.c diff --git a/sph/luffa.c b/sph/luffa.c new file mode 100644 index 0000000..a761bea --- /dev/null +++ b/sph/luffa.c @@ -0,0 +1,1426 @@ +/* $Id: luffa.c 219 2010-06-08 17:24:41Z tp $ */ +/* + * Luffa implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include +#include + +#include "sph_luffa.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_64_TRUE && !defined SPH_LUFFA_PARALLEL +#define SPH_LUFFA_PARALLEL 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 V_INIT[5][8] = { + { + SPH_C32(0x6d251e69), SPH_C32(0x44b051e0), + SPH_C32(0x4eaa6fb4), SPH_C32(0xdbf78465), + SPH_C32(0x6e292011), SPH_C32(0x90152df4), + SPH_C32(0xee058139), SPH_C32(0xdef610bb) + }, { + SPH_C32(0xc3b44b95), SPH_C32(0xd9d2f256), + SPH_C32(0x70eee9a0), SPH_C32(0xde099fa3), + SPH_C32(0x5d9b0557), SPH_C32(0x8fc944b3), + SPH_C32(0xcf1ccf0e), SPH_C32(0x746cd581) + }, { + SPH_C32(0xf7efc89d), SPH_C32(0x5dba5781), + SPH_C32(0x04016ce5), SPH_C32(0xad659c05), + SPH_C32(0x0306194f), SPH_C32(0x666d1836), + SPH_C32(0x24aa230a), SPH_C32(0x8b264ae7) + }, { + SPH_C32(0x858075d5), SPH_C32(0x36d79cce), + SPH_C32(0xe571f7d7), SPH_C32(0x204b1f67), + SPH_C32(0x35870c6a), SPH_C32(0x57e9e923), + SPH_C32(0x14bcb808), SPH_C32(0x7cde72ce) + }, { + SPH_C32(0x6c68e9be), SPH_C32(0x5ec41e22), + SPH_C32(0xc825b7c7), SPH_C32(0xaffb4363), + SPH_C32(0xf5df3999), SPH_C32(0x0fc688f1), + SPH_C32(0xb07224cc), SPH_C32(0x03e86cea) + } +}; + +static const sph_u32 RC00[8] = { + SPH_C32(0x303994a6), SPH_C32(0xc0e65299), + SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e), + SPH_C32(0x1e00108f), SPH_C32(0x7800423d), + SPH_C32(0x8f5b7882), SPH_C32(0x96e1db12) +}; + +static const sph_u32 RC04[8] = { + SPH_C32(0xe0337818), SPH_C32(0x441ba90d), + SPH_C32(0x7f34d442), SPH_C32(0x9389217f), + SPH_C32(0xe5a8bce6), SPH_C32(0x5274baf4), + SPH_C32(0x26889ba7), SPH_C32(0x9a226e9d) +}; + +static const sph_u32 RC10[8] = { + SPH_C32(0xb6de10ed), SPH_C32(0x70f47aae), + SPH_C32(0x0707a3d4), SPH_C32(0x1c1e8f51), + SPH_C32(0x707a3d45), SPH_C32(0xaeb28562), + SPH_C32(0xbaca1589), SPH_C32(0x40a46f3e) +}; + +static const sph_u32 RC14[8] = { + SPH_C32(0x01685f3d), SPH_C32(0x05a17cf4), + SPH_C32(0xbd09caca), SPH_C32(0xf4272b28), + SPH_C32(0x144ae5cc), SPH_C32(0xfaa7ae2b), + SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704) +}; + +#if SPH_LUFFA_PARALLEL + +static const sph_u64 RCW010[8] = { + SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299), + SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e), + SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d), + SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12) +}; + +static const sph_u64 RCW014[8] = { + SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d), + SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f), + SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4), + SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d) +}; + +#endif + +static const sph_u32 RC20[8] = { + SPH_C32(0xfc20d9d2), SPH_C32(0x34552e25), + SPH_C32(0x7ad8818f), SPH_C32(0x8438764a), + SPH_C32(0xbb6de032), SPH_C32(0xedb780c8), + SPH_C32(0xd9847356), SPH_C32(0xa2c78434) +}; + +static const sph_u32 RC24[8] = { + SPH_C32(0xe25e72c1), SPH_C32(0xe623bb72), + SPH_C32(0x5c58a4a4), SPH_C32(0x1e38e2e7), + SPH_C32(0x78e38b9d), SPH_C32(0x27586719), + SPH_C32(0x36eda57f), SPH_C32(0x703aace7) +}; + +static const sph_u32 RC30[8] = { + SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95), + SPH_C32(0x4e608a22), SPH_C32(0x56d858fe), + SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d), + SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208) +}; + +static const sph_u32 RC34[8] = { + SPH_C32(0xe028c9bf), SPH_C32(0x44756f91), + SPH_C32(0x7e8fce32), SPH_C32(0x956548be), + 
SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5), + SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355) +}; + +#if SPH_LUFFA_PARALLEL + +static const sph_u64 RCW230[8] = { + SPH_C64(0xb213afa5fc20d9d2), SPH_C64(0xc84ebe9534552e25), + SPH_C64(0x4e608a227ad8818f), SPH_C64(0x56d858fe8438764a), + SPH_C64(0x343b138fbb6de032), SPH_C64(0xd0ec4e3dedb780c8), + SPH_C64(0x2ceb4882d9847356), SPH_C64(0xb3ad2208a2c78434) +}; + + +static const sph_u64 RCW234[8] = { + SPH_C64(0xe028c9bfe25e72c1), SPH_C64(0x44756f91e623bb72), + SPH_C64(0x7e8fce325c58a4a4), SPH_C64(0x956548be1e38e2e7), + SPH_C64(0xfe191be278e38b9d), SPH_C64(0x3cb226e527586719), + SPH_C64(0x5944a28e36eda57f), SPH_C64(0xa1c4c355703aace7) +}; + +#endif + +static const sph_u32 RC40[8] = { + SPH_C32(0xf0d2e9e3), SPH_C32(0xac11d7fa), + SPH_C32(0x1bcb66f2), SPH_C32(0x6f2d9bc9), + SPH_C32(0x78602649), SPH_C32(0x8edae952), + SPH_C32(0x3b6ba548), SPH_C32(0xedae9520) +}; + +static const sph_u32 RC44[8] = { + SPH_C32(0x5090d577), SPH_C32(0x2d1925ab), + SPH_C32(0xb46496ac), SPH_C32(0xd1925ab0), + SPH_C32(0x29131ab6), SPH_C32(0x0fc053c3), + SPH_C32(0x3f014f0c), SPH_C32(0xfc053c31) +}; + +#define DECL_TMP8(w) \ + sph_u32 w ## 0, w ## 1, w ## 2, w ## 3, w ## 4, w ## 5, w ## 6, w ## 7; + +#define M2(d, s) do { \ + sph_u32 tmp = s ## 7; \ + d ## 7 = s ## 6; \ + d ## 6 = s ## 5; \ + d ## 5 = s ## 4; \ + d ## 4 = s ## 3 ^ tmp; \ + d ## 3 = s ## 2 ^ tmp; \ + d ## 2 = s ## 1; \ + d ## 1 = s ## 0 ^ tmp; \ + d ## 0 = tmp; \ + } while (0) + +#define XOR(d, s1, s2) do { \ + d ## 0 = s1 ## 0 ^ s2 ## 0; \ + d ## 1 = s1 ## 1 ^ s2 ## 1; \ + d ## 2 = s1 ## 2 ^ s2 ## 2; \ + d ## 3 = s1 ## 3 ^ s2 ## 3; \ + d ## 4 = s1 ## 4 ^ s2 ## 4; \ + d ## 5 = s1 ## 5 ^ s2 ## 5; \ + d ## 6 = s1 ## 6 ^ s2 ## 6; \ + d ## 7 = s1 ## 7 ^ s2 ## 7; \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define SUB_CRUMB_GEN(a0, a1, a2, a3, width) do { \ + sph_u ## width tmp; \ + tmp = (a0); \ + (a0) |= (a1); \ + (a2) ^= (a3); \ + (a1) = SPH_T ## width(~(a1)); \ + (a0) ^= (a3); \ + (a3) &= tmp; \ + (a1) ^= (a3); \ + (a3) ^= (a2); \ + (a2) &= (a0); \ + (a0) = SPH_T ## width(~(a0)); \ + (a2) ^= (a1); \ + (a1) |= (a3); \ + tmp ^= (a1); \ + (a3) ^= (a2); \ + (a2) &= (a1); \ + (a1) ^= (a0); \ + (a0) = tmp; \ + } while (0) + +#define SUB_CRUMB(a0, a1, a2, a3) SUB_CRUMB_GEN(a0, a1, a2, a3, 32) +#define SUB_CRUMBW(a0, a1, a2, a3) SUB_CRUMB_GEN(a0, a1, a2, a3, 64) + + +#if 0 + +#define ROL32W(x, n) SPH_T64( \ + (((x) << (n)) \ + & ~((SPH_C64(0xFFFFFFFF) >> (32 - (n))) << 32)) \ + | (((x) >> (32 - (n))) \ + & ~((SPH_C64(0xFFFFFFFF) >> (n)) << (n)))) + +#define MIX_WORDW(u, v) do { \ + (v) ^= (u); \ + (u) = ROL32W((u), 2) ^ (v); \ + (v) = ROL32W((v), 14) ^ (u); \ + (u) = ROL32W((u), 10) ^ (v); \ + (v) = ROL32W((v), 1); \ + } while (0) + +#endif + +#define MIX_WORDW(u, v) do { \ + sph_u32 ul, uh, vl, vh; \ + (v) ^= (u); \ + ul = SPH_T32((sph_u32)(u)); \ + uh = SPH_T32((sph_u32)((u) >> 32)); \ + vl = SPH_T32((sph_u32)(v)); \ + vh = SPH_T32((sph_u32)((v) >> 32)); \ + ul = SPH_ROTL32(ul, 2) ^ vl; \ + vl = SPH_ROTL32(vl, 14) ^ ul; \ + ul = SPH_ROTL32(ul, 10) ^ vl; \ + vl = SPH_ROTL32(vl, 1); \ + uh = SPH_ROTL32(uh, 2) ^ vh; \ + vh = SPH_ROTL32(vh, 14) ^ uh; \ + uh = SPH_ROTL32(uh, 10) ^ vh; \ + vh = SPH_ROTL32(vh, 1); \ + (u) = (sph_u64)ul | ((sph_u64)uh << 32); \ + (v) = (sph_u64)vl | ((sph_u64)vh << 32); \ + } while (0) + +#else + +#define SUB_CRUMB(a0, a1, a2, a3) do { \ + sph_u32 tmp; \ + tmp = (a0); \ + (a0) |= (a1); \ + (a2) ^= (a3); \ + (a1) = SPH_T32(~(a1)); \ + (a0) ^= (a3); \ + (a3) &= tmp; \ + (a1) ^= (a3); \ + (a3) ^= (a2); \ 
+ (a2) &= (a0); \ + (a0) = SPH_T32(~(a0)); \ + (a2) ^= (a1); \ + (a1) |= (a3); \ + tmp ^= (a1); \ + (a3) ^= (a2); \ + (a2) &= (a1); \ + (a1) ^= (a0); \ + (a0) = tmp; \ + } while (0) + +#endif + +#define MIX_WORD(u, v) do { \ + (v) ^= (u); \ + (u) = SPH_ROTL32((u), 2) ^ (v); \ + (v) = SPH_ROTL32((v), 14) ^ (u); \ + (u) = SPH_ROTL32((u), 10) ^ (v); \ + (v) = SPH_ROTL32((v), 1); \ + } while (0) + +#define DECL_STATE3 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; + +#define READ_STATE3(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + } while (0) + +#define WRITE_STATE3(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + (state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + (state)->V[2][7] = V27; \ + } while (0) + +#define MI3 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(a, a, V2); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V0, M, V0); \ + M2(M, M); \ + XOR(V1, a, V1); \ + XOR(V1, M, V1); \ + M2(M, M); \ + XOR(V2, a, V2); \ + XOR(V2, M, V2); \ + } while (0) + +#define TWEAK3 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P3 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK3; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); 
\ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + } while (0) + +#else + +#define P3 do { \ + int r; \ + TWEAK3; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + } while (0) + +#endif + +#define DECL_STATE4 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \ + sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; + +#define READ_STATE4(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + V30 = (state)->V[3][0]; \ + V31 = (state)->V[3][1]; \ + V32 = (state)->V[3][2]; \ + V33 = (state)->V[3][3]; \ + V34 = (state)->V[3][4]; \ + V35 = (state)->V[3][5]; \ + V36 = (state)->V[3][6]; \ + V37 = (state)->V[3][7]; \ + } while (0) + +#define WRITE_STATE4(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + (state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + 
(state)->V[2][7] = V27; \ + (state)->V[3][0] = V30; \ + (state)->V[3][1] = V31; \ + (state)->V[3][2] = V32; \ + (state)->V[3][3] = V33; \ + (state)->V[3][4] = V34; \ + (state)->V[3][5] = V35; \ + (state)->V[3][6] = V36; \ + (state)->V[3][7] = V37; \ + } while (0) + +#define MI4 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + DECL_TMP8(b) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(b, V2, V3); \ + XOR(a, a, b); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V1, a, V1); \ + XOR(V2, a, V2); \ + XOR(V3, a, V3); \ + M2(b, V0); \ + XOR(b, b, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V1); \ + M2(V1, V1); \ + XOR(V1, V1, V0); \ + XOR(V0, b, M); \ + M2(M, M); \ + XOR(V1, V1, M); \ + M2(M, M); \ + XOR(V2, V2, M); \ + M2(M, M); \ + XOR(V3, V3, M); \ + } while (0) + +#define TWEAK4 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + V34 = SPH_ROTL32(V34, 3); \ + V35 = SPH_ROTL32(V35, 3); \ + V36 = SPH_ROTL32(V36, 3); \ + V37 = SPH_ROTL32(V37, 3); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P4 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK4; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \ + W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \ + W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \ + W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \ + W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \ + W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \ + W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \ + W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW230[r]; \ + W4 ^= RCW234[r]; \ + } \ + V20 = SPH_T32((sph_u32)W0); \ + V30 = SPH_T32((sph_u32)(W0 >> 32)); \ + V21 = SPH_T32((sph_u32)W1); \ + V31 = SPH_T32((sph_u32)(W1 >> 32)); \ + V22 = SPH_T32((sph_u32)W2); 
\ + V32 = SPH_T32((sph_u32)(W2 >> 32)); \ + V23 = SPH_T32((sph_u32)W3); \ + V33 = SPH_T32((sph_u32)(W3 >> 32)); \ + V24 = SPH_T32((sph_u32)W4); \ + V34 = SPH_T32((sph_u32)(W4 >> 32)); \ + V25 = SPH_T32((sph_u32)W5); \ + V35 = SPH_T32((sph_u32)(W5 >> 32)); \ + V26 = SPH_T32((sph_u32)W6); \ + V36 = SPH_T32((sph_u32)(W6 >> 32)); \ + V27 = SPH_T32((sph_u32)W7); \ + V37 = SPH_T32((sph_u32)(W7 >> 32)); \ + } while (0) + +#else + +#define P4 do { \ + int r; \ + TWEAK4; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V30, V31, V32, V33); \ + SUB_CRUMB(V35, V36, V37, V34); \ + MIX_WORD(V30, V34); \ + MIX_WORD(V31, V35); \ + MIX_WORD(V32, V36); \ + MIX_WORD(V33, V37); \ + V30 ^= RC30[r]; \ + V34 ^= RC34[r]; \ + } \ + } while (0) + +#endif + +#define DECL_STATE5 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \ + sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; \ + sph_u32 V40, V41, V42, V43, V44, V45, V46, V47; + +#define READ_STATE5(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + V30 = (state)->V[3][0]; \ + V31 = (state)->V[3][1]; \ + V32 = (state)->V[3][2]; \ + V33 = (state)->V[3][3]; \ + V34 = (state)->V[3][4]; \ + V35 = (state)->V[3][5]; \ + V36 = (state)->V[3][6]; \ + V37 = (state)->V[3][7]; \ + V40 = (state)->V[4][0]; \ + V41 = (state)->V[4][1]; \ + V42 = (state)->V[4][2]; \ + V43 = (state)->V[4][3]; \ + V44 = (state)->V[4][4]; \ + V45 = (state)->V[4][5]; \ + V46 = (state)->V[4][6]; \ + V47 = (state)->V[4][7]; \ + } while (0) + +#define WRITE_STATE5(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + 
(state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + (state)->V[2][7] = V27; \ + (state)->V[3][0] = V30; \ + (state)->V[3][1] = V31; \ + (state)->V[3][2] = V32; \ + (state)->V[3][3] = V33; \ + (state)->V[3][4] = V34; \ + (state)->V[3][5] = V35; \ + (state)->V[3][6] = V36; \ + (state)->V[3][7] = V37; \ + (state)->V[4][0] = V40; \ + (state)->V[4][1] = V41; \ + (state)->V[4][2] = V42; \ + (state)->V[4][3] = V43; \ + (state)->V[4][4] = V44; \ + (state)->V[4][5] = V45; \ + (state)->V[4][6] = V46; \ + (state)->V[4][7] = V47; \ + } while (0) + +#define MI5 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + DECL_TMP8(b) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(b, V2, V3); \ + XOR(a, a, b); \ + XOR(a, a, V4); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V1, a, V1); \ + XOR(V2, a, V2); \ + XOR(V3, a, V3); \ + XOR(V4, a, V4); \ + M2(b, V0); \ + XOR(b, b, V1); \ + M2(V1, V1); \ + XOR(V1, V1, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V4); \ + M2(V4, V4); \ + XOR(V4, V4, V0); \ + M2(V0, b); \ + XOR(V0, V0, V4); \ + M2(V4, V4); \ + XOR(V4, V4, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V1); \ + M2(V1, V1); \ + XOR(V1, V1, b); \ + XOR(V0, V0, M); \ + M2(M, M); \ + XOR(V1, V1, M); \ + M2(M, M); \ + XOR(V2, V2, M); \ + M2(M, M); \ + XOR(V3, V3, M); \ + M2(M, M); \ + XOR(V4, V4, M); \ + } while (0) + +#define TWEAK5 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + V34 = SPH_ROTL32(V34, 3); \ + V35 = SPH_ROTL32(V35, 3); \ + V36 = SPH_ROTL32(V36, 3); \ + V37 = SPH_ROTL32(V37, 3); \ + V44 = SPH_ROTL32(V44, 4); \ + V45 = SPH_ROTL32(V45, 4); \ + V46 = SPH_ROTL32(V46, 4); \ + V47 = SPH_ROTL32(V47, 4); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P5 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK5; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \ + W1 = (sph_u64)V21 | ((sph_u64)V31 
<< 32); \ + W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \ + W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \ + W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \ + W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \ + W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \ + W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW230[r]; \ + W4 ^= RCW234[r]; \ + } \ + V20 = SPH_T32((sph_u32)W0); \ + V30 = SPH_T32((sph_u32)(W0 >> 32)); \ + V21 = SPH_T32((sph_u32)W1); \ + V31 = SPH_T32((sph_u32)(W1 >> 32)); \ + V22 = SPH_T32((sph_u32)W2); \ + V32 = SPH_T32((sph_u32)(W2 >> 32)); \ + V23 = SPH_T32((sph_u32)W3); \ + V33 = SPH_T32((sph_u32)(W3 >> 32)); \ + V24 = SPH_T32((sph_u32)W4); \ + V34 = SPH_T32((sph_u32)(W4 >> 32)); \ + V25 = SPH_T32((sph_u32)W5); \ + V35 = SPH_T32((sph_u32)(W5 >> 32)); \ + V26 = SPH_T32((sph_u32)W6); \ + V36 = SPH_T32((sph_u32)(W6 >> 32)); \ + V27 = SPH_T32((sph_u32)W7); \ + V37 = SPH_T32((sph_u32)(W7 >> 32)); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V40, V41, V42, V43); \ + SUB_CRUMB(V45, V46, V47, V44); \ + MIX_WORD(V40, V44); \ + MIX_WORD(V41, V45); \ + MIX_WORD(V42, V46); \ + MIX_WORD(V43, V47); \ + V40 ^= RC40[r]; \ + V44 ^= RC44[r]; \ + } \ + } while (0) + +#else + +#define P5 do { \ + int r; \ + TWEAK5; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V30, V31, V32, V33); \ + SUB_CRUMB(V35, V36, V37, V34); \ + MIX_WORD(V30, V34); \ + MIX_WORD(V31, V35); \ + MIX_WORD(V32, V36); \ + MIX_WORD(V33, V37); \ + V30 ^= RC30[r]; \ + V34 ^= RC34[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V40, V41, V42, V43); \ + SUB_CRUMB(V45, V46, V47, V44); \ + MIX_WORD(V40, V44); \ + MIX_WORD(V41, V45); \ + MIX_WORD(V42, V46); \ + MIX_WORD(V43, V47); \ + V40 ^= RC40[r]; \ + V44 ^= RC44[r]; \ + } \ + } while (0) + +#endif + +static void +luffa3(sph_luffa224_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE3 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE3(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI3; + P3; + ptr = 0; + } + } + WRITE_STATE3(sc); + sc->ptr = ptr; +} + +static void +luffa3_close(sph_luffa224_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE3 + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; 
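+	/*
+	 * Finalization: the byte written above carries the final "1"
+	 * padding bit (0x80 >> n within the last partial byte); the
+	 * rest of the block is zeroed below.  The padded block and
+	 * then one all-zero block are absorbed (the i < 2 loop), and
+	 * the digest is the XOR of the three 256-bit chains (seven
+	 * output words for Luffa-224, eight for Luffa-256).
+	 */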
+ memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE3(sc); + for (i = 0; i < 2; i ++) { + MI3; + P3; + memset(buf, 0, sizeof sc->buf); + } + out = dst; + sph_enc32be(out + 0, V00 ^ V10 ^ V20); + sph_enc32be(out + 4, V01 ^ V11 ^ V21); + sph_enc32be(out + 8, V02 ^ V12 ^ V22); + sph_enc32be(out + 12, V03 ^ V13 ^ V23); + sph_enc32be(out + 16, V04 ^ V14 ^ V24); + sph_enc32be(out + 20, V05 ^ V15 ^ V25); + sph_enc32be(out + 24, V06 ^ V16 ^ V26); + if (out_size_w32 > 7) + sph_enc32be(out + 28, V07 ^ V17 ^ V27); +} + +static void +luffa4(sph_luffa384_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE4 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE4(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI4; + P4; + ptr = 0; + } + } + WRITE_STATE4(sc); + sc->ptr = ptr; +} + +static void +luffa4_close(sph_luffa384_context *sc, unsigned ub, unsigned n, void *dst) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE4 + + buf = sc->buf; + ptr = sc->ptr; + out = dst; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE4(sc); + for (i = 0; i < 3; i ++) { + MI4; + P4; + switch (i) { + case 0: + memset(buf, 0, sizeof sc->buf); + break; + case 1: + sph_enc32be(out + 0, V00 ^ V10 ^ V20 ^ V30); + sph_enc32be(out + 4, V01 ^ V11 ^ V21 ^ V31); + sph_enc32be(out + 8, V02 ^ V12 ^ V22 ^ V32); + sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33); + sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34); + sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35); + sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36); + sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37); + break; + case 2: + sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30); + sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31); + sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32); + sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33); + break; + } + } +} + +static void +luffa5(sph_luffa512_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE5 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE5(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI5; + P5; + ptr = 0; + } + } + WRITE_STATE5(sc); + sc->ptr = ptr; +} + +static void +luffa5_close(sph_luffa512_context *sc, unsigned ub, unsigned n, void *dst) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE5 + + buf = sc->buf; + ptr = sc->ptr; + out = dst; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE5(sc); + for (i = 0; i < 3; i ++) { + MI5; + P5; + switch (i) { + case 0: + memset(buf, 0, sizeof sc->buf); + break; + case 1: + sph_enc32be(out + 0, V00 ^ V10 ^ V20 ^ V30 ^ V40); + sph_enc32be(out + 4, V01 ^ V11 ^ V21 ^ V31 ^ V41); + sph_enc32be(out + 8, V02 ^ V12 ^ V22 ^ V32 ^ V42); + sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33 ^ V43); + sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ 
V34 ^ V44); + sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35 ^ V45); + sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36 ^ V46); + sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37 ^ V47); + break; + case 2: + sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30 ^ V40); + sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31 ^ V41); + sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32 ^ V42); + sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33 ^ V43); + sph_enc32be(out + 48, V04 ^ V14 ^ V24 ^ V34 ^ V44); + sph_enc32be(out + 52, V05 ^ V15 ^ V25 ^ V35 ^ V45); + sph_enc32be(out + 56, V06 ^ V16 ^ V26 ^ V36 ^ V46); + sph_enc32be(out + 60, V07 ^ V17 ^ V27 ^ V37 ^ V47); + break; + } + } +} + +/* see sph_luffa.h */ +void +sph_luffa224_init(void *cc) +{ + sph_luffa224_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa224(void *cc, const void *data, size_t len) +{ + luffa3(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa224_close(void *cc, void *dst) +{ + sph_luffa224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa3_close(cc, ub, n, dst, 7); + sph_luffa224_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa256_init(void *cc) +{ + sph_luffa256_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa256(void *cc, const void *data, size_t len) +{ + luffa3(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa256_close(void *cc, void *dst) +{ + sph_luffa256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa3_close(cc, ub, n, dst, 8); + sph_luffa256_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa384_init(void *cc) +{ + sph_luffa384_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa384(void *cc, const void *data, size_t len) +{ + luffa4(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa384_close(void *cc, void *dst) +{ + sph_luffa384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa4_close(cc, ub, n, dst); + sph_luffa384_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa512_init(void *cc) +{ + sph_luffa512_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa512(void *cc, const void *data, size_t len) +{ + luffa5(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa512_close(void *cc, void *dst) +{ + sph_luffa512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa5_close(cc, ub, n, dst); + sph_luffa512_init(cc); +} + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sph/shavite.c b/sph/shavite.c new file mode 100644 index 0000000..85074f3 --- /dev/null +++ b/sph/shavite.c @@ -0,0 +1,1764 @@ +/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SHAvite-3 implementation. 
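+ *
+ * SHAvite-3 is an AES-based SHA-3 second-round candidate: the
+ * compression functions below (c256 for the 224/256-bit output
+ * sizes, c512 for 384/512) are built from keyless AES rounds,
+ * provided by aes_helper.c through the AES_ROUND_NOKEY macro.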
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_shavite.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE +#define SPH_SMALL_FOOTPRINT_SHAVITE 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define C32 SPH_C32 + +/* + * As of round 2 of the SHA-3 competition, the published reference + * implementation and test vectors are wrong, because they use + * big-endian AES tables while the internal decoding uses little-endian. + * The code below follows the specification. To turn it into a code + * which follows the reference implementation (the one called "BugFix" + * on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out + * the code below (from the '#define AES_BIG_ENDIAN...' to the definition + * of the AES_ROUND_NOKEY macro) and replace it with the version which + * is commented out afterwards. 
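+ *
+ * In short: as shipped (AES_BIG_ENDIAN set to 0 below), this
+ * file follows the specification and uses AES_ROUND_NOKEY_LE;
+ * the commented-out alternative switches to AES_ROUND_NOKEY_BE
+ * together with a different set of IV constants, and reproduces
+ * the "BugFix" reference implementation instead.  The two
+ * variants yield different digests for the same input.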
+ */ + +#define AES_BIG_ENDIAN 0 +#include "aes_helper.c" + +static const sph_u32 IV224[] = { + C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371), + C32(0x62B2AEA8), C32(0x4B5801D8), C32(0x1B702860), C32(0x842F3017) +}; + +static const sph_u32 IV256[] = { + C32(0x49BB3E47), C32(0x2674860D), C32(0xA8B392AC), C32(0x021AC4E6), + C32(0x409283CF), C32(0x620E5D86), C32(0x6D929DCB), C32(0x96CC2A8B) +}; + +static const sph_u32 IV384[] = { + C32(0x83DF1545), C32(0xF9AAEC13), C32(0xF4803CB0), C32(0x11FE1F47), + C32(0xDA6CD269), C32(0x4F53FCD7), C32(0x950529A2), C32(0x97908147), + C32(0xB0A4D7AF), C32(0x2B9132BF), C32(0x226E607D), C32(0x3C0F8D7C), + C32(0x487B3F0F), C32(0x04363E22), C32(0x0155C99C), C32(0xEC2E20D3) +}; + +static const sph_u32 IV512[] = { + C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC), + C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC), + C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47), + C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) +}; + +#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ + sph_u32 t0 = (x0); \ + sph_u32 t1 = (x1); \ + sph_u32 t2 = (x2); \ + sph_u32 t3 = (x3); \ + AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \ + } while (0) + +/* + * This is the code needed to match the "reference implementation" as + * published on Nov 23rd, 2009, instead of the published specification. + * + +#define AES_BIG_ENDIAN 1 +#include "aes_helper.c" + +static const sph_u32 IV224[] = { + C32(0xC4C67795), C32(0xC0B1817F), C32(0xEAD88924), C32(0x1ABB1BB0), + C32(0xE0C29152), C32(0xBDE046BA), C32(0xAEEECF99), C32(0x58D509D8) +}; + +static const sph_u32 IV256[] = { + C32(0x3EECF551), C32(0xBF10819B), C32(0xE6DC8559), C32(0xF3E23FD5), + C32(0x431AEC73), C32(0x79E3F731), C32(0x98325F05), C32(0xA92A31F1) +}; + +static const sph_u32 IV384[] = { + C32(0x71F48510), C32(0xA903A8AC), C32(0xFE3216DD), C32(0x0B2D2AD4), + C32(0x6672900A), C32(0x41032819), C32(0x15A7D780), C32(0xB3CAB8D9), + C32(0x34EF4711), C32(0xDE019FE8), C32(0x4D674DC4), C32(0xE056D96B), + C32(0xA35C016B), C32(0xDD903BA7), C32(0x8C1B09B4), C32(0x2C3E9F25) +}; + +static const sph_u32 IV512[] = { + C32(0xD5652B63), C32(0x25F1E6EA), C32(0xB18F48FA), C32(0xA1EE3A47), + C32(0xC8B67B07), C32(0xBDCE48D3), C32(0xE3937B78), C32(0x05DB5186), + C32(0x613BE326), C32(0xA11FA303), C32(0x90C833D4), C32(0x79CEE316), + C32(0x1E1AF00F), C32(0x2829B165), C32(0x23B25F80), C32(0x21E11499) +}; + +#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ + sph_u32 t0 = (x0); \ + sph_u32 t1 = (x1); \ + sph_u32 t2 = (x2); \ + sph_u32 t3 = (x3); \ + AES_ROUND_NOKEY_BE(t0, t1, t2, t3, x0, x1, x2, x3); \ + } while (0) + + */ + +#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \ + sph_u32 kt; \ + AES_ROUND_NOKEY(k1, k2, k3, k0); \ + kt = (k0); \ + (k0) = (k1); \ + (k1) = (k2); \ + (k2) = (k3); \ + (k3) = kt; \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SHAVITE + +/* + * This function assumes that "msg" is aligned for 32-bit access. 
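+ *
+ * In this small-footprint variant the 64-byte message block is
+ * expanded up front into the full rk[144] round-key array; the
+ * block counter is folded in at the fixed positions rk[16]/rk[17],
+ * rk[57]/rk[58], rk[86]/rk[87] and rk[124]/rk[127], with one
+ * word of each pair complemented.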
+ */ +static void +c256(sph_shavite_small_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 rk[144]; + size_t u; + int r, s; + +#if SPH_LITTLE_ENDIAN + memcpy(rk, msg, 64); +#else + for (u = 0; u < 16; u += 4) { + rk[u + 0] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 0); + rk[u + 1] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 4); + rk[u + 2] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 8); + rk[u + 3] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 12); + } +#endif + u = 16; + for (r = 0; r < 4; r ++) { + for (s = 0; s < 2; s ++) { + sph_u32 x0, x1, x2, x3; + + x0 = rk[u - 15]; + x1 = rk[u - 14]; + x2 = rk[u - 13]; + x3 = rk[u - 16]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 16) { + rk[ 16] ^= sc->count0; + rk[ 17] ^= SPH_T32(~sc->count1); + } else if (u == 56) { + rk[ 57] ^= sc->count1; + rk[ 58] ^= SPH_T32(~sc->count0); + } + u += 4; + + x0 = rk[u - 15]; + x1 = rk[u - 14]; + x2 = rk[u - 13]; + x3 = rk[u - 16]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 84) { + rk[ 86] ^= sc->count1; + rk[ 87] ^= SPH_T32(~sc->count0); + } else if (u == 124) { + rk[124] ^= sc->count0; + rk[127] ^= SPH_T32(~sc->count1); + } + u += 4; + } + for (s = 0; s < 4; s ++) { + rk[u + 0] = rk[u - 16] ^ rk[u - 3]; + rk[u + 1] = rk[u - 15] ^ rk[u - 2]; + rk[u + 2] = rk[u - 14] ^ rk[u - 1]; + rk[u + 3] = rk[u - 13] ^ rk[u - 0]; + u += 4; + } + } + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + u = 0; + for (r = 0; r < 6; r ++) { + sph_u32 x0, x1, x2, x3; + + x0 = p4 ^ rk[u ++]; + x1 = p5 ^ rk[u ++]; + x2 = p6 ^ rk[u ++]; + x3 = p7 ^ rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + x0 = p0 ^ rk[u ++]; + x1 = p1 ^ rk[u ++]; + x2 = p2 ^ rk[u ++]; + x3 = p3 ^ rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + } + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; +} + +#else + +/* + * This function assumes that "msg" is aligned for 32-bit access. 
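+ *
+ * Unlike the small-footprint variant above, this version keeps
+ * the sixteen live round-key words in locals (rk0..rkF) and
+ * interleaves the key schedule (KEY_EXPAND_ELT plus XOR chaining)
+ * with the AES rounds, instead of materializing the whole
+ * rk[144] array.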
+ */ +static void +c256(sph_shavite_small_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 x0, x1, x2, x3; + sph_u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7; + sph_u32 rk8, rk9, rkA, rkB, rkC, rkD, rkE, rkF; + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + /* round 0 */ + rk0 = sph_dec32le_aligned((const unsigned char *)msg + 0); + x0 = p4 ^ rk0; + rk1 = sph_dec32le_aligned((const unsigned char *)msg + 4); + x1 = p5 ^ rk1; + rk2 = sph_dec32le_aligned((const unsigned char *)msg + 8); + x2 = p6 ^ rk2; + rk3 = sph_dec32le_aligned((const unsigned char *)msg + 12); + x3 = p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 = sph_dec32le_aligned((const unsigned char *)msg + 16); + x0 ^= rk4; + rk5 = sph_dec32le_aligned((const unsigned char *)msg + 20); + x1 ^= rk5; + rk6 = sph_dec32le_aligned((const unsigned char *)msg + 24); + x2 ^= rk6; + rk7 = sph_dec32le_aligned((const unsigned char *)msg + 28); + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 = sph_dec32le_aligned((const unsigned char *)msg + 32); + x0 ^= rk8; + rk9 = sph_dec32le_aligned((const unsigned char *)msg + 36); + x1 ^= rk9; + rkA = sph_dec32le_aligned((const unsigned char *)msg + 40); + x2 ^= rkA; + rkB = sph_dec32le_aligned((const unsigned char *)msg + 44); + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 1 */ + rkC = sph_dec32le_aligned((const unsigned char *)msg + 48); + x0 = p0 ^ rkC; + rkD = sph_dec32le_aligned((const unsigned char *)msg + 52); + x1 = p1 ^ rkD; + rkE = sph_dec32le_aligned((const unsigned char *)msg + 56); + x2 = p2 ^ rkE; + rkF = sph_dec32le_aligned((const unsigned char *)msg + 60); + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC ^ sc->count0; + rk1 ^= rkD ^ SPH_T32(~sc->count1); + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 2 */ + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 = p4 ^ rk8; + x1 = p5 ^ rk9; + x2 = p6 ^ rkA; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3 */ + rk4 ^= rk1; + x0 = p0 ^ rk4; + rk5 ^= rk2; + x1 = p1 ^ rk5; + rk6 ^= rk3; + x2 = p2 ^ rk6; + rk7 ^= rk4; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 4 */ + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 = p4 ^ rk0; + x1 = p5 ^ rk1; + x2 = p6 ^ rk2; + x3 = 
p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5 ^ sc->count1; + rkA ^= rk6 ^ SPH_T32(~sc->count0); + rkB ^= rk7; + x0 ^= rk8; + x1 ^= rk9; + x2 ^= rkA; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 5 */ + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 = p0 ^ rkC; + x1 = p1 ^ rkD; + x2 = p2 ^ rkE; + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 ^= rk1; + x0 ^= rk4; + rk5 ^= rk2; + x1 ^= rk5; + rk6 ^= rk3; + x2 ^= rk6; + rk7 ^= rk4; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 6 */ + rk8 ^= rk5; + x0 = p4 ^ rk8; + rk9 ^= rk6; + x1 = p5 ^ rk9; + rkA ^= rk7; + x2 = p6 ^ rkA; + rkB ^= rk8; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 7 */ + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2 ^ sc->count1; + rk7 ^= rk3 ^ SPH_T32(~sc->count0); + x0 = p0 ^ rk4; + x1 = p1 ^ rk5; + x2 = p2 ^ rk6; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 ^= rk8; + x1 ^= rk9; + x2 ^= rkA; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 8 */ + rk0 ^= rkD; + x0 = p4 ^ rk0; + rk1 ^= rkE; + x1 = p5 ^ rk1; + rk2 ^= rkF; + x2 = p6 ^ rk2; + rk3 ^= rk0; + x3 = p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 ^= rk1; + x0 ^= rk4; + rk5 ^= rk2; + x1 ^= rk5; + rk6 ^= rk3; + x2 ^= rk6; + rk7 ^= rk4; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 9 */ + rkC ^= rk9; + x0 = p0 ^ rkC; + rkD ^= rkA; + x1 = p1 ^ rkD; + rkE ^= rkB; + x2 = p2 ^ rkE; + rkF ^= rkC; + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 10 */ + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 = p4 ^ rk8; + x1 = p5 ^ rk9; + x2 = p6 ^ rkA; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + 
KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8 ^ sc->count0; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB ^ SPH_T32(~sc->count1); + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 11 */ + rk4 ^= rk1; + x0 = p0 ^ rk4; + rk5 ^= rk2; + x1 = p1 ^ rk5; + rk6 ^= rk3; + x2 = p2 ^ rk6; + rk7 ^= rk4; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; +} + +#endif + +#if SPH_SMALL_FOOTPRINT_SHAVITE + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c512(sph_shavite_big_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 p8, p9, pA, pB, pC, pD, pE, pF; + sph_u32 rk[448]; + size_t u; + int r, s; + +#if SPH_LITTLE_ENDIAN + memcpy(rk, msg, 128); +#else + for (u = 0; u < 32; u += 4) { + rk[u + 0] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 0); + rk[u + 1] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 4); + rk[u + 2] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 8); + rk[u + 3] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 12); + } +#endif + u = 32; + for (;;) { + for (s = 0; s < 4; s ++) { + sph_u32 x0, x1, x2, x3; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 32) { + rk[ 32] ^= sc->count0; + rk[ 33] ^= sc->count1; + rk[ 34] ^= sc->count2; + rk[ 35] ^= SPH_T32(~sc->count3); + } else if (u == 440) { + rk[440] ^= sc->count1; + rk[441] ^= sc->count0; + rk[442] ^= sc->count3; + rk[443] ^= SPH_T32(~sc->count2); + } + u += 4; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 164) { + rk[164] ^= sc->count3; + rk[165] ^= sc->count2; + rk[166] ^= sc->count1; + rk[167] ^= SPH_T32(~sc->count0); + } else if (u == 316) { + rk[316] ^= sc->count2; + rk[317] ^= sc->count3; + rk[318] ^= sc->count0; + rk[319] ^= SPH_T32(~sc->count1); + } + u += 4; + } + if (u == 448) + break; + for (s = 0; s < 8; s ++) { + rk[u + 0] = rk[u - 32] ^ rk[u - 7]; + rk[u + 1] = rk[u - 31] ^ rk[u - 6]; + rk[u + 2] = rk[u - 30] ^ rk[u - 5]; + rk[u + 3] = rk[u - 29] ^ rk[u - 4]; + u += 4; + } + } + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + p8 = sc->h[0x8]; + p9 = sc->h[0x9]; + pA = sc->h[0xA]; + pB = sc->h[0xB]; + pC = sc->h[0xC]; + pD = sc->h[0xD]; + pE = sc->h[0xE]; + pF = sc->h[0xF]; + u = 0; + for (r = 0; r < 14; r ++) { +#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \ + 
sph_u32 x0, x1, x2, x3; \ + x0 = r0 ^ rk[u ++]; \ + x1 = r1 ^ rk[u ++]; \ + x2 = r2 ^ rk[u ++]; \ + x3 = r3 ^ rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + l0 ^= x0; \ + l1 ^= x1; \ + l2 ^= x2; \ + l3 ^= x3; \ + } while (0) + +#define WROT(a, b, c, d) do { \ + sph_u32 t = d; \ + d = c; \ + c = b; \ + b = a; \ + a = t; \ + } while (0) + + C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7); + C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF); + + WROT(p0, p4, p8, pC); + WROT(p1, p5, p9, pD); + WROT(p2, p6, pA, pE); + WROT(p3, p7, pB, pF); + +#undef C512_ELT +#undef WROT + } + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; + sc->h[0x8] ^= p8; + sc->h[0x9] ^= p9; + sc->h[0xA] ^= pA; + sc->h[0xB] ^= pB; + sc->h[0xC] ^= pC; + sc->h[0xD] ^= pD; + sc->h[0xE] ^= pE; + sc->h[0xF] ^= pF; +} + +#else + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c512(sph_shavite_big_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 p8, p9, pA, pB, pC, pD, pE, pF; + sph_u32 x0, x1, x2, x3; + sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + int r; + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + p8 = sc->h[0x8]; + p9 = sc->h[0x9]; + pA = sc->h[0xA]; + pB = sc->h[0xB]; + pC = sc->h[0xC]; + pD = sc->h[0xD]; + pE = sc->h[0xE]; + pF = sc->h[0xF]; + /* round 0 */ + rk00 = sph_dec32le_aligned((const unsigned char *)msg + 0); + x0 = p4 ^ rk00; + rk01 = sph_dec32le_aligned((const unsigned char *)msg + 4); + x1 = p5 ^ rk01; + rk02 = sph_dec32le_aligned((const unsigned char *)msg + 8); + x2 = p6 ^ rk02; + rk03 = sph_dec32le_aligned((const unsigned char *)msg + 12); + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 = sph_dec32le_aligned((const unsigned char *)msg + 16); + x0 ^= rk04; + rk05 = sph_dec32le_aligned((const unsigned char *)msg + 20); + x1 ^= rk05; + rk06 = sph_dec32le_aligned((const unsigned char *)msg + 24); + x2 ^= rk06; + rk07 = sph_dec32le_aligned((const unsigned char *)msg + 28); + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 = sph_dec32le_aligned((const unsigned char *)msg + 32); + x0 ^= rk08; + rk09 = sph_dec32le_aligned((const unsigned char *)msg + 36); + x1 ^= rk09; + rk0A = sph_dec32le_aligned((const unsigned char *)msg + 40); + x2 ^= rk0A; + rk0B = sph_dec32le_aligned((const unsigned char *)msg + 44); + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C = sph_dec32le_aligned((const unsigned char *)msg + 48); + x0 ^= rk0C; + rk0D = sph_dec32le_aligned((const unsigned char *)msg + 52); + x1 ^= rk0D; + rk0E = sph_dec32le_aligned((const unsigned char *)msg + 56); + x2 ^= rk0E; + rk0F = sph_dec32le_aligned((const unsigned char *)msg + 60); + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 = sph_dec32le_aligned((const unsigned char 
*)msg + 64); + x0 = pC ^ rk10; + rk11 = sph_dec32le_aligned((const unsigned char *)msg + 68); + x1 = pD ^ rk11; + rk12 = sph_dec32le_aligned((const unsigned char *)msg + 72); + x2 = pE ^ rk12; + rk13 = sph_dec32le_aligned((const unsigned char *)msg + 76); + x3 = pF ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 = sph_dec32le_aligned((const unsigned char *)msg + 80); + x0 ^= rk14; + rk15 = sph_dec32le_aligned((const unsigned char *)msg + 84); + x1 ^= rk15; + rk16 = sph_dec32le_aligned((const unsigned char *)msg + 88); + x2 ^= rk16; + rk17 = sph_dec32le_aligned((const unsigned char *)msg + 92); + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 = sph_dec32le_aligned((const unsigned char *)msg + 96); + x0 ^= rk18; + rk19 = sph_dec32le_aligned((const unsigned char *)msg + 100); + x1 ^= rk19; + rk1A = sph_dec32le_aligned((const unsigned char *)msg + 104); + x2 ^= rk1A; + rk1B = sph_dec32le_aligned((const unsigned char *)msg + 108); + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C = sph_dec32le_aligned((const unsigned char *)msg + 112); + x0 ^= rk1C; + rk1D = sph_dec32le_aligned((const unsigned char *)msg + 116); + x1 ^= rk1D; + rk1E = sph_dec32le_aligned((const unsigned char *)msg + 120); + x2 ^= rk1E; + rk1F = sph_dec32le_aligned((const unsigned char *)msg + 124); + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + for (r = 0; r < 3; r ++) { + /* round 1, 5, 9 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + if (r == 0) { + rk00 ^= sc->count0; + rk01 ^= sc->count1; + rk02 ^= sc->count2; + rk03 ^= SPH_T32(~sc->count3); + } + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + if (r == 1) { + rk04 ^= sc->count3; + rk05 ^= sc->count2; + rk06 ^= sc->count1; + rk07 ^= SPH_T32(~sc->count0); + } + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + if (r == 2) { + rk1C ^= sc->count2; + rk1D ^= sc->count3; + rk1E ^= sc->count0; + rk1F ^= SPH_T32(~sc->count1); + } + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 2, 6, 10 */ + rk00 ^= rk19; + x0 = pC ^ 
rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + 
AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + } + /* round 13 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14 ^ sc->count1; + rk19 ^= rk15 ^ sc->count0; + rk1A ^= rk16 ^ sc->count3; + rk1B ^= rk17 ^ SPH_T32(~sc->count2); + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + sc->h[0x0] ^= p8; + sc->h[0x1] ^= p9; + sc->h[0x2] ^= pA; + sc->h[0x3] ^= pB; + sc->h[0x4] ^= pC; + sc->h[0x5] ^= pD; + sc->h[0x6] ^= pE; + sc->h[0x7] ^= pF; + sc->h[0x8] ^= p0; + sc->h[0x9] ^= p1; + sc->h[0xA] ^= p2; + sc->h[0xB] ^= p3; + sc->h[0xC] ^= p4; + sc->h[0xD] ^= p5; + sc->h[0xE] ^= p6; + sc->h[0xF] ^= p7; +} + +#endif + +static void +shavite_small_init(sph_shavite_small_context *sc, const sph_u32 *iv) +{ + memcpy(sc->h, iv, sizeof sc->h); + sc->ptr = 0; + sc->count0 = 0; + sc->count1 = 0; +} + +static void +shavite_small_core(sph_shavite_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char 
*)data + clen; + ptr += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((sc->count0 = SPH_T32(sc->count0 + 512)) == 0) + sc->count1 = SPH_T32(sc->count1 + 1); + c256(sc, buf); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +shavite_small_close(sph_shavite_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char *buf; + size_t ptr, u; + unsigned z; + sph_u32 count0, count1; + + buf = sc->buf; + ptr = sc->ptr; + count0 = (sc->count0 += (ptr << 3) + n); + count1 = sc->count1; + z = 0x80 >> n; + z = ((ub & -z) | z) & 0xFF; + if (ptr == 0 && n == 0) { + buf[0] = 0x80; + memset(buf + 1, 0, 53); + sc->count0 = sc->count1 = 0; + } else if (ptr < 54) { + buf[ptr ++] = z; + memset(buf + ptr, 0, 54 - ptr); + } else { + buf[ptr ++] = z; + memset(buf + ptr, 0, 64 - ptr); + c256(sc, buf); + memset(buf, 0, 54); + sc->count0 = sc->count1 = 0; + } + sph_enc32le(buf + 54, count0); + sph_enc32le(buf + 58, count1); + buf[62] = out_size_w32 << 5; + buf[63] = out_size_w32 >> 3; + c256(sc, buf); + for (u = 0; u < out_size_w32; u ++) + sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]); +} + +static void +shavite_big_init(sph_shavite_big_context *sc, const sph_u32 *iv) +{ + memcpy(sc->h, iv, sizeof sc->h); + sc->ptr = 0; + sc->count0 = 0; + sc->count1 = 0; + sc->count2 = 0; + sc->count3 = 0; +} + +static void +shavite_big_core(sph_shavite_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) { + sc->count1 = SPH_T32(sc->count1 + 1); + if (sc->count1 == 0) { + sc->count2 = SPH_T32(sc->count2 + 1); + if (sc->count2 == 0) { + sc->count3 = SPH_T32( + sc->count3 + 1); + } + } + } + c512(sc, buf); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +shavite_big_close(sph_shavite_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char *buf; + size_t ptr, u; + unsigned z; + sph_u32 count0, count1, count2, count3; + + buf = sc->buf; + ptr = sc->ptr; + count0 = (sc->count0 += (ptr << 3) + n); + count1 = sc->count1; + count2 = sc->count2; + count3 = sc->count3; + z = 0x80 >> n; + z = ((ub & -z) | z) & 0xFF; + if (ptr == 0 && n == 0) { + buf[0] = 0x80; + memset(buf + 1, 0, 109); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } else if (ptr < 110) { + buf[ptr ++] = z; + memset(buf + ptr, 0, 110 - ptr); + } else { + buf[ptr ++] = z; + memset(buf + ptr, 0, 128 - ptr); + c512(sc, buf); + memset(buf, 0, 110); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } + sph_enc32le(buf + 110, count0); + sph_enc32le(buf + 114, count1); + sph_enc32le(buf + 118, count2); + sph_enc32le(buf + 122, count3); + buf[126] = out_size_w32 << 5; + buf[127] = out_size_w32 >> 3; + c512(sc, buf); + for (u = 0; u < out_size_w32; u ++) + sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]); +} + +/* see sph_shavite.h */ +void +sph_shavite224_init(void *cc) +{ + shavite_small_init(cc, IV224); +} + +/* see sph_shavite.h */ +void +sph_shavite224(void *cc, const void *data, size_t len) +{ + shavite_small_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite224_close(void *cc, void *dst) +{ + shavite_small_close(cc, 0, 0, dst, 7); + shavite_small_init(cc, IV224); +} 
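For reference, a minimal sketch of how the streaming wrappers in this file are typically driven — an editorial example, not part of the patch. It assumes the sph_shavite256_context type declared in sph_shavite.h; msg and digest are hypothetical buffers. Because each *_close() re-initializes the context (the shavite_small_init/shavite_big_init calls visible above), the same context can absorb the next message without an explicit re-init.

/*
 * Minimal usage sketch for the sph_shavite256 API (hypothetical caller).
 */
#include <stdio.h>
#include <string.h>
#include "sph_shavite.h"

static void
shavite256_example(void)
{
	sph_shavite256_context cc;       /* declared in sph_shavite.h */
	unsigned char digest[32];        /* 256-bit output */
	const char *msg = "abc";         /* hypothetical input */
	size_t i;

	sph_shavite256_init(&cc);
	sph_shavite256(&cc, msg, strlen(msg));  /* may be called repeatedly */
	sph_shavite256_close(&cc, digest);      /* pads, finalizes, re-inits */

	for (i = 0; i < sizeof digest; i ++)
		printf("%02x", digest[i]);
	putchar('\n');
}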
+ +/* see sph_shavite.h */ +void +sph_shavite224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_small_close(cc, ub, n, dst, 7); + shavite_small_init(cc, IV224); +} + +/* see sph_shavite.h */ +void +sph_shavite256_init(void *cc) +{ + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite256(void *cc, const void *data, size_t len) +{ + shavite_small_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite256_close(void *cc, void *dst) +{ + shavite_small_close(cc, 0, 0, dst, 8); + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_small_close(cc, ub, n, dst, 8); + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite384_init(void *cc) +{ + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite384(void *cc, const void *data, size_t len) +{ + shavite_big_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite384_close(void *cc, void *dst) +{ + shavite_big_close(cc, 0, 0, dst, 12); + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_big_close(cc, ub, n, dst, 12); + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite512_init(void *cc) +{ + shavite_big_init(cc, IV512); +} + +/* see sph_shavite.h */ +void +sph_shavite512(void *cc, const void *data, size_t len) +{ + shavite_big_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite512_close(void *cc, void *dst) +{ + shavite_big_close(cc, 0, 0, dst, 16); + shavite_big_init(cc, IV512); +} + +/* see sph_shavite.h */ +void +sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_big_close(cc, ub, n, dst, 16); + shavite_big_init(cc, IV512); +} + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sph/simd.c b/sph/simd.c new file mode 100644 index 0000000..2c80626 --- /dev/null +++ b/sph/simd.c @@ -0,0 +1,1799 @@ +/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SIMD implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_simd.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
+#define SPH_SMALL_FOOTPRINT_SIMD   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+typedef sph_u32 u32;
+typedef sph_s32 s32;
+#define C32     SPH_C32
+#define T32     SPH_T32
+#define ROL32   SPH_ROTL32
+
+#define XCAT(x, y)     XCAT_(x, y)
+#define XCAT_(x, y)    x ## y
+
+/*
+ * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
+ */
+static const s32 alpha_tab[] = {
+	  1,  41, 139,  45,  46,  87, 226,  14,  60, 147, 116, 130,
+	190,  80, 196,  69,   2,  82,  21,  90,  92, 174, 195,  28,
+	120,  37, 232,   3, 123, 160, 135, 138,   4, 164,  42, 180,
+	184,  91, 133,  56, 240,  74, 207,   6, 246,  63,  13,  19,
+	  8,  71,  84, 103, 111, 182,   9, 112, 223, 148, 157,  12,
+	235, 126,  26,  38,  16, 142, 168, 206, 222, 107,  18, 224,
+	189,  39,  57,  24, 213, 252,  52,  76,  32,  27,  79, 155,
+	187, 214,  36, 191, 121,  78, 114,  48, 169, 247, 104, 152,
+	 64,  54, 158,  53, 117, 171,  72, 125, 242, 156, 228,  96,
+	 81, 237, 208,  47, 128, 108,  59, 106, 234,  85, 144, 250,
+	227,  55, 199, 192, 162, 217, 159,  94, 256, 216, 118, 212,
+	211, 170,  31, 243, 197, 110, 141, 127,  67, 177,  61, 188,
+	255, 175, 236, 167, 165,  83,  62, 229, 137, 220,  25, 254,
+	134,  97, 122, 119, 253,  93, 215,  77,  73, 166, 124, 201,
+	 17, 183,  50, 251,  11, 194, 244, 238, 249, 186, 173, 154,
+	146,  75, 248, 145,  34, 109, 100, 245,  22, 131, 231, 219,
+	241, 115,  89,  51,  35, 150, 239,  33,  68, 218, 200, 233,
+	 44,   5, 205, 181, 225, 230, 178, 102,  70,  43, 221,  66,
+	136, 179, 143, 209,  88,  10, 153, 105, 193, 203,  99, 204,
+	140,  86, 185, 132,  15, 101,  29, 161, 176,  20,  49, 210,
+	129, 149, 198, 151,  23, 172, 113,   7,  30, 202,  58,  65,
+	 95,  40,  98, 163
+};
+
+/*
+ * Ranges:
+ *   REDS1: from -32768..98302 to -383..383
+ *   REDS2: from -2^31..2^31-1 to -32768..98302
+ */
+#define REDS1(x)    (((x) & 0xFF) - ((x) >> 8))
+#define REDS2(x)    (((x) & 0xFFFF) + ((x) >> 16))
+
+/*
+ * If, upon entry, the values of q[] are all in the -N..N range (where
+ * N >= 98302) then the new values of q[] are in the -2N..2N range.
+ *
+ * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
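+ * (Indeed, with N = 8388608 = 2^23 and alpha_tab[v] <= 256 = 2^8, the
+ * products n * alpha_tab[v] stay within the -2^31..2^31-1 input range
+ * of REDS2.)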
+ */ +#define FFT_LOOP(rb, hk, as, id) do { \ + size_t u, v; \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + (hk)]; \ + q[(rb)] = m + n; \ + q[(rb) + (hk)] = m - n; \ + u = v = 0; \ + goto id; \ + for (; u < (hk); u += 4, v += 4 * (as)) { \ + s32 t; \ + m = q[(rb) + u + 0]; \ + n = q[(rb) + u + 0 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 0 * (as)]); \ + q[(rb) + u + 0] = m + t; \ + q[(rb) + u + 0 + (hk)] = m - t; \ + id: \ + m = q[(rb) + u + 1]; \ + n = q[(rb) + u + 1 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 1 * (as)]); \ + q[(rb) + u + 1] = m + t; \ + q[(rb) + u + 1 + (hk)] = m - t; \ + m = q[(rb) + u + 2]; \ + n = q[(rb) + u + 2 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 2 * (as)]); \ + q[(rb) + u + 2] = m + t; \ + q[(rb) + u + 2 + (hk)] = m - t; \ + m = q[(rb) + u + 3]; \ + n = q[(rb) + u + 3 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 3 * (as)]); \ + q[(rb) + u + 3] = m + t; \ + q[(rb) + u + 3 + (hk)] = m - t; \ + } \ + } while (0) + +/* + * Output ranges: + * d0: min= 0 max= 1020 + * d1: min= -67 max= 4587 + * d2: min=-4335 max= 4335 + * d3: min=-4147 max= 507 + * d4: min= -510 max= 510 + * d5: min= -252 max= 4402 + * d6: min=-4335 max= 4335 + * d7: min=-4332 max= 322 + */ +#define FFT8(xb, xs, d) do { \ + s32 x0 = x[(xb)]; \ + s32 x1 = x[(xb) + (xs)]; \ + s32 x2 = x[(xb) + 2 * (xs)]; \ + s32 x3 = x[(xb) + 3 * (xs)]; \ + s32 a0 = x0 + x2; \ + s32 a1 = x0 + (x2 << 4); \ + s32 a2 = x0 - x2; \ + s32 a3 = x0 - (x2 << 4); \ + s32 b0 = x1 + x3; \ + s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \ + s32 b2 = (x1 << 4) - (x3 << 4); \ + s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \ + d ## 0 = a0 + b0; \ + d ## 1 = a1 + b1; \ + d ## 2 = a2 + b2; \ + d ## 3 = a3 + b3; \ + d ## 4 = a0 - b0; \ + d ## 5 = a1 - b1; \ + d ## 6 = a2 - b2; \ + d ## 7 = a3 - b3; \ + } while (0) + +/* + * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced + * to some shifting. 
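+ * (2^16 = 65536 = 255 * 257 + 1, so 2 is a 16th root of unity modulo
+ * 257, and multiplying by alpha^i becomes a left shift by i bits.)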
+ * + * Output: within -591471..591723 + */ +#define FFT16(xb, xs, rb) do { \ + s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \ + s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \ + FFT8(xb, (xs) << 1, d1_); \ + FFT8((xb) + (xs), (xs) << 1, d2_); \ + q[(rb) + 0] = d1_0 + d2_0; \ + q[(rb) + 1] = d1_1 + (d2_1 << 1); \ + q[(rb) + 2] = d1_2 + (d2_2 << 2); \ + q[(rb) + 3] = d1_3 + (d2_3 << 3); \ + q[(rb) + 4] = d1_4 + (d2_4 << 4); \ + q[(rb) + 5] = d1_5 + (d2_5 << 5); \ + q[(rb) + 6] = d1_6 + (d2_6 << 6); \ + q[(rb) + 7] = d1_7 + (d2_7 << 7); \ + q[(rb) + 8] = d1_0 - d2_0; \ + q[(rb) + 9] = d1_1 - (d2_1 << 1); \ + q[(rb) + 10] = d1_2 - (d2_2 << 2); \ + q[(rb) + 11] = d1_3 - (d2_3 << 3); \ + q[(rb) + 12] = d1_4 - (d2_4 << 4); \ + q[(rb) + 13] = d1_5 - (d2_5 << 5); \ + q[(rb) + 14] = d1_6 - (d2_6 << 6); \ + q[(rb) + 15] = d1_7 - (d2_7 << 7); \ + } while (0) + +/* + * Output range: |q| <= 1183446 + */ +#define FFT32(xb, xs, rb, id) do { \ + FFT16(xb, (xs) << 1, rb); \ + FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \ + FFT_LOOP(rb, 16, 8, id); \ + } while (0) + +/* + * Output range: |q| <= 2366892 + */ +#define FFT64(xb, xs, rb, id) do { \ + FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \ + FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \ + FFT_LOOP(rb, 32, 4, id); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SIMD + +static void +fft32(unsigned char *x, size_t xs, s32 *q) +{ + size_t xd; + + xd = xs << 1; + FFT16(0, xd, 0); + FFT16(xs, xd, 16); + FFT_LOOP(0, 16, 8, label_); +} + +#define FFT128(xb, xs, rb, id) do { \ + fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \ + fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \ + FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \ + fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \ + fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \ + FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \ + FFT_LOOP(rb, 64, 2, XCAT(id, a)); \ + } while (0) + +#else + +/* + * Output range: |q| <= 4733784 + */ +#define FFT128(xb, xs, rb, id) do { \ + FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \ + FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \ + FFT_LOOP(rb, 64, 2, id); \ + } while (0) + +#endif + +/* + * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression + * function which does not fit in the 32 kB L1 cache of a typical x86 + * Intel. We therefore add a function call layer at the FFT64 level. 
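+ * (The SPH_SMALL_FOOTPRINT_SIMD build above makes the same trade one
+ * level lower, wrapping the FFT32 step in the fft32() function.)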
+ */ + +static void +fft64(unsigned char *x, size_t xs, s32 *q) +{ + size_t xd; + + xd = xs << 1; + FFT32(0, xd, 0, label_a); + FFT32(xs, xd, 32, label_b); + FFT_LOOP(0, 32, 4, label_); +} + +/* + * Output range: |q| <= 9467568 + */ +#define FFT256(xb, xs, rb, id) do { \ + fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \ + fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 64]); \ + FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \ + fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \ + fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \ + FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \ + FFT_LOOP(rb, 128, 1, XCAT(id, a)); \ + } while (0) + +/* + * alpha^(127*i) mod 257 + */ +static const unsigned short yoff_s_n[] = { + 1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29, + 15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178, + 225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100, + 34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215, + 253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141, + 197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59, + 128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114, + 121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168, + 16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207, + 240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21, + 2, 196, 190, 116, 60, 226, 46, 139 +}; + +/* + * alpha^(127*i) + alpha^(125*i) mod 257 + */ +static const unsigned short yoff_s_f[] = { + 2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3, + 49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65, + 96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113, + 17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143, + 189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6, + 77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95, + 160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53, + 181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150, + 0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109, + 210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30, + 10, 146, 117, 251, 180, 247, 236, 108 +}; + +/* + * beta^(255*i) mod 257 + */ +static const unsigned short yoff_b_n[] = { + 1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172, + 23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101, + 15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10, + 88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230, + 225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150, + 35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109, + 34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194, + 11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93, + 253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83, + 165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110, + 197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217, + 162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108, + 128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171, + 117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78, + 121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252, + 213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142, + 16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182, + 111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74, + 240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160, + 123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82, + 2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87, + 46, 45, 139, 41 +}; + +/* + * beta^(255*i) + beta^(253*i) mod 257 + */ +static const unsigned short yoff_b_f[] = { + 2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20, + 111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89, + 
49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239, + 253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79, + 96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226, + 248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115, + 17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208, + 57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40, + 189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45, + 187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107, + 77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210, + 139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6, + 160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190, + 106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208, + 181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127, + 96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193, + 0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44, + 245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9, + 210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94, + 53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185, + 10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156, + 236, 192, 108, 86 +}; + +#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \ + + ((u32)((h) * (mm)) << 16)) + +#define W_SMALL(sb, o1, o2, mm) \ + (INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm) + +#define WS_0_0 W_SMALL( 4, 0, 1, 185) +#define WS_0_1 W_SMALL( 6, 0, 1, 185) +#define WS_0_2 W_SMALL( 0, 0, 1, 185) +#define WS_0_3 W_SMALL( 2, 0, 1, 185) +#define WS_0_4 W_SMALL( 7, 0, 1, 185) +#define WS_0_5 W_SMALL( 5, 0, 1, 185) +#define WS_0_6 W_SMALL( 3, 0, 1, 185) +#define WS_0_7 W_SMALL( 1, 0, 1, 185) +#define WS_1_0 W_SMALL(15, 0, 1, 185) +#define WS_1_1 W_SMALL(11, 0, 1, 185) +#define WS_1_2 W_SMALL(12, 0, 1, 185) +#define WS_1_3 W_SMALL( 8, 0, 1, 185) +#define WS_1_4 W_SMALL( 9, 0, 1, 185) +#define WS_1_5 W_SMALL(13, 0, 1, 185) +#define WS_1_6 W_SMALL(10, 0, 1, 185) +#define WS_1_7 W_SMALL(14, 0, 1, 185) +#define WS_2_0 W_SMALL(17, -128, -64, 233) +#define WS_2_1 W_SMALL(18, -128, -64, 233) +#define WS_2_2 W_SMALL(23, -128, -64, 233) +#define WS_2_3 W_SMALL(20, -128, -64, 233) +#define WS_2_4 W_SMALL(22, -128, -64, 233) +#define WS_2_5 W_SMALL(21, -128, -64, 233) +#define WS_2_6 W_SMALL(16, -128, -64, 233) +#define WS_2_7 W_SMALL(19, -128, -64, 233) +#define WS_3_0 W_SMALL(30, -191, -127, 233) +#define WS_3_1 W_SMALL(24, -191, -127, 233) +#define WS_3_2 W_SMALL(25, -191, -127, 233) +#define WS_3_3 W_SMALL(31, -191, -127, 233) +#define WS_3_4 W_SMALL(27, -191, -127, 233) +#define WS_3_5 W_SMALL(29, -191, -127, 233) +#define WS_3_6 W_SMALL(28, -191, -127, 233) +#define WS_3_7 W_SMALL(26, -191, -127, 233) + +#define W_BIG(sb, o1, o2, mm) \ + (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm) + +#define WB_0_0 W_BIG( 4, 0, 1, 185) +#define WB_0_1 W_BIG( 6, 0, 1, 185) +#define WB_0_2 W_BIG( 0, 0, 1, 185) +#define WB_0_3 
W_BIG( 2, 0, 1, 185) +#define WB_0_4 W_BIG( 7, 0, 1, 185) +#define WB_0_5 W_BIG( 5, 0, 1, 185) +#define WB_0_6 W_BIG( 3, 0, 1, 185) +#define WB_0_7 W_BIG( 1, 0, 1, 185) +#define WB_1_0 W_BIG(15, 0, 1, 185) +#define WB_1_1 W_BIG(11, 0, 1, 185) +#define WB_1_2 W_BIG(12, 0, 1, 185) +#define WB_1_3 W_BIG( 8, 0, 1, 185) +#define WB_1_4 W_BIG( 9, 0, 1, 185) +#define WB_1_5 W_BIG(13, 0, 1, 185) +#define WB_1_6 W_BIG(10, 0, 1, 185) +#define WB_1_7 W_BIG(14, 0, 1, 185) +#define WB_2_0 W_BIG(17, -256, -128, 233) +#define WB_2_1 W_BIG(18, -256, -128, 233) +#define WB_2_2 W_BIG(23, -256, -128, 233) +#define WB_2_3 W_BIG(20, -256, -128, 233) +#define WB_2_4 W_BIG(22, -256, -128, 233) +#define WB_2_5 W_BIG(21, -256, -128, 233) +#define WB_2_6 W_BIG(16, -256, -128, 233) +#define WB_2_7 W_BIG(19, -256, -128, 233) +#define WB_3_0 W_BIG(30, -383, -255, 233) +#define WB_3_1 W_BIG(24, -383, -255, 233) +#define WB_3_2 W_BIG(25, -383, -255, 233) +#define WB_3_3 W_BIG(31, -383, -255, 233) +#define WB_3_4 W_BIG(27, -383, -255, 233) +#define WB_3_5 W_BIG(29, -383, -255, 233) +#define WB_3_6 W_BIG(28, -383, -255, 233) +#define WB_3_7 W_BIG(26, -383, -255, 233) + +#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) + +#define PP4_0_0 1 +#define PP4_0_1 0 +#define PP4_0_2 3 +#define PP4_0_3 2 +#define PP4_1_0 2 +#define PP4_1_1 3 +#define PP4_1_2 0 +#define PP4_1_3 1 +#define PP4_2_0 3 +#define PP4_2_1 2 +#define PP4_2_2 1 +#define PP4_2_3 0 + +#define PP8_0_0 1 +#define PP8_0_1 0 +#define PP8_0_2 3 +#define PP8_0_3 2 +#define PP8_0_4 5 +#define PP8_0_5 4 +#define PP8_0_6 7 +#define PP8_0_7 6 + +#define PP8_1_0 6 +#define PP8_1_1 7 +#define PP8_1_2 4 +#define PP8_1_3 5 +#define PP8_1_4 2 +#define PP8_1_5 3 +#define PP8_1_6 0 +#define PP8_1_7 1 + +#define PP8_2_0 2 +#define PP8_2_1 3 +#define PP8_2_2 0 +#define PP8_2_3 1 +#define PP8_2_4 6 +#define PP8_2_5 7 +#define PP8_2_6 4 +#define PP8_2_7 5 + +#define PP8_3_0 3 +#define PP8_3_1 2 +#define PP8_3_2 1 +#define PP8_3_3 0 +#define PP8_3_4 7 +#define PP8_3_5 6 +#define PP8_3_6 5 +#define PP8_3_7 4 + +#define PP8_4_0 5 +#define PP8_4_1 4 +#define PP8_4_2 7 +#define PP8_4_3 6 +#define PP8_4_4 1 +#define PP8_4_5 0 +#define PP8_4_6 3 +#define PP8_4_7 2 + +#define PP8_5_0 7 +#define PP8_5_1 6 +#define PP8_5_2 5 +#define PP8_5_3 4 +#define PP8_5_4 3 +#define PP8_5_5 2 +#define PP8_5_6 1 +#define PP8_5_7 0 + +#define PP8_6_0 4 +#define PP8_6_1 5 +#define PP8_6_2 6 +#define PP8_6_3 7 +#define PP8_6_4 0 +#define PP8_6_5 1 +#define PP8_6_6 2 +#define PP8_6_7 3 + +#if SPH_SIMD_NOCOPY + +#define DECL_STATE_SMALL +#define READ_STATE_SMALL(sc) +#define WRITE_STATE_SMALL(sc) +#define DECL_STATE_BIG +#define READ_STATE_BIG(sc) +#define WRITE_STATE_BIG(sc) + +#else + +#define DECL_STATE_SMALL \ + u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3; + +#define READ_STATE_SMALL(sc) do { \ + A0 = (sc)->state[ 0]; \ + A1 = (sc)->state[ 1]; \ + A2 = (sc)->state[ 2]; \ + A3 = (sc)->state[ 3]; \ + B0 = (sc)->state[ 4]; \ + B1 = (sc)->state[ 5]; \ + B2 = (sc)->state[ 6]; \ + B3 = (sc)->state[ 7]; \ + C0 = (sc)->state[ 8]; \ + C1 = (sc)->state[ 9]; \ + C2 = (sc)->state[10]; \ + C3 = (sc)->state[11]; \ + D0 = (sc)->state[12]; \ + D1 = (sc)->state[13]; \ + D2 = (sc)->state[14]; \ + D3 = (sc)->state[15]; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + (sc)->state[ 0] = A0; \ + (sc)->state[ 1] = A1; \ + (sc)->state[ 2] = A2; \ + (sc)->state[ 3] = A3; \ + (sc)->state[ 4] = B0; \ + (sc)->state[ 5] = B1; \ + (sc)->state[ 6] 
= B2; \ + (sc)->state[ 7] = B3; \ + (sc)->state[ 8] = C0; \ + (sc)->state[ 9] = C1; \ + (sc)->state[10] = C2; \ + (sc)->state[11] = C3; \ + (sc)->state[12] = D0; \ + (sc)->state[13] = D1; \ + (sc)->state[14] = D2; \ + (sc)->state[15] = D3; \ + } while (0) + +#define DECL_STATE_BIG \ + u32 A0, A1, A2, A3, A4, A5, A6, A7; \ + u32 B0, B1, B2, B3, B4, B5, B6, B7; \ + u32 C0, C1, C2, C3, C4, C5, C6, C7; \ + u32 D0, D1, D2, D3, D4, D5, D6, D7; + +#define READ_STATE_BIG(sc) do { \ + A0 = (sc)->state[ 0]; \ + A1 = (sc)->state[ 1]; \ + A2 = (sc)->state[ 2]; \ + A3 = (sc)->state[ 3]; \ + A4 = (sc)->state[ 4]; \ + A5 = (sc)->state[ 5]; \ + A6 = (sc)->state[ 6]; \ + A7 = (sc)->state[ 7]; \ + B0 = (sc)->state[ 8]; \ + B1 = (sc)->state[ 9]; \ + B2 = (sc)->state[10]; \ + B3 = (sc)->state[11]; \ + B4 = (sc)->state[12]; \ + B5 = (sc)->state[13]; \ + B6 = (sc)->state[14]; \ + B7 = (sc)->state[15]; \ + C0 = (sc)->state[16]; \ + C1 = (sc)->state[17]; \ + C2 = (sc)->state[18]; \ + C3 = (sc)->state[19]; \ + C4 = (sc)->state[20]; \ + C5 = (sc)->state[21]; \ + C6 = (sc)->state[22]; \ + C7 = (sc)->state[23]; \ + D0 = (sc)->state[24]; \ + D1 = (sc)->state[25]; \ + D2 = (sc)->state[26]; \ + D3 = (sc)->state[27]; \ + D4 = (sc)->state[28]; \ + D5 = (sc)->state[29]; \ + D6 = (sc)->state[30]; \ + D7 = (sc)->state[31]; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->state[ 0] = A0; \ + (sc)->state[ 1] = A1; \ + (sc)->state[ 2] = A2; \ + (sc)->state[ 3] = A3; \ + (sc)->state[ 4] = A4; \ + (sc)->state[ 5] = A5; \ + (sc)->state[ 6] = A6; \ + (sc)->state[ 7] = A7; \ + (sc)->state[ 8] = B0; \ + (sc)->state[ 9] = B1; \ + (sc)->state[10] = B2; \ + (sc)->state[11] = B3; \ + (sc)->state[12] = B4; \ + (sc)->state[13] = B5; \ + (sc)->state[14] = B6; \ + (sc)->state[15] = B7; \ + (sc)->state[16] = C0; \ + (sc)->state[17] = C1; \ + (sc)->state[18] = C2; \ + (sc)->state[19] = C3; \ + (sc)->state[20] = C4; \ + (sc)->state[21] = C5; \ + (sc)->state[22] = C6; \ + (sc)->state[23] = C7; \ + (sc)->state[24] = D0; \ + (sc)->state[25] = D1; \ + (sc)->state[26] = D2; \ + (sc)->state[27] = D3; \ + (sc)->state[28] = D4; \ + (sc)->state[29] = D5; \ + (sc)->state[30] = D6; \ + (sc)->state[31] = D7; \ + } while (0) + +#endif + +#define STEP_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA ## n; \ + } while (0) + +#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + STEP_ELT(0, w0, fun, s, pp4b); \ + STEP_ELT(1, w1, fun, s, pp4b); \ + STEP_ELT(2, w2, fun, s, pp4b); \ + STEP_ELT(3, w3, fun, s, pp4b); \ + } while (0) + +#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + u32 tA4 = ROL32(A4, r); \ + u32 tA5 = ROL32(A5, r); \ + u32 tA6 = ROL32(A6, r); \ + u32 tA7 = ROL32(A7, r); \ + STEP_ELT(0, w0, fun, s, pp8b); \ + STEP_ELT(1, w1, fun, s, pp8b); \ + STEP_ELT(2, w2, fun, s, pp8b); \ + STEP_ELT(3, w3, fun, s, pp8b); \ + STEP_ELT(4, w4, fun, s, pp8b); \ + STEP_ELT(5, w5, fun, s, pp8b); \ + STEP_ELT(6, w6, fun, s, pp8b); \ + STEP_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +#define M3_0_0 0_ +#define M3_1_0 1_ +#define M3_2_0 2_ +#define M3_3_0 0_ +#define M3_4_0 1_ +#define M3_5_0 2_ +#define M3_6_0 0_ +#define M3_7_0 1_ + +#define 
M3_0_1 1_ +#define M3_1_1 2_ +#define M3_2_1 0_ +#define M3_3_1 1_ +#define M3_4_1 2_ +#define M3_5_1 0_ +#define M3_6_1 1_ +#define M3_7_1 2_ + +#define M3_0_2 2_ +#define M3_1_2 0_ +#define M3_2_2 1_ +#define M3_3_2 2_ +#define M3_4_2 0_ +#define M3_5_2 1_ +#define M3_6_2 2_ +#define M3_7_2 0_ + +#define STEP_SMALL_(w, fun, r, s, pp4b) STEP_SMALL w, fun, r, s, pp4b) + +#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3) do { \ + STEP_SMALL_(WS_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP4_, M3_0_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP4_, M3_1_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP4_, M3_2_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP4_, M3_3_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \ + } while (0) + +#define M7_0_0 0_ +#define M7_1_0 1_ +#define M7_2_0 2_ +#define M7_3_0 3_ +#define M7_4_0 4_ +#define M7_5_0 5_ +#define M7_6_0 6_ +#define M7_7_0 0_ + +#define M7_0_1 1_ +#define M7_1_1 2_ +#define M7_2_1 3_ +#define M7_3_1 4_ +#define M7_4_1 5_ +#define M7_5_1 6_ +#define M7_6_1 0_ +#define M7_7_1 1_ + +#define M7_0_2 2_ +#define M7_1_2 3_ +#define M7_2_2 4_ +#define M7_3_2 5_ +#define M7_4_2 6_ +#define M7_5_2 0_ +#define M7_6_2 1_ +#define M7_7_2 2_ + +#define M7_0_3 3_ +#define M7_1_3 4_ +#define M7_2_3 5_ +#define M7_3_3 6_ +#define M7_4_3 0_ +#define M7_5_3 1_ +#define M7_6_3 2_ +#define M7_7_3 3_ + +#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b) + +#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \ + STEP_BIG_(WB_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SIMD + +#define A0 state[ 0] +#define A1 state[ 1] +#define A2 state[ 2] +#define A3 state[ 3] +#define B0 state[ 4] +#define B1 state[ 5] +#define B2 state[ 6] +#define B3 state[ 7] +#define C0 state[ 8] +#define C1 state[ 9] +#define C2 state[10] +#define C3 state[11] +#define D0 state[12] +#define D1 state[13] +#define D2 state[14] +#define D3 state[15] + +#define STEP2_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA[n]; \ + } while (0) + +#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \ + u32 tA[4]; \ + tA[0] = ROL32(A0, r); \ + tA[1] = ROL32(A1, r); \ + tA[2] = ROL32(A2, r); \ + tA[3] = ROL32(A3, r); \ + STEP2_ELT(0, w0, fun, s, pp4b); \ + STEP2_ELT(1, w1, fun, s, pp4b); \ + STEP2_ELT(2, w2, fun, s, pp4b); \ + STEP2_ELT(3, w3, fun, s, pp4b); \ + } while (0) + +static void +one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) +{ + static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 }; + + STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 
3], IF, p0, p1, pp4k[isp + 0]); + STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]); + STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]); + STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]); + STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]); + STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]); + STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]); + STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]); +} + +static void +compress_small(sph_simd_small_context *sc, int last) +{ + unsigned char *x; + s32 q[128]; + int i; + u32 w[32]; + u32 state[16]; + size_t u; + + static const size_t wsp[32] = { + 4 << 3, 6 << 3, 0 << 3, 2 << 3, + 7 << 3, 5 << 3, 3 << 3, 1 << 3, + 15 << 3, 11 << 3, 12 << 3, 8 << 3, + 9 << 3, 13 << 3, 10 << 3, 14 << 3, + 17 << 3, 18 << 3, 23 << 3, 20 << 3, + 22 << 3, 21 << 3, 16 << 3, 19 << 3, + 30 << 3, 24 << 3, 25 << 3, 31 << 3, + 27 << 3, 29 << 3, 28 << 3, 26 << 3 + }; + + x = sc->buf; + FFT128(0, 1, 0, ll); + if (last) { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + + for (i = 0; i < 16; i += 4) { + state[i + 0] = sc->state[i + 0] + ^ sph_dec32le_aligned(x + 4 * (i + 0)); + state[i + 1] = sc->state[i + 1] + ^ sph_dec32le_aligned(x + 4 * (i + 1)); + state[i + 2] = sc->state[i + 2] + ^ sph_dec32le_aligned(x + 4 * (i + 2)); + state[i + 3] = sc->state[i + 3] + ^ sph_dec32le_aligned(x + 4 * (i + 3)); + } + +#define WSREAD(sb, o1, o2, mm) do { \ + for (u = 0; u < 32; u += 4) { \ + size_t v = wsp[(u >> 2) + (sb)]; \ + w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ + q[v + 2 * 0 + (o2)], mm); \ + w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ + q[v + 2 * 1 + (o2)], mm); \ + w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ + q[v + 2 * 2 + (o2)], mm); \ + w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ + q[v + 2 * 3 + (o2)], mm); \ + } \ + } while (0) + + WSREAD( 0, 0, 1, 185); + one_round_small(state, w, 0, 3, 23, 17, 27); + WSREAD( 8, 0, 1, 185); + one_round_small(state, w, 2, 28, 19, 22, 7); + WSREAD(16, -128, -64, 233); + one_round_small(state, w, 1, 29, 9, 15, 5); + WSREAD(24, -191, -127, 233); + one_round_small(state, w, 0, 4, 13, 10, 25); + +#undef WSREAD + + STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 25, 4, PP4_2_); + + memcpy(sc->state, state, sizeof state); +} + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef D0 +#undef D1 +#undef D2 +#undef D3 + +#else + +#if SPH_SIMD_NOCOPY +#define A0 (sc->state[ 0]) +#define A1 (sc->state[ 1]) +#define A2 (sc->state[ 2]) +#define A3 (sc->state[ 3]) +#define B0 (sc->state[ 4]) +#define B1 (sc->state[ 5]) +#define B2 (sc->state[ 6]) +#define B3 (sc->state[ 7]) +#define C0 (sc->state[ 8]) +#define C1 (sc->state[ 9]) +#define C2 (sc->state[10]) +#define C3 (sc->state[11]) +#define D0 (sc->state[12]) +#define D1 (sc->state[13]) +#define D2 
(sc->state[14]) +#define D3 (sc->state[15]) +#endif + +static void +compress_small(sph_simd_small_context *sc, int last) +{ + unsigned char *x; + s32 q[128]; + int i; + DECL_STATE_SMALL +#if SPH_SIMD_NOCOPY + sph_u32 saved[16]; +#endif + +#if SPH_SIMD_NOCOPY + memcpy(saved, sc->state, sizeof saved); +#endif + x = sc->buf; + FFT128(0, 1, 0, ll); + if (last) { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + READ_STATE_SMALL(sc); + A0 ^= sph_dec32le_aligned(x + 0); + A1 ^= sph_dec32le_aligned(x + 4); + A2 ^= sph_dec32le_aligned(x + 8); + A3 ^= sph_dec32le_aligned(x + 12); + B0 ^= sph_dec32le_aligned(x + 16); + B1 ^= sph_dec32le_aligned(x + 20); + B2 ^= sph_dec32le_aligned(x + 24); + B3 ^= sph_dec32le_aligned(x + 28); + C0 ^= sph_dec32le_aligned(x + 32); + C1 ^= sph_dec32le_aligned(x + 36); + C2 ^= sph_dec32le_aligned(x + 40); + C3 ^= sph_dec32le_aligned(x + 44); + D0 ^= sph_dec32le_aligned(x + 48); + D1 ^= sph_dec32le_aligned(x + 52); + D2 ^= sph_dec32le_aligned(x + 56); + D3 ^= sph_dec32le_aligned(x + 60); + ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27); + ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7); + ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5); + ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25); +#if SPH_SIMD_NOCOPY + STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(saved[12], saved[13], saved[14], saved[15], + IF, 25, 4, PP4_2_); +#else + STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 25, 4, PP4_2_); + WRITE_STATE_SMALL(sc); +#endif +} + +#if SPH_SIMD_NOCOPY +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#endif + +#endif + +#if SPH_SMALL_FOOTPRINT_SIMD + +#define A0 state[ 0] +#define A1 state[ 1] +#define A2 state[ 2] +#define A3 state[ 3] +#define A4 state[ 4] +#define A5 state[ 5] +#define A6 state[ 6] +#define A7 state[ 7] +#define B0 state[ 8] +#define B1 state[ 9] +#define B2 state[10] +#define B3 state[11] +#define B4 state[12] +#define B5 state[13] +#define B6 state[14] +#define B7 state[15] +#define C0 state[16] +#define C1 state[17] +#define C2 state[18] +#define C3 state[19] +#define C4 state[20] +#define C5 state[21] +#define C6 state[22] +#define C7 state[23] +#define D0 state[24] +#define D1 state[25] +#define D2 state[26] +#define D3 state[27] +#define D4 state[28] +#define D5 state[29] +#define D6 state[30] +#define D7 state[31] + +/* + * Not needed -- already defined for SIMD-224 / SIMD-256 + * +#define STEP2_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA[n]; \ + } while (0) + */ + +#define STEP2_BIG(w0, w1, w2, w3, w4, w5, 
w6, w7, fun, r, s, pp8b) do { \ + u32 tA[8]; \ + tA[0] = ROL32(A0, r); \ + tA[1] = ROL32(A1, r); \ + tA[2] = ROL32(A2, r); \ + tA[3] = ROL32(A3, r); \ + tA[4] = ROL32(A4, r); \ + tA[5] = ROL32(A5, r); \ + tA[6] = ROL32(A6, r); \ + tA[7] = ROL32(A7, r); \ + STEP2_ELT(0, w0, fun, s, pp8b); \ + STEP2_ELT(1, w1, fun, s, pp8b); \ + STEP2_ELT(2, w2, fun, s, pp8b); \ + STEP2_ELT(3, w3, fun, s, pp8b); \ + STEP2_ELT(4, w4, fun, s, pp8b); \ + STEP2_ELT(5, w5, fun, s, pp8b); \ + STEP2_ELT(6, w6, fun, s, pp8b); \ + STEP2_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +static void +one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) +{ + static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 }; + + STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7], + IF, p0, p1, pp8k[isp + 0]); + STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15], + IF, p1, p2, pp8k[isp + 1]); + STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23], + IF, p2, p3, pp8k[isp + 2]); + STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31], + IF, p3, p0, pp8k[isp + 3]); + STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39], + MAJ, p0, p1, pp8k[isp + 4]); + STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47], + MAJ, p1, p2, pp8k[isp + 5]); + STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55], + MAJ, p2, p3, pp8k[isp + 6]); + STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63], + MAJ, p3, p0, pp8k[isp + 7]); +} + +static void +compress_big(sph_simd_big_context *sc, int last) +{ + unsigned char *x; + s32 q[256]; + int i; + u32 w[64]; + u32 state[32]; + size_t u; + + static const size_t wbp[32] = { + 4 << 4, 6 << 4, 0 << 4, 2 << 4, + 7 << 4, 5 << 4, 3 << 4, 1 << 4, + 15 << 4, 11 << 4, 12 << 4, 8 << 4, + 9 << 4, 13 << 4, 10 << 4, 14 << 4, + 17 << 4, 18 << 4, 23 << 4, 20 << 4, + 22 << 4, 21 << 4, 16 << 4, 19 << 4, + 30 << 4, 24 << 4, 25 << 4, 31 << 4, + 27 << 4, 29 << 4, 28 << 4, 26 << 4 + }; + + x = sc->buf; + FFT256(0, 1, 0, ll); + if (last) { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); + } + } + + for (i = 0; i < 32; i += 8) { + state[i + 0] = sc->state[i + 0] + ^ sph_dec32le_aligned(x + 4 * (i + 0)); + state[i + 1] = sc->state[i + 1] + ^ sph_dec32le_aligned(x + 4 * (i + 1)); + state[i + 2] = sc->state[i + 2] + ^ sph_dec32le_aligned(x + 4 * (i + 2)); + state[i + 3] = sc->state[i + 3] + ^ sph_dec32le_aligned(x + 4 * (i + 3)); + state[i + 4] = sc->state[i + 4] + ^ sph_dec32le_aligned(x + 4 * (i + 4)); + state[i + 5] = sc->state[i + 5] + ^ sph_dec32le_aligned(x + 4 * (i + 5)); + state[i + 6] = sc->state[i + 6] + ^ sph_dec32le_aligned(x + 4 * (i + 6)); + state[i + 7] = sc->state[i + 7] + ^ sph_dec32le_aligned(x + 4 * (i + 7)); + } + +#define WBREAD(sb, o1, o2, mm) do { \ + for (u = 0; u < 64; u += 8) { \ + size_t v = wbp[(u >> 3) + (sb)]; \ + w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ + q[v + 2 * 0 + (o2)], mm); \ + w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ + q[v + 2 * 1 + (o2)], mm); \ + w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ + q[v + 2 * 2 + (o2)], mm); \ + w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ + q[v + 2 * 3 + (o2)], mm); \ + w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \ + q[v + 2 * 4 + (o2)], mm); \ + w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \ + q[v + 2 * 5 + (o2)], mm); \ + w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \ + q[v + 2 * 6 + (o2)], mm); \ + w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \ + q[v + 2 * 7 + (o2)], mm); \ + } \ + } while (0) + + WBREAD( 0, 0, 1, 185); + one_round_big(state, w, 0, 3, 23, 17, 27); + WBREAD( 8, 0, 1, 185); + one_round_big(state, w, 1, 28, 19, 22, 7); + WBREAD(16, -256, -128, 233); + one_round_big(state, w, 2, 29, 9, 15, 5); + WBREAD(24, -383, -255, 233); + one_round_big(state, w, 3, 4, 13, 10, 25); + +#undef WBREAD + + STEP_BIG( + sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + sc->state[16], sc->state[17], sc->state[18], sc->state[19], + sc->state[20], sc->state[21], sc->state[22], sc->state[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + sc->state[24], sc->state[25], sc->state[26], sc->state[27], + sc->state[28], sc->state[29], sc->state[30], sc->state[31], + IF, 25, 4, PP8_0_); + + memcpy(sc->state, state, sizeof state); +} + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#undef D4 +#undef D5 +#undef D6 +#undef D7 + +#else + +#if SPH_SIMD_NOCOPY +#define A0 (sc->state[ 0]) +#define A1 (sc->state[ 1]) +#define A2 (sc->state[ 2]) +#define A3 (sc->state[ 3]) +#define A4 (sc->state[ 4]) +#define A5 (sc->state[ 5]) +#define A6 (sc->state[ 6]) +#define A7 (sc->state[ 7]) +#define B0 (sc->state[ 8]) +#define B1 (sc->state[ 9]) +#define B2 (sc->state[10]) +#define B3 (sc->state[11]) +#define B4 (sc->state[12]) +#define B5 (sc->state[13]) +#define B6 (sc->state[14]) +#define B7 (sc->state[15]) +#define C0 (sc->state[16]) +#define C1 (sc->state[17]) +#define C2 (sc->state[18]) +#define C3 (sc->state[19]) +#define C4 (sc->state[20]) +#define C5 (sc->state[21]) +#define C6 (sc->state[22]) +#define C7 (sc->state[23]) +#define D0 (sc->state[24]) +#define D1 (sc->state[25]) +#define D2 (sc->state[26]) +#define D3 (sc->state[27]) +#define D4 
(sc->state[28]) +#define D5 (sc->state[29]) +#define D6 (sc->state[30]) +#define D7 (sc->state[31]) +#endif + +static void +compress_big(sph_simd_big_context *sc, int last) +{ + unsigned char *x; + s32 q[256]; + int i; + DECL_STATE_BIG +#if SPH_SIMD_NOCOPY + sph_u32 saved[32]; +#endif + +#if SPH_SIMD_NOCOPY + memcpy(saved, sc->state, sizeof saved); +#endif + + x = sc->buf; + FFT256(0, 1, 0, ll); + if (last) { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + READ_STATE_BIG(sc); + A0 ^= sph_dec32le_aligned(x + 0); + A1 ^= sph_dec32le_aligned(x + 4); + A2 ^= sph_dec32le_aligned(x + 8); + A3 ^= sph_dec32le_aligned(x + 12); + A4 ^= sph_dec32le_aligned(x + 16); + A5 ^= sph_dec32le_aligned(x + 20); + A6 ^= sph_dec32le_aligned(x + 24); + A7 ^= sph_dec32le_aligned(x + 28); + B0 ^= sph_dec32le_aligned(x + 32); + B1 ^= sph_dec32le_aligned(x + 36); + B2 ^= sph_dec32le_aligned(x + 40); + B3 ^= sph_dec32le_aligned(x + 44); + B4 ^= sph_dec32le_aligned(x + 48); + B5 ^= sph_dec32le_aligned(x + 52); + B6 ^= sph_dec32le_aligned(x + 56); + B7 ^= sph_dec32le_aligned(x + 60); + C0 ^= sph_dec32le_aligned(x + 64); + C1 ^= sph_dec32le_aligned(x + 68); + C2 ^= sph_dec32le_aligned(x + 72); + C3 ^= sph_dec32le_aligned(x + 76); + C4 ^= sph_dec32le_aligned(x + 80); + C5 ^= sph_dec32le_aligned(x + 84); + C6 ^= sph_dec32le_aligned(x + 88); + C7 ^= sph_dec32le_aligned(x + 92); + D0 ^= sph_dec32le_aligned(x + 96); + D1 ^= sph_dec32le_aligned(x + 100); + D2 ^= sph_dec32le_aligned(x + 104); + D3 ^= sph_dec32le_aligned(x + 108); + D4 ^= sph_dec32le_aligned(x + 112); + D5 ^= sph_dec32le_aligned(x + 116); + D6 ^= sph_dec32le_aligned(x + 120); + D7 ^= sph_dec32le_aligned(x + 124); + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); +#if SPH_SIMD_NOCOPY + STEP_BIG( + saved[ 0], saved[ 1], saved[ 2], saved[ 3], + saved[ 4], saved[ 5], saved[ 6], saved[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + saved[ 8], saved[ 9], saved[10], saved[11], + saved[12], saved[13], saved[14], saved[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + saved[16], saved[17], saved[18], saved[19], + saved[20], saved[21], saved[22], saved[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + saved[24], saved[25], saved[26], saved[27], + saved[28], saved[29], saved[30], saved[31], + IF, 25, 4, PP8_0_); +#else + STEP_BIG( + sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + sc->state[16], sc->state[17], sc->state[18], sc->state[19], + sc->state[20], sc->state[21], sc->state[22], sc->state[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + sc->state[24], sc->state[25], sc->state[26], sc->state[27], + sc->state[28], sc->state[29], sc->state[30], sc->state[31], + IF, 25, 4, PP8_0_); + WRITE_STATE_BIG(sc); +#endif +} + +#if SPH_SIMD_NOCOPY +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 +#undef C0 +#undef C1 
+#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#undef D4 +#undef D5 +#undef D6 +#undef D7 +#endif + +#endif + +static const u32 IV224[] = { + C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53), + C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96), + C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6), + C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8) +}; + +static const u32 IV256[] = { + C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9), + C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3), + C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9), + C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1) +}; + +static const u32 IV384[] = { + C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B), + C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1), + C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A), + C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8), + C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2), + C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462), + C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5), + C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71) +}; + +static const u32 IV512[] = { + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22) +}; + +static void +init_small(void *cc, const u32 *iv) +{ + sph_simd_small_context *sc; + + sc = cc; + memcpy(sc->state, iv, sizeof sc->state); + sc->count_low = sc->count_high = 0; + sc->ptr = 0; +} + +static void +init_big(void *cc, const u32 *iv) +{ + sph_simd_big_context *sc; + + sc = cc; + memcpy(sc->state, iv, sizeof sc->state); + sc->count_low = sc->count_high = 0; + sc->ptr = 0; +} + +static void +update_small(void *cc, const void *data, size_t len) +{ + sph_simd_small_context *sc; + + sc = cc; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - sc->ptr; + if (clen > len) + clen = len; + memcpy(sc->buf + sc->ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + if ((sc->ptr += clen) == sizeof sc->buf) { + compress_small(sc, 0); + sc->ptr = 0; + sc->count_low = T32(sc->count_low + 1); + if (sc->count_low == 0) + sc->count_high ++; + } + } +} + +static void +update_big(void *cc, const void *data, size_t len) +{ + sph_simd_big_context *sc; + + sc = cc; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - sc->ptr; + if (clen > len) + clen = len; + memcpy(sc->buf + sc->ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + if ((sc->ptr += clen) == sizeof sc->buf) { + compress_big(sc, 0); + sc->ptr = 0; + sc->count_low = T32(sc->count_low + 1); + if (sc->count_low == 0) + sc->count_high ++; + } + } +} + +static void +encode_count_small(unsigned char *dst, + u32 low, u32 high, size_t ptr, unsigned n) +{ + low = T32(low << 9); + high = T32(high << 9) + (low >> 23); 
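+	/*
+	 * Editor's note: count_low/count_high count whole 64-byte (512-bit)
+	 * blocks, so the shift by 9 rescales the block count to a bit count,
+	 * with the 9 bits shifted out of the 32-bit low word carried into
+	 * high. The bytes still sitting in the buffer contribute ptr << 3
+	 * bits, plus the n extra bits, added just below. The big
+	 * (SIMD-384/512) variant shifts by 10 for its 128-byte blocks.
+	 */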
+ low += (ptr << 3) + n; + sph_enc32le(dst, low); + sph_enc32le(dst + 4, high); +} + +static void +encode_count_big(unsigned char *dst, + u32 low, u32 high, size_t ptr, unsigned n) +{ + low = T32(low << 10); + high = T32(high << 10) + (low >> 22); + low += (ptr << 3) + n; + sph_enc32le(dst, low); + sph_enc32le(dst + 4, high); +} + +static void +finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) +{ + sph_simd_small_context *sc; + unsigned char *d; + size_t u; + + sc = cc; + if (sc->ptr > 0 || n > 0) { + memset(sc->buf + sc->ptr, 0, + (sizeof sc->buf) - sc->ptr); + sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); + compress_small(sc, 0); + } + memset(sc->buf, 0, sizeof sc->buf); + encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); + compress_small(sc, 1); + d = dst; + for (d = dst, u = 0; u < dst_len; u ++) + sph_enc32le(d + (u << 2), sc->state[u]); +} + +static void +finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) +{ + sph_simd_big_context *sc; + unsigned char *d; + size_t u; + + sc = cc; + if (sc->ptr > 0 || n > 0) { + memset(sc->buf + sc->ptr, 0, + (sizeof sc->buf) - sc->ptr); + sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); + compress_big(sc, 0); + } + memset(sc->buf, 0, sizeof sc->buf); + encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); + compress_big(sc, 1); + d = dst; + for (d = dst, u = 0; u < dst_len; u ++) + sph_enc32le(d + (u << 2), sc->state[u]); +} + +void +sph_simd224_init(void *cc) +{ + init_small(cc, IV224); +} + +void +sph_simd224(void *cc, const void *data, size_t len) +{ + update_small(cc, data, len); +} + +void +sph_simd224_close(void *cc, void *dst) +{ + sph_simd224_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_small(cc, ub, n, dst, 7); + sph_simd224_init(cc); +} + +void +sph_simd256_init(void *cc) +{ + init_small(cc, IV256); +} + +void +sph_simd256(void *cc, const void *data, size_t len) +{ + update_small(cc, data, len); +} + +void +sph_simd256_close(void *cc, void *dst) +{ + sph_simd256_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_small(cc, ub, n, dst, 8); + sph_simd256_init(cc); +} + +void +sph_simd384_init(void *cc) +{ + init_big(cc, IV384); +} + +void +sph_simd384(void *cc, const void *data, size_t len) +{ + update_big(cc, data, len); +} + +void +sph_simd384_close(void *cc, void *dst) +{ + sph_simd384_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_big(cc, ub, n, dst, 12); + sph_simd384_init(cc); +} + +void +sph_simd512_init(void *cc) +{ + init_big(cc, IV512); +} + +void +sph_simd512(void *cc, const void *data, size_t len) +{ + update_big(cc, data, len); +} + +void +sph_simd512_close(void *cc, void *dst) +{ + sph_simd512_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_big(cc, ub, n, dst, 16); + sph_simd512_init(cc); +} +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sph/skein.c b/sph/skein.c new file mode 100644 index 0000000..2fcfae5 --- /dev/null +++ b/sph/skein.c @@ -0,0 +1,1244 @@ +/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */ +/* + * Skein implementation. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> + +#include "sph_skein.h" + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN +#define SPH_SMALL_FOOTPRINT_SKEIN 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#if SPH_64 + +#if 0 +/* obsolete */ +/* + * M5_ ## s ## _ ## i evaluates to s+i mod 5 (0 <= s <= 18, 0 <= i <= 3). + */ + +#define M5_0_0 0 +#define M5_0_1 1 +#define M5_0_2 2 +#define M5_0_3 3 + +#define M5_1_0 1 +#define M5_1_1 2 +#define M5_1_2 3 +#define M5_1_3 4 + +#define M5_2_0 2 +#define M5_2_1 3 +#define M5_2_2 4 +#define M5_2_3 0 + +#define M5_3_0 3 +#define M5_3_1 4 +#define M5_3_2 0 +#define M5_3_3 1 + +#define M5_4_0 4 +#define M5_4_1 0 +#define M5_4_2 1 +#define M5_4_3 2 + +#define M5_5_0 0 +#define M5_5_1 1 +#define M5_5_2 2 +#define M5_5_3 3 + +#define M5_6_0 1 +#define M5_6_1 2 +#define M5_6_2 3 +#define M5_6_3 4 + +#define M5_7_0 2 +#define M5_7_1 3 +#define M5_7_2 4 +#define M5_7_3 0 + +#define M5_8_0 3 +#define M5_8_1 4 +#define M5_8_2 0 +#define M5_8_3 1 + +#define M5_9_0 4 +#define M5_9_1 0 +#define M5_9_2 1 +#define M5_9_3 2 + +#define M5_10_0 0 +#define M5_10_1 1 +#define M5_10_2 2 +#define M5_10_3 3 + +#define M5_11_0 1 +#define M5_11_1 2 +#define M5_11_2 3 +#define M5_11_3 4 + +#define M5_12_0 2 +#define M5_12_1 3 +#define M5_12_2 4 +#define M5_12_3 0 + +#define M5_13_0 3 +#define M5_13_1 4 +#define M5_13_2 0 +#define M5_13_3 1 + +#define M5_14_0 4 +#define M5_14_1 0 +#define M5_14_2 1 +#define M5_14_3 2 + +#define M5_15_0 0 +#define M5_15_1 1 +#define M5_15_2 2 +#define M5_15_3 3 + +#define M5_16_0 1 +#define M5_16_1 2 +#define M5_16_2 3 +#define M5_16_3 4 + +#define M5_17_0 2 +#define M5_17_1 3 +#define M5_17_2 4 +#define M5_17_3 0 + +#define M5_18_0 3 +#define M5_18_1 4 +#define M5_18_2 0 +#define M5_18_3 1 +#endif + +/* + * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
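+ * For instance, M9_4_7 expands to 2, since (4 + 7) mod 9 = 2; the SKBI
+ * macro below pastes these constants into identifiers, so key-schedule
+ * indices are resolved at preprocessing time with no runtime modular
+ * arithmetic. (Editor's note.)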
+ */ + +#define M9_0_0 0 +#define M9_0_1 1 +#define M9_0_2 2 +#define M9_0_3 3 +#define M9_0_4 4 +#define M9_0_5 5 +#define M9_0_6 6 +#define M9_0_7 7 + +#define M9_1_0 1 +#define M9_1_1 2 +#define M9_1_2 3 +#define M9_1_3 4 +#define M9_1_4 5 +#define M9_1_5 6 +#define M9_1_6 7 +#define M9_1_7 8 + +#define M9_2_0 2 +#define M9_2_1 3 +#define M9_2_2 4 +#define M9_2_3 5 +#define M9_2_4 6 +#define M9_2_5 7 +#define M9_2_6 8 +#define M9_2_7 0 + +#define M9_3_0 3 +#define M9_3_1 4 +#define M9_3_2 5 +#define M9_3_3 6 +#define M9_3_4 7 +#define M9_3_5 8 +#define M9_3_6 0 +#define M9_3_7 1 + +#define M9_4_0 4 +#define M9_4_1 5 +#define M9_4_2 6 +#define M9_4_3 7 +#define M9_4_4 8 +#define M9_4_5 0 +#define M9_4_6 1 +#define M9_4_7 2 + +#define M9_5_0 5 +#define M9_5_1 6 +#define M9_5_2 7 +#define M9_5_3 8 +#define M9_5_4 0 +#define M9_5_5 1 +#define M9_5_6 2 +#define M9_5_7 3 + +#define M9_6_0 6 +#define M9_6_1 7 +#define M9_6_2 8 +#define M9_6_3 0 +#define M9_6_4 1 +#define M9_6_5 2 +#define M9_6_6 3 +#define M9_6_7 4 + +#define M9_7_0 7 +#define M9_7_1 8 +#define M9_7_2 0 +#define M9_7_3 1 +#define M9_7_4 2 +#define M9_7_5 3 +#define M9_7_6 4 +#define M9_7_7 5 + +#define M9_8_0 8 +#define M9_8_1 0 +#define M9_8_2 1 +#define M9_8_3 2 +#define M9_8_4 3 +#define M9_8_5 4 +#define M9_8_6 5 +#define M9_8_7 6 + +#define M9_9_0 0 +#define M9_9_1 1 +#define M9_9_2 2 +#define M9_9_3 3 +#define M9_9_4 4 +#define M9_9_5 5 +#define M9_9_6 6 +#define M9_9_7 7 + +#define M9_10_0 1 +#define M9_10_1 2 +#define M9_10_2 3 +#define M9_10_3 4 +#define M9_10_4 5 +#define M9_10_5 6 +#define M9_10_6 7 +#define M9_10_7 8 + +#define M9_11_0 2 +#define M9_11_1 3 +#define M9_11_2 4 +#define M9_11_3 5 +#define M9_11_4 6 +#define M9_11_5 7 +#define M9_11_6 8 +#define M9_11_7 0 + +#define M9_12_0 3 +#define M9_12_1 4 +#define M9_12_2 5 +#define M9_12_3 6 +#define M9_12_4 7 +#define M9_12_5 8 +#define M9_12_6 0 +#define M9_12_7 1 + +#define M9_13_0 4 +#define M9_13_1 5 +#define M9_13_2 6 +#define M9_13_3 7 +#define M9_13_4 8 +#define M9_13_5 0 +#define M9_13_6 1 +#define M9_13_7 2 + +#define M9_14_0 5 +#define M9_14_1 6 +#define M9_14_2 7 +#define M9_14_3 8 +#define M9_14_4 0 +#define M9_14_5 1 +#define M9_14_6 2 +#define M9_14_7 3 + +#define M9_15_0 6 +#define M9_15_1 7 +#define M9_15_2 8 +#define M9_15_3 0 +#define M9_15_4 1 +#define M9_15_5 2 +#define M9_15_6 3 +#define M9_15_7 4 + +#define M9_16_0 7 +#define M9_16_1 8 +#define M9_16_2 0 +#define M9_16_3 1 +#define M9_16_4 2 +#define M9_16_5 3 +#define M9_16_6 4 +#define M9_16_7 5 + +#define M9_17_0 8 +#define M9_17_1 0 +#define M9_17_2 1 +#define M9_17_3 2 +#define M9_17_4 3 +#define M9_17_5 4 +#define M9_17_6 5 +#define M9_17_7 6 + +#define M9_18_0 0 +#define M9_18_1 1 +#define M9_18_2 2 +#define M9_18_3 3 +#define M9_18_4 4 +#define M9_18_5 5 +#define M9_18_6 6 +#define M9_18_7 7 + +/* + * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). 
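+ * For instance, M3_4_1 expands to 2, since (4 + 1) mod 3 = 2; SKBT uses
+ * these constants to select among the three tweak words. (Editor's note.)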
+ */ + +#define M3_0_0 0 +#define M3_0_1 1 +#define M3_1_0 1 +#define M3_1_1 2 +#define M3_2_0 2 +#define M3_2_1 0 +#define M3_3_0 0 +#define M3_3_1 1 +#define M3_4_0 1 +#define M3_4_1 2 +#define M3_5_0 2 +#define M3_5_1 0 +#define M3_6_0 0 +#define M3_6_1 1 +#define M3_7_0 1 +#define M3_7_1 2 +#define M3_8_0 2 +#define M3_8_1 0 +#define M3_9_0 0 +#define M3_9_1 1 +#define M3_10_0 1 +#define M3_10_1 2 +#define M3_11_0 2 +#define M3_11_1 0 +#define M3_12_0 0 +#define M3_12_1 1 +#define M3_13_0 1 +#define M3_13_1 2 +#define M3_14_0 2 +#define M3_14_1 0 +#define M3_15_0 0 +#define M3_15_1 1 +#define M3_16_0 1 +#define M3_16_1 2 +#define M3_17_0 2 +#define M3_17_1 0 +#define M3_18_0 0 +#define M3_18_1 1 + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#if 0 +/* obsolete */ +#define SKSI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M5_, s), _), i)) +#define SKST(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) +#endif + +#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) +#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) + +#if 0 +/* obsolete */ +#define TFSMALL_KINIT(k0, k1, k2, k3, k4, t0, t1, t2) do { \ + k4 = (k0 ^ k1) ^ (k2 ^ k3) ^ SPH_C64(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ + } while (0) +#endif + +#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) do { \ + k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ + ^ SPH_C64(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_ADDKEY(w0, w1, w2, w3, k, t, s) do { \ + w0 = SPH_T64(w0 + SKSI(k, s, 0)); \ + w1 = SPH_T64(w1 + SKSI(k, s, 1) + SKST(t, s, 0)); \ + w2 = SPH_T64(w2 + SKSI(k, s, 2) + SKST(t, s, 1)); \ + w3 = SPH_T64(w3 + SKSI(k, s, 3) + (sph_u64)s); \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define TFBIG_ADDKEY(s, tt0, tt1) do { \ + p0 = SPH_T64(p0 + h[s + 0]); \ + p1 = SPH_T64(p1 + h[s + 1]); \ + p2 = SPH_T64(p2 + h[s + 2]); \ + p3 = SPH_T64(p3 + h[s + 3]); \ + p4 = SPH_T64(p4 + h[s + 4]); \ + p5 = SPH_T64(p5 + h[s + 5] + tt0); \ + p6 = SPH_T64(p6 + h[s + 6] + tt1); \ + p7 = SPH_T64(p7 + h[s + 7] + (sph_u64)s); \ + } while (0) + +#else + +#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) do { \ + w0 = SPH_T64(w0 + SKBI(k, s, 0)); \ + w1 = SPH_T64(w1 + SKBI(k, s, 1)); \ + w2 = SPH_T64(w2 + SKBI(k, s, 2)); \ + w3 = SPH_T64(w3 + SKBI(k, s, 3)); \ + w4 = SPH_T64(w4 + SKBI(k, s, 4)); \ + w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)s); \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define TFSMALL_MIX(x0, x1, rc) do { \ + x0 = SPH_T64(x0 + x1); \ + x1 = SPH_ROTL64(x1, rc) ^ x0; \ + } while (0) +#endif + +#define TFBIG_MIX(x0, x1, rc) do { \ + x0 = SPH_T64(x0 + x1); \ + x1 = SPH_ROTL64(x1, rc) ^ x0; \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_MIX4(w0, w1, w2, w3, rc0, rc1) do { \ + TFSMALL_MIX(w0, w1, rc0); \ + TFSMALL_MIX(w2, w3, rc1); \ + } while (0) +#endif + +#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ + TFBIG_MIX(w0, w1, rc0); \ + TFBIG_MIX(w2, w3, rc1); \ + TFBIG_MIX(w4, w5, rc2); \ + TFBIG_MIX(w6, w7, rc3); \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_4e(s) do { \ + TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \ + TFSMALL_MIX4(p0, p1, p2, p3, 14, 16); \ + TFSMALL_MIX4(p0, p3, p2, p1, 52, 57); \ + TFSMALL_MIX4(p0, p1, p2, p3, 23, 40); \ + TFSMALL_MIX4(p0, p3, p2, p1, 5, 37); \ + } while (0) + +#define TFSMALL_4o(s) do { \ + TFSMALL_ADDKEY(p0, p1, 
p2, p3, h, t, s); \ + TFSMALL_MIX4(p0, p1, p2, p3, 25, 33); \ + TFSMALL_MIX4(p0, p3, p2, p1, 46, 12); \ + TFSMALL_MIX4(p0, p1, p2, p3, 58, 22); \ + TFSMALL_MIX4(p0, p3, p2, p1, 32, 32); \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define TFBIG_4e(s) do { \ + TFBIG_ADDKEY(s, t0, t1); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) + +#define TFBIG_4o(s) do { \ + TFBIG_ADDKEY(s, t1, t2); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) + +#else + +#define TFBIG_4e(s) do { \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) + +#define TFBIG_4o(s) do { \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define UBI_SMALL(etype, extra) do { \ + sph_u64 h4, t0, t1, t2; \ + sph_u64 m0 = sph_dec64le(buf + 0); \ + sph_u64 m1 = sph_dec64le(buf + 8); \ + sph_u64 m2 = sph_dec64le(buf + 16); \ + sph_u64 m3 = sph_dec64le(buf + 24); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + t0 = SPH_T64(bcount << 5) + (sph_u64)(extra); \ + t1 = (bcount >> 59) + ((sph_u64)(etype) << 55); \ + TFSMALL_KINIT(h0, h1, h2, h3, h4, t0, t1, t2); \ + TFSMALL_4e(0); \ + TFSMALL_4o(1); \ + TFSMALL_4e(2); \ + TFSMALL_4o(3); \ + TFSMALL_4e(4); \ + TFSMALL_4o(5); \ + TFSMALL_4e(6); \ + TFSMALL_4o(7); \ + TFSMALL_4e(8); \ + TFSMALL_4o(9); \ + TFSMALL_4e(10); \ + TFSMALL_4o(11); \ + TFSMALL_4e(12); \ + TFSMALL_4o(13); \ + TFSMALL_4e(14); \ + TFSMALL_4o(15); \ + TFSMALL_4e(16); \ + TFSMALL_4o(17); \ + TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, 18); \ + h0 = m0 ^ p0; \ + h1 = m1 ^ p1; \ + h2 = m2 ^ p2; \ + h3 = m3 ^ p3; \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define UBI_BIG(etype, extra) do { \ + sph_u64 t0, t1, t2; \ + unsigned u; \ + sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ + sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ + sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ + sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ + sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ + sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ + sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ + sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + sph_u64 p4 = m4; \ + sph_u64 p5 = m5; \ + sph_u64 p6 = m6; \ + sph_u64 p7 = m7; \ + t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ + t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + TFBIG_KINIT(h[0], h[1], h[2], h[3], h[4], h[5], \ + h[6], h[7], h[8], t0, t1, t2); \ + for (u = 0; u <= 15; u += 3) { \ + h[u + 9] = h[u + 0]; \ + h[u + 10] = h[u + 1]; \ + 
h[u + 11] = h[u + 2]; \ + } \ + for (u = 0; u < 9; u ++) { \ + sph_u64 s = u << 1; \ + sph_u64 tmp; \ + TFBIG_4e(s); \ + TFBIG_4o(s + 1); \ + tmp = t2; \ + t2 = t1; \ + t1 = t0; \ + t0 = tmp; \ + } \ + TFBIG_ADDKEY(18, t0, t1); \ + h[0] = m0 ^ p0; \ + h[1] = m1 ^ p1; \ + h[2] = m2 ^ p2; \ + h[3] = m3 ^ p3; \ + h[4] = m4 ^ p4; \ + h[5] = m5 ^ p5; \ + h[6] = m6 ^ p6; \ + h[7] = m7 ^ p7; \ + } while (0) + +#else + +#define UBI_BIG(etype, extra) do { \ + sph_u64 h8, t0, t1, t2; \ + sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ + sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ + sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ + sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ + sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ + sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ + sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ + sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + sph_u64 p4 = m4; \ + sph_u64 p5 = m5; \ + sph_u64 p6 = m6; \ + sph_u64 p7 = m7; \ + t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ + t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \ + TFBIG_4e(0); \ + TFBIG_4o(1); \ + TFBIG_4e(2); \ + TFBIG_4o(3); \ + TFBIG_4e(4); \ + TFBIG_4o(5); \ + TFBIG_4e(6); \ + TFBIG_4o(7); \ + TFBIG_4e(8); \ + TFBIG_4o(9); \ + TFBIG_4e(10); \ + TFBIG_4o(11); \ + TFBIG_4e(12); \ + TFBIG_4o(13); \ + TFBIG_4e(14); \ + TFBIG_4o(15); \ + TFBIG_4e(16); \ + TFBIG_4o(17); \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \ + h0 = m0 ^ p0; \ + h1 = m1 ^ p1; \ + h2 = m2 ^ p2; \ + h3 = m3 ^ p3; \ + h4 = m4 ^ p4; \ + h5 = m5 ^ p5; \ + h6 = m6 ^ p6; \ + h7 = m7 ^ p7; \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define DECL_STATE_SMALL \ + sph_u64 h0, h1, h2, h3; \ + sph_u64 bcount; + +#define READ_STATE_SMALL(sc) do { \ + h0 = (sc)->h0; \ + h1 = (sc)->h1; \ + h2 = (sc)->h2; \ + h3 = (sc)->h3; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + (sc)->h0 = h0; \ + (sc)->h1 = h1; \ + (sc)->h2 = h2; \ + (sc)->h3 = h3; \ + sc->bcount = bcount; \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define DECL_STATE_BIG \ + sph_u64 h[27]; \ + sph_u64 bcount; + +#define READ_STATE_BIG(sc) do { \ + h[0] = (sc)->h0; \ + h[1] = (sc)->h1; \ + h[2] = (sc)->h2; \ + h[3] = (sc)->h3; \ + h[4] = (sc)->h4; \ + h[5] = (sc)->h5; \ + h[6] = (sc)->h6; \ + h[7] = (sc)->h7; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->h0 = h[0]; \ + (sc)->h1 = h[1]; \ + (sc)->h2 = h[2]; \ + (sc)->h3 = h[3]; \ + (sc)->h4 = h[4]; \ + (sc)->h5 = h[5]; \ + (sc)->h6 = h[6]; \ + (sc)->h7 = h[7]; \ + sc->bcount = bcount; \ + } while (0) + +#else + +#define DECL_STATE_BIG \ + sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; \ + sph_u64 bcount; + +#define READ_STATE_BIG(sc) do { \ + h0 = (sc)->h0; \ + h1 = (sc)->h1; \ + h2 = (sc)->h2; \ + h3 = (sc)->h3; \ + h4 = (sc)->h4; \ + h5 = (sc)->h5; \ + h6 = (sc)->h6; \ + h7 = (sc)->h7; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->h0 = h0; \ + (sc)->h1 = h1; \ + (sc)->h2 = h2; \ + (sc)->h3 = h3; \ + (sc)->h4 = h4; \ + (sc)->h5 = h5; \ + (sc)->h6 = h6; \ + (sc)->h7 = h7; \ + sc->bcount = bcount; \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +static void +skein_small_init(sph_skein_small_context *sc, const sph_u64 *iv) +{ + sc->h0 = iv[0]; + sc->h1 = iv[1]; + sc->h2 = iv[2]; + sc->h3 = iv[3]; + sc->bcount = 0; + sc->ptr = 0; +} +#endif + +static void 
+skein_big_init(sph_skein_big_context *sc, const sph_u64 *iv) +{ + sc->h0 = iv[0]; + sc->h1 = iv[1]; + sc->h2 = iv[2]; + sc->h3 = iv[3]; + sc->h4 = iv[4]; + sc->h5 = iv[5]; + sc->h6 = iv[6]; + sc->h7 = iv[7]; + sc->bcount = 0; + sc->ptr = 0; +} + +#if 0 +/* obsolete */ +static void +skein_small_core(sph_skein_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr, clen; + unsigned first; + DECL_STATE_SMALL + + buf = sc->buf; + ptr = sc->ptr; + clen = (sizeof sc->buf) - ptr; + if (len <= clen) { + memcpy(buf + ptr, data, len); + sc->ptr = ptr + len; + return; + } + if (clen != 0) { + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + } + +#if SPH_SMALL_FOOTPRINT_SKEIN + + READ_STATE_SMALL(sc); + first = (bcount == 0) << 7; + for (;;) { + bcount ++; + UBI_SMALL(96 + first, 0); + if (len <= sizeof sc->buf) + break; + first = 0; + memcpy(buf, data, sizeof sc->buf); + data = (const unsigned char *)data + sizeof sc->buf; + len -= sizeof sc->buf; + } + WRITE_STATE_SMALL(sc); + sc->ptr = len; + memcpy(buf, data, len); + +#else + + /* + * Unrolling the loop yields a slight performance boost, while + * keeping the code size around 24 kB on 32-bit x86. + */ + READ_STATE_SMALL(sc); + first = (bcount == 0) << 7; + for (;;) { + bcount ++; + UBI_SMALL(96 + first, 0); + if (len <= sizeof sc->buf) + break; + buf = (unsigned char *)data; + bcount ++; + UBI_SMALL(96, 0); + if (len <= 2 * sizeof sc->buf) { + data = buf + sizeof sc->buf; + len -= sizeof sc->buf; + break; + } + buf += sizeof sc->buf; + data = buf + sizeof sc->buf; + first = 0; + len -= 2 * sizeof sc->buf; + } + WRITE_STATE_SMALL(sc); + sc->ptr = len; + memcpy(sc->buf, data, len); + +#endif +} +#endif + +static void +skein_big_core(sph_skein_big_context *sc, const void *data, size_t len) +{ + /* + * The Skein "final bit" in the tweak is troublesome here, + * because if the input has a length which is a multiple of the + * block size (512 bits) then that bit must be set for the + * final block, which is full of message bits (padding in + * Skein can be reduced to no extra bit at all). However, this + * function cannot know whether it processes the last chunks of + * the message or not. Hence we may keep a full block of buffered + * data (64 bytes).
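+	 * In other words, a full buffer is compressed only once at least
+	 * one more input byte has arrived, so the last block always reaches
+	 * the close() routine with its "final" tweak bit still settable.
+	 * (Editor's note.)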
+ */ + unsigned char *buf; + size_t ptr; + unsigned first; + DECL_STATE_BIG + + buf = sc->buf; + ptr = sc->ptr; + if (len <= (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE_BIG(sc); + first = (bcount == 0) << 7; + do { + size_t clen; + + if (ptr == sizeof sc->buf) { + bcount ++; + UBI_BIG(96 + first, 0); + first = 0; + ptr = 0; + } + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + } while (len > 0); + WRITE_STATE_BIG(sc); + sc->ptr = ptr; +} + +#if 0 +/* obsolete */ +static void +skein_small_close(sph_skein_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_len) +{ + unsigned char *buf; + size_t ptr; + unsigned et; + int i; + DECL_STATE_SMALL + + if (n != 0) { + unsigned z; + unsigned char x; + + z = 0x80 >> n; + x = ((ub & -z) | z) & 0xFF; + skein_small_core(sc, &x, 1); + } + + buf = sc->buf; + ptr = sc->ptr; + READ_STATE_SMALL(sc); + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + et = 352 + ((bcount == 0) << 7) + (n != 0); + for (i = 0; i < 2; i ++) { + UBI_SMALL(et, ptr); + if (i == 0) { + memset(buf, 0, sizeof sc->buf); + bcount = 0; + et = 510; + ptr = 8; + } + } + + sph_enc64le_aligned(buf + 0, h0); + sph_enc64le_aligned(buf + 8, h1); + sph_enc64le_aligned(buf + 16, h2); + sph_enc64le_aligned(buf + 24, h3); + memcpy(dst, buf, out_len); +} +#endif + +static void +skein_big_close(sph_skein_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_len) +{ + unsigned char *buf; + size_t ptr; + unsigned et; + int i; +#if SPH_SMALL_FOOTPRINT_SKEIN + size_t u; +#endif + DECL_STATE_BIG + + /* + * Add bit padding if necessary. + */ + if (n != 0) { + unsigned z; + unsigned char x; + + z = 0x80 >> n; + x = ((ub & -z) | z) & 0xFF; + skein_big_core(sc, &x, 1); + } + + buf = sc->buf; + ptr = sc->ptr; + + /* + * At that point, if ptr == 0, then the message was empty; + * otherwise, there is between 1 and 64 bytes (inclusive) which + * are yet to be processed. Either way, we complete the buffer + * to a full block with zeros (the Skein specification mandates + * that an empty message is padded so that there is at least + * one block to process). + * + * Once this block has been processed, we do it again, with + * a block full of zeros, for the output (that block contains + * the encoding of "0", over 8 bytes, then padded with zeros). + */ + READ_STATE_BIG(sc); + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + et = 352 + ((bcount == 0) << 7) + (n != 0); + for (i = 0; i < 2; i ++) { + UBI_BIG(et, ptr); + if (i == 0) { + memset(buf, 0, sizeof sc->buf); + bcount = 0; + et = 510; + ptr = 8; + } + } + +#if SPH_SMALL_FOOTPRINT_SKEIN + + /* + * We use a temporary buffer because we must support the case + * where output size is not a multiple of 64 (namely, a 224-bit + * output). 
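+	 * (Editor's note: 28 output bytes cover h[0..2] and only half of
+	 * h[3], so the state words cannot all be stored directly into dst.)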
+ */ + for (u = 0; u < out_len; u += 8) + sph_enc64le_aligned(buf + u, h[u >> 3]); + memcpy(dst, buf, out_len); + +#else + + sph_enc64le_aligned(buf + 0, h0); + sph_enc64le_aligned(buf + 8, h1); + sph_enc64le_aligned(buf + 16, h2); + sph_enc64le_aligned(buf + 24, h3); + sph_enc64le_aligned(buf + 32, h4); + sph_enc64le_aligned(buf + 40, h5); + sph_enc64le_aligned(buf + 48, h6); + sph_enc64le_aligned(buf + 56, h7); + memcpy(dst, buf, out_len); + +#endif +} + +#if 0 +/* obsolete */ +static const sph_u64 IV224[] = { + SPH_C64(0xC6098A8C9AE5EA0B), SPH_C64(0x876D568608C5191C), + SPH_C64(0x99CB88D7D7F53884), SPH_C64(0x384BDDB1AEDDB5DE) +}; + +static const sph_u64 IV256[] = { + SPH_C64(0xFC9DA860D048B449), SPH_C64(0x2FCA66479FA7D833), + SPH_C64(0xB33BC3896656840F), SPH_C64(0x6A54E920FDE8DA69) +}; +#endif + +static const sph_u64 IV224[] = { + SPH_C64(0xCCD0616248677224), SPH_C64(0xCBA65CF3A92339EF), + SPH_C64(0x8CCD69D652FF4B64), SPH_C64(0x398AED7B3AB890B4), + SPH_C64(0x0F59D1B1457D2BD0), SPH_C64(0x6776FE6575D4EB3D), + SPH_C64(0x99FBC70E997413E9), SPH_C64(0x9E2CFCCFE1C41EF7) +}; + +static const sph_u64 IV256[] = { + SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), + SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), + SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), + SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) +}; + +static const sph_u64 IV384[] = { + SPH_C64(0xA3F6C6BF3A75EF5F), SPH_C64(0xB0FEF9CCFD84FAA4), + SPH_C64(0x9D77DD663D770CFE), SPH_C64(0xD798CBF3B468FDDA), + SPH_C64(0x1BC4A6668A0E4465), SPH_C64(0x7ED7D434E5807407), + SPH_C64(0x548FC1ACD4EC44D6), SPH_C64(0x266E17546AA18FF8) +}; + +static const sph_u64 IV512[] = { + SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), + SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), + SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), + SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) +}; + +#if 0 +/* obsolete */ +/* see sph_skein.h */ +void +sph_skein224_init(void *cc) +{ + skein_small_init(cc, IV224); +} + +/* see sph_skein.h */ +void +sph_skein224(void *cc, const void *data, size_t len) +{ + skein_small_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein224_close(void *cc, void *dst) +{ + sph_skein224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_small_close(cc, ub, n, dst, 28); + sph_skein224_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein256_init(void *cc) +{ + skein_small_init(cc, IV256); +} + +/* see sph_skein.h */ +void +sph_skein256(void *cc, const void *data, size_t len) +{ + skein_small_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein256_close(void *cc, void *dst) +{ + sph_skein256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_small_close(cc, ub, n, dst, 32); + sph_skein256_init(cc); +} +#endif + +/* see sph_skein.h */ +void +sph_skein224_init(void *cc) +{ + skein_big_init(cc, IV224); +} + +/* see sph_skein.h */ +void +sph_skein224(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein224_close(void *cc, void *dst) +{ + sph_skein224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 28); + sph_skein224_init(cc); +} + +/* see 
sph_skein.h */ +void +sph_skein256_init(void *cc) +{ + skein_big_init(cc, IV256); +} + +/* see sph_skein.h */ +void +sph_skein256(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein256_close(void *cc, void *dst) +{ + sph_skein256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 32); + sph_skein256_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein384_init(void *cc) +{ + skein_big_init(cc, IV384); +} + +/* see sph_skein.h */ +void +sph_skein384(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein384_close(void *cc, void *dst) +{ + sph_skein384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 48); + sph_skein384_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein512_init(void *cc) +{ + skein_big_init(cc, IV512); +} + +/* see sph_skein.h */ +void +sph_skein512(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein512_close(void *cc, void *dst) +{ + sph_skein512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 64); + sph_skein512_init(cc); +} + +#endif diff --git a/sph_blake.h b/sph/sph_blake.h similarity index 100% rename from sph_blake.h rename to sph/sph_blake.h diff --git a/sph/sph_bmw.h b/sph/sph_bmw.h new file mode 100644 index 0000000..484a2a7 --- /dev/null +++ b/sph/sph_bmw.h @@ -0,0 +1,320 @@ +/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * BMW interface. BMW (aka "Blue Midnight Wish") is a family of + * functions which differ by their output size; this implementation + * defines BMW for output sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_bmw.h + * @author Thomas Pornin + */ + +#ifndef SPH_BMW_H__ +#define SPH_BMW_H__ + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for BMW-224. + */ +#define SPH_SIZE_bmw224 224 + +/** + * Output size (in bits) for BMW-256. + */ +#define SPH_SIZE_bmw256 256 + +#if SPH_64 + +/** + * Output size (in bits) for BMW-384. + */ +#define SPH_SIZE_bmw384 384 + +/** + * Output size (in bits) for BMW-512. + */ +#define SPH_SIZE_bmw512 512 + +#endif + +/** + * This structure is a context for BMW-224 and BMW-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BMW computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BMW + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 H[16]; +#if SPH_64 + sph_u64 bit_count; +#else + sph_u32 bit_count_high, bit_count_low; +#endif +#endif +} sph_bmw_small_context; + +/** + * This structure is a context for BMW-224 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_small_context sph_bmw224_context; + +/** + * This structure is a context for BMW-256 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_small_context sph_bmw256_context; + +#if SPH_64 + +/** + * This structure is a context for BMW-384 and BMW-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BMW computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BMW + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u64 H[16]; + sph_u64 bit_count; +#endif +} sph_bmw_big_context; + +/** + * This structure is a context for BMW-384 computations. It is + * identical to the common sph_bmw_big_context. + */ +typedef sph_bmw_big_context sph_bmw384_context; + +/** + * This structure is a context for BMW-512 computations. It is + * identical to the common sph_bmw_big_context. + */ +typedef sph_bmw_big_context sph_bmw512_context; + +#endif + +/** + * Initialize a BMW-224 context. This process performs no memory allocation. + * + * @param cc the BMW-224 context (pointer to a + * sph_bmw224_context) + */ +void sph_bmw224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw224(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized.
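+ * + * A minimal call sequence (illustrative sketch added by the editor, + * not part of the upstream header; "abc" stands for caller data): + * + * sph_bmw224_context cc; + * unsigned char out[28]; + * sph_bmw224_init(&cc); + * sph_bmw224(&cc, "abc", 3); + * sph_bmw224_close(&cc, out); + * + * After the last call, out holds the 28-byte digest and cc is ready + * for a fresh computation.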
+ * + * @param cc the BMW-224 context + * @param dst the destination buffer + */ +void sph_bmw224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BMW-256 context. This process performs no memory allocation. + * + * @param cc the BMW-256 context (pointer to a + * sph_bmw256_context) + */ +void sph_bmw256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw256(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-256 context + * @param dst the destination buffer + */ +void sph_bmw256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#if SPH_64 + +/** + * Initialize a BMW-384 context. This process performs no memory allocation. + * + * @param cc the BMW-384 context (pointer to a + * sph_bmw384_context) + */ +void sph_bmw384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw384(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-384 context + * @param dst the destination buffer + */ +void sph_bmw384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). 
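For example, with n == 2 and ub == 0x80, the two appended bits are 1 then 0 (bits 7 and 6 of ub).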
The context is automatically reinitialized. + * + * @param cc the BMW-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BMW-512 context. This process performs no memory allocation. + * + * @param cc the BMW-512 context (pointer to a + * sph_bmw512_context) + */ +void sph_bmw512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw512(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-512 context + * @param dst the destination buffer + */ +void sph_bmw512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#endif diff --git a/sph/sph_cubehash.h b/sph/sph_cubehash.h new file mode 100644 index 0000000..487a194 --- /dev/null +++ b/sph/sph_cubehash.h @@ -0,0 +1,292 @@ +/* $Id: sph_cubehash.h 180 2010-05-08 02:29:25Z tp $ */ +/** + * CubeHash interface. CubeHash is a family of functions which differ by + * their output size; this implementation defines CubeHash for output + * sizes 224, 256, 384 and 512 bits, with the "standard parameters" + * (CubeHash16/32 with the CubeHash specification notations). + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_cubehash.h + * @author Thomas Pornin + */ + +#ifndef SPH_CUBEHASH_H__ +#define SPH_CUBEHASH_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for CubeHash-224. + */ +#define SPH_SIZE_cubehash224 224 + +/** + * Output size (in bits) for CubeHash-256. + */ +#define SPH_SIZE_cubehash256 256 + +/** + * Output size (in bits) for CubeHash-384. + */ +#define SPH_SIZE_cubehash384 384 + +/** + * Output size (in bits) for CubeHash-512. + */ +#define SPH_SIZE_cubehash512 512 + +/** + * This structure is a context for CubeHash computations: it contains the + * intermediate values and some data from the last entered block. Once + * a CubeHash computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running CubeHash computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[32]; +#endif +} sph_cubehash_context; + +/** + * Type for a CubeHash-224 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash224_context; + +/** + * Type for a CubeHash-256 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash256_context; + +/** + * Type for a CubeHash-384 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash384_context; + +/** + * Type for a CubeHash-512 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash512_context; + +/** + * Initialize a CubeHash-224 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-224 context (pointer to a + * sph_cubehash224_context) + */ +void sph_cubehash224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash224(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-224 context + * @param dst the destination buffer + */ +void sph_cubehash224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-256 context. This process performs no memory + * allocation.
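+ * + * Because the context is a plain structure, a computation over a + * shared prefix can be forked by copying it (illustrative sketch by + * the editor; prefix/suffix stand for caller data): + * + * sph_cubehash256_context base, fork; + * sph_cubehash256_init(&base); + * sph_cubehash256(&base, prefix, prefix_len); + * fork = base; + * sph_cubehash256(&fork, suffix, suffix_len);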
+ * + * @param cc the CubeHash-256 context (pointer to a + * sph_cubehash256_context) + */ +void sph_cubehash256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash256(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-256 context + * @param dst the destination buffer + */ +void sph_cubehash256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-384 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-384 context (pointer to a + * sph_cubehash384_context) + */ +void sph_cubehash384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash384(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-384 context + * @param dst the destination buffer + */ +void sph_cubehash384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-512 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-512 context (pointer to a + * sph_cubehash512_context) + */ +void sph_cubehash512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). 
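+ * Successive calls process the concatenation of their inputs, so the + * data may be streamed in chunks of any size (editor's note).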
+ * + * @param cc the CubeHash-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash512(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-512 context + * @param dst the destination buffer + */ +void sph_cubehash512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_echo.h b/sph/sph_echo.h new file mode 100644 index 0000000..1ae1e3d --- /dev/null +++ b/sph/sph_echo.h @@ -0,0 +1,320 @@ +/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * ECHO interface. ECHO is a family of functions which differ by + * their output size; this implementation defines ECHO for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_echo.h + * @author Thomas Pornin + */ + +#ifndef SPH_ECHO_H__ +#define SPH_ECHO_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for ECHO-224. + */ +#define SPH_SIZE_echo224 224 + +/** + * Output size (in bits) for ECHO-256. + */ +#define SPH_SIZE_echo256 256 + +/** + * Output size (in bits) for ECHO-384. + */ +#define SPH_SIZE_echo384 384 + +/** + * Output size (in bits) for ECHO-512.
+ */ +#define SPH_SIZE_echo512 512 + +/** + * This structure is a context for ECHO computations: it contains the + * intermediate values and some data from the last entered block. Once + * an ECHO computation has been performed, the context can be reused for + * another computation. This specific structure is used for ECHO-224 + * and ECHO-256. + * + * The contents of this structure are private. A running ECHO computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[192]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[4][4]; +#if SPH_64 + sph_u64 Vb[4][2]; +#endif + } u; + sph_u32 C0, C1, C2, C3; +#endif +} sph_echo_small_context; + +/** + * This structure is a context for ECHO computations: it contains the + * intermediate values and some data from the last entered block. Once + * an ECHO computation has been performed, the context can be reused for + * another computation. This specific structure is used for ECHO-384 + * and ECHO-512. + * + * The contents of this structure are private. A running ECHO computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[8][4]; +#if SPH_64 + sph_u64 Vb[8][2]; +#endif + } u; + sph_u32 C0, C1, C2, C3; +#endif +} sph_echo_big_context; + +/** + * Type for a ECHO-224 context (identical to the common "small" context). + */ +typedef sph_echo_small_context sph_echo224_context; + +/** + * Type for a ECHO-256 context (identical to the common "small" context). + */ +typedef sph_echo_small_context sph_echo256_context; + +/** + * Type for a ECHO-384 context (identical to the common "big" context). + */ +typedef sph_echo_big_context sph_echo384_context; + +/** + * Type for a ECHO-512 context (identical to the common "big" context). + */ +typedef sph_echo_big_context sph_echo512_context; + +/** + * Initialize an ECHO-224 context. This process performs no memory allocation. + * + * @param cc the ECHO-224 context (pointer to a + * sph_echo224_context) + */ +void sph_echo224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo224(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-224 context + * @param dst the destination buffer + */ +void sph_echo224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
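+ * (Editor's note: the plain close() entry point is this function + * invoked with ub == 0 and n == 0, matching the pattern visible in + * the simd and skein implementations earlier in this patch.)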
+ * + * @param cc the ECHO-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-256 context. This process performs no memory allocation. + * + * @param cc the ECHO-256 context (pointer to a + * sph_echo256_context) + */ +void sph_echo256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo256(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-256 context + * @param dst the destination buffer + */ +void sph_echo256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-384 context. This process performs no memory allocation. + * + * @param cc the ECHO-384 context (pointer to a + * sph_echo384_context) + */ +void sph_echo384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo384(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-384 context + * @param dst the destination buffer + */ +void sph_echo384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-512 context. This process performs no memory allocation. + * + * @param cc the ECHO-512 context (pointer to a + * sph_echo512_context) + */ +void sph_echo512_init(void *cc); + +/** + * Process some data bytes. 
It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo512(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-512 context + * @param dst the destination buffer + */ +void sph_echo512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph_fugue.h b/sph/sph_fugue.h similarity index 100% rename from sph_fugue.h rename to sph/sph_fugue.h diff --git a/sph_groestl.h b/sph/sph_groestl.h similarity index 100% rename from sph_groestl.h rename to sph/sph_groestl.h diff --git a/sph/sph_jh.h b/sph/sph_jh.h new file mode 100644 index 0000000..0268406 --- /dev/null +++ b/sph/sph_jh.h @@ -0,0 +1,290 @@ +/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * JH interface. JH is a family of functions which differ by + * their output size; this implementation defines JH for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_jh.h + * @author Thomas Pornin + */ + +#ifndef SPH_JH_H__ +#define SPH_JH_H__ + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for JH-224. + */ +#define SPH_SIZE_jh224 224 + +/** + * Output size (in bits) for JH-256. 
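All of these headers share one streaming pattern: an _init() call, any number of update calls, then a _close() that writes the digest and leaves the context reinitialized. A minimal usage sketch for ECHO-512, assuming the matching sph_echo.c implementation is linked in (the helper name is illustrative):

#include <stddef.h>
#include "sph/sph_echo.h"

/* One-shot ECHO-512 digest of an arbitrary byte buffer. */
static void echo512_digest(const void *data, size_t len, unsigned char dst[64])
{
	sph_echo512_context ctx;

	sph_echo512_init(&ctx);
	sph_echo512(&ctx, data, len);	/* may be called repeatedly to stream */
	sph_echo512_close(&ctx, dst);	/* writes 64 bytes, reinitializes ctx */
}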
+ */ +#define SPH_SIZE_jh256 256 + +/** + * Output size (in bits) for JH-384. + */ +#define SPH_SIZE_jh384 384 + +/** + * Output size (in bits) for JH-512. + */ +#define SPH_SIZE_jh512 512 + +/** + * This structure is a context for JH computations: it contains the + * intermediate values and some data from the last entered block. Once + * a JH computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running JH computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[16]; +#endif + sph_u32 narrow[32]; + } H; +#if SPH_64 + sph_u64 block_count; +#else + sph_u32 block_count_high, block_count_low; +#endif +#endif +} sph_jh_context; + +/** + * Type for a JH-224 context (identical to the common context). + */ +typedef sph_jh_context sph_jh224_context; + +/** + * Type for a JH-256 context (identical to the common context). + */ +typedef sph_jh_context sph_jh256_context; + +/** + * Type for a JH-384 context (identical to the common context). + */ +typedef sph_jh_context sph_jh384_context; + +/** + * Type for a JH-512 context (identical to the common context). + */ +typedef sph_jh_context sph_jh512_context; + +/** + * Initialize a JH-224 context. This process performs no memory allocation. + * + * @param cc the JH-224 context (pointer to a + * sph_jh224_context) + */ +void sph_jh224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh224(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-224 context + * @param dst the destination buffer + */ +void sph_jh224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a JH-256 context. This process performs no memory allocation. + * + * @param cc the JH-256 context (pointer to a + * sph_jh256_context) + */ +void sph_jh256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh256(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-256 computation and output the result into + * the provided buffer. 
The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-256 context + * @param dst the destination buffer + */ +void sph_jh256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a JH-384 context. This process performs no memory allocation. + * + * @param cc the JH-384 context (pointer to a + * sph_jh384_context) + */ +void sph_jh384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh384(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-384 context + * @param dst the destination buffer + */ +void sph_jh384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a JH-512 context. This process performs no memory allocation. + * + * @param cc the JH-512 context (pointer to a + * sph_jh512_context) + */ +void sph_jh512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh512(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-512 context + * @param dst the destination buffer + */ +void sph_jh512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). 
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif diff --git a/sph_keccak.h b/sph/sph_keccak.h similarity index 100% rename from sph_keccak.h rename to sph/sph_keccak.h diff --git a/sph/sph_luffa.h b/sph/sph_luffa.h new file mode 100644 index 0000000..a32fd7b --- /dev/null +++ b/sph/sph_luffa.h @@ -0,0 +1,296 @@ +/* $Id: sph_luffa.h 154 2010-04-26 17:00:24Z tp $ */ +/** + * Luffa interface. Luffa is a family of functions which differ by + * their output size; this implementation defines Luffa for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_luffa.h + * @author Thomas Pornin + */ + +#ifndef SPH_LUFFA_H__ +#define SPH_LUFFA_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for Luffa-224. + */ +#define SPH_SIZE_luffa224 224 + +/** + * Output size (in bits) for Luffa-256. + */ +#define SPH_SIZE_luffa256 256 + +/** + * Output size (in bits) for Luffa-384. + */ +#define SPH_SIZE_luffa384 384 + +/** + * Output size (in bits) for Luffa-512. + */ +#define SPH_SIZE_luffa512 512 + +/** + * This structure is a context for Luffa-224 computations: it contains + * the intermediate values and some data from the last entered block. + * Once a Luffa computation has been performed, the context can be + * reused for another computation. + * + * The contents of this structure are private. A running Luffa + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[3][8]; +#endif +} sph_luffa224_context; + +/** + * This structure is a context for Luffa-256 computations. It is + * identical to sph_luffa224_context. 
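Because _close() reinitializes the context, a single context object can serve consecutive, independent computations after one _init(). A short sketch with JH-512 (the helper name is illustrative):

#include "sph/sph_jh.h"

/* Two independent digests from one context: no second _init() needed. */
static void jh512_two_digests(unsigned char d1[64], unsigned char d2[64])
{
	sph_jh512_context jc;

	sph_jh512_init(&jc);
	sph_jh512(&jc, "abc", 3);
	sph_jh512_close(&jc, d1);	/* jc is automatically reinitialized */
	sph_jh512(&jc, "def", 3);
	sph_jh512_close(&jc, d2);
}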
+ */ +typedef sph_luffa224_context sph_luffa256_context; + +/** + * This structure is a context for Luffa-384 computations. + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[4][8]; +#endif +} sph_luffa384_context; + +/** + * This structure is a context for Luffa-512 computations. + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[5][8]; +#endif +} sph_luffa512_context; + +/** + * Initialize a Luffa-224 context. This process performs no memory allocation. + * + * @param cc the Luffa-224 context (pointer to a + * sph_luffa224_context) + */ +void sph_luffa224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-224 context + * @param dst the destination buffer + */ +void sph_luffa224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-256 context. This process performs no memory allocation. + * + * @param cc the Luffa-256 context (pointer to a + * sph_luffa256_context) + */ +void sph_luffa256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-256 context + * @param dst the destination buffer + */ +void sph_luffa256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
+ * + * @param cc the Luffa-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-384 context. This process performs no memory allocation. + * + * @param cc the Luffa-384 context (pointer to a + * sph_luffa384_context) + */ +void sph_luffa384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-384 context + * @param dst the destination buffer + */ +void sph_luffa384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-512 context. This process performs no memory allocation. + * + * @param cc the Luffa-512 context (pointer to a + * sph_luffa512_context) + */ +void sph_luffa512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-512 context + * @param dst the destination buffer + */ +void sph_luffa512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
+ * + * @param cc the Luffa-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_shavite.h b/sph/sph_shavite.h new file mode 100644 index 0000000..0957e42 --- /dev/null +++ b/sph/sph_shavite.h @@ -0,0 +1,314 @@ +/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */ +/** + * SHAvite-3 interface. This code implements SHAvite-3 with the + * recommended parameters for SHA-3, with outputs of 224, 256, 384 and + * 512 bits. In the following, we call the function "SHAvite" (without + * the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit + * output". + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_shavite.h + * @author Thomas Pornin + */ + +#ifndef SPH_SHAVITE_H__ +#define SPH_SHAVITE_H__ + +#include +#include "sph_types.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/** + * Output size (in bits) for SHAvite-224. + */ +#define SPH_SIZE_shavite224 224 + +/** + * Output size (in bits) for SHAvite-256. + */ +#define SPH_SIZE_shavite256 256 + +/** + * Output size (in bits) for SHAvite-384. + */ +#define SPH_SIZE_shavite384 384 + +/** + * Output size (in bits) for SHAvite-512. + */ +#define SPH_SIZE_shavite512 512 + +/** + * This structure is a context for SHAvite-224 and SHAvite-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 h[8]; + sph_u32 count0, count1; +#endif +} sph_shavite_small_context; + +/** + * This structure is a context for SHAvite-224 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite224_context; + +/** + * This structure is a context for SHAvite-256 computations. 
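The "cloned by copying the context" remark in these headers enables a common-prefix optimization: absorb the shared prefix once, memcpy() the running state, then finish each branch separately. A sketch with Luffa-512 (the helper name is illustrative):

#include <stddef.h>
#include <string.h>
#include "sph/sph_luffa.h"

/* Hash prefix||a and prefix||b while absorbing the prefix only once. */
static void luffa512_fork(const void *prefix, size_t plen,
	const void *a, size_t alen, unsigned char da[64],
	const void *b, size_t blen, unsigned char db[64])
{
	sph_luffa512_context base, fork;

	sph_luffa512_init(&base);
	sph_luffa512(&base, prefix, plen);
	memcpy(&fork, &base, sizeof base);	/* clone the running computation */
	sph_luffa512(&base, a, alen);
	sph_luffa512_close(&base, da);
	sph_luffa512(&fork, b, blen);
	sph_luffa512_close(&fork, db);
}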
It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite256_context; + +/** + * This structure is a context for SHAvite-384 and SHAvite-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u32 h[16]; + sph_u32 count0, count1, count2, count3; +#endif +} sph_shavite_big_context; + +/** + * This structure is a context for SHAvite-384 computations. It is + * identical to the common sph_shavite_big_context. + */ +typedef sph_shavite_big_context sph_shavite384_context; + +/** + * This structure is a context for SHAvite-512 computations. It is + * identical to the common sph_shavite_big_context. + */ +typedef sph_shavite_big_context sph_shavite512_context; + +/** + * Initialize a SHAvite-224 context. This process performs no memory allocation. + * + * @param cc the SHAvite-224 context (pointer to a + * sph_shavite224_context) + */ +void sph_shavite224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-224 context + * @param dst the destination buffer + */ +void sph_shavite224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-256 context. This process performs no memory allocation. + * + * @param cc the SHAvite-256 context (pointer to a + * sph_shavite256_context) + */ +void sph_shavite256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite256(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized.
+ * + * @param cc the SHAvite-256 context + * @param dst the destination buffer + */ +void sph_shavite256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-384 context. This process performs no memory allocation. + * + * @param cc the SHAvite-384 context (pointer to a + * sph_shavite384_context) + */ +void sph_shavite384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-384 context + * @param dst the destination buffer + */ +void sph_shavite384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-512 context. This process performs no memory allocation. + * + * @param cc the SHAvite-512 context (pointer to a + * sph_shavite512_context) + */ +void sph_shavite512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite512(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-512 context + * @param dst the destination buffer + */ +void sph_shavite512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). 
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_simd.h b/sph/sph_simd.h new file mode 100644 index 0000000..92ee1e7 --- /dev/null +++ b/sph/sph_simd.h @@ -0,0 +1,309 @@ +/* $Id: sph_simd.h 154 2010-04-26 17:00:24Z tp $ */ +/** + * SIMD interface. SIMD is a family of functions which differ by + * their output size; this implementation defines SIMD for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_simd.h + * @author Thomas Pornin + */ + +#ifndef SPH_SIMD_H__ +#define SPH_SIMD_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for SIMD-224. + */ +#define SPH_SIZE_simd224 224 + +/** + * Output size (in bits) for SIMD-256. + */ +#define SPH_SIZE_simd256 256 + +/** + * Output size (in bits) for SIMD-384. + */ +#define SPH_SIZE_simd384 384 + +/** + * Output size (in bits) for SIMD-512. + */ +#define SPH_SIZE_simd512 512 + +/** + * This structure is a context for SIMD computations: it contains the + * intermediate values and some data from the last entered block. Once + * an SIMD computation has been performed, the context can be reused for + * another computation. This specific structure is used for SIMD-224 + * and SIMD-256. + * + * The contents of this structure are private. A running SIMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[16]; + sph_u32 count_low, count_high; +#endif +} sph_simd_small_context; + +/** + * This structure is a context for SIMD computations: it contains the + * intermediate values and some data from the last entered block. 
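The update functions accept any split of the input, including zero-length calls, and the digest depends only on the concatenated byte stream. A sketch feeding SHAvite-512 in fixed-size chunks (the helper name and chunking scheme are illustrative):

#include <stddef.h>
#include "sph/sph_shavite.h"

/* Chunked feeding yields the same digest as one update call.
 * `chunk` must be nonzero. */
static void shavite512_chunked(const unsigned char *data, size_t len,
	size_t chunk, unsigned char dst[64])
{
	sph_shavite512_context sc;

	sph_shavite512_init(&sc);
	sph_shavite512(&sc, data, 0);	/* zero-length call: a no-op */
	while (len > 0) {
		size_t n = len < chunk ? len : chunk;
		sph_shavite512(&sc, data, n);
		data += n;
		len -= n;
	}
	sph_shavite512_close(&sc, dst);
}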
Once + * an SIMD computation has been performed, the context can be reused for + * another computation. This specific structure is used for SIMD-384 + * and SIMD-512. + * + * The contents of this structure are private. A running SIMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[32]; + sph_u32 count_low, count_high; +#endif +} sph_simd_big_context; + +/** + * Type for a SIMD-224 context (identical to the common "small" context). + */ +typedef sph_simd_small_context sph_simd224_context; + +/** + * Type for a SIMD-256 context (identical to the common "small" context). + */ +typedef sph_simd_small_context sph_simd256_context; + +/** + * Type for a SIMD-384 context (identical to the common "big" context). + */ +typedef sph_simd_big_context sph_simd384_context; + +/** + * Type for a SIMD-512 context (identical to the common "big" context). + */ +typedef sph_simd_big_context sph_simd512_context; + +/** + * Initialize an SIMD-224 context. This process performs no memory allocation. + * + * @param cc the SIMD-224 context (pointer to a + * sph_simd224_context) + */ +void sph_simd224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-224 context + * @param dst the destination buffer + */ +void sph_simd224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SIMD-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-256 context. This process performs no memory allocation. + * + * @param cc the SIMD-256 context (pointer to a + * sph_simd256_context) + */ +void sph_simd256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd256(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the SIMD-256 context + * @param dst the destination buffer + */ +void sph_simd256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SIMD-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-384 context. This process performs no memory allocation. + * + * @param cc the SIMD-384 context (pointer to a + * sph_simd384_context) + */ +void sph_simd384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-384 context + * @param dst the destination buffer + */ +void sph_simd384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SIMD-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-512 context. This process performs no memory allocation. + * + * @param cc the SIMD-512 context (pointer to a + * sph_simd512_context) + */ +void sph_simd512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd512(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-512 context + * @param dst the destination buffer + */ +void sph_simd512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). 
The context is automatically reinitialized. + * + * @param cc the SIMD-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_skein.h b/sph/sph_skein.h new file mode 100644 index 0000000..8555984 --- /dev/null +++ b/sph/sph_skein.h @@ -0,0 +1,290 @@ +/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */ +/** + * Skein interface. The Skein specification defines three main + * functions, called Skein-256, Skein-512 and Skein-1024, which can be + * further parameterized with an output length. For the SHA-3 + * competition, Skein-512 is used for output sizes of 224, 256, 384 and + * 512 bits; this is what this code implements. Thus, we hereafter call + * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein + * specification defines as Skein-512-224, Skein-512-256, Skein-512-384 + * and Skein-512-512, respectively. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_skein.h + * @author Thomas Pornin + */ + +#ifndef SPH_SKEIN_H__ +#define SPH_SKEIN_H__ + +#include +#include "sph_types.h" + +#if SPH_64 + +/** + * Output size (in bits) for Skein-224. + */ +#define SPH_SIZE_skein224 224 + +/** + * Output size (in bits) for Skein-256. + */ +#define SPH_SIZE_skein256 256 + +/** + * Output size (in bits) for Skein-384. + */ +#define SPH_SIZE_skein384 384 + +/** + * Output size (in bits) for Skein-512. + */ +#define SPH_SIZE_skein512 512 + +/** + * This structure is a context for Skein computations (with a 384- or + * 512-bit output): it contains the intermediate values and some data + * from the last entered block. Once a Skein computation has been + * performed, the context can be reused for another computation. + * + * The contents of this structure are private. A running Skein computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). 
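The SPH_SIZE_* macros give output sizes in bits, so a _close() destination needs SPH_SIZE_*/8 bytes; sizing the buffer from the macro keeps the two in sync. A sketch with SIMD-384 (the helper name is illustrative):

#include <stddef.h>
#include "sph/sph_simd.h"

/* SPH_SIZE_simd384 is 384 bits, so the digest buffer is 48 bytes. */
static void simd384_digest(const void *data, size_t len,
	unsigned char dst[SPH_SIZE_simd384 / 8])
{
	sph_simd384_context ctx;

	sph_simd384_init(&ctx);
	sph_simd384(&ctx, data, len);
	sph_simd384_close(&ctx, dst);
}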
+ */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; + sph_u64 bcount; +#endif +} sph_skein_big_context; + +/** + * Type for a Skein-224 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein224_context; + +/** + * Type for a Skein-256 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein256_context; + +/** + * Type for a Skein-384 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein384_context; + +/** + * Type for a Skein-512 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein512_context; + +/** + * Initialize a Skein-224 context. This process performs no memory allocation. + * + * @param cc the Skein-224 context (pointer to a + * sph_skein224_context) + */ +void sph_skein224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-224 context + * @param dst the destination buffer + */ +void sph_skein224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Skein-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-256 context. This process performs no memory allocation. + * + * @param cc the Skein-256 context (pointer to a + * sph_skein256_context) + */ +void sph_skein256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-256 context + * @param dst the destination buffer + */ +void sph_skein256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). 
The context is automatically reinitialized. + * + * @param cc the Skein-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-384 context. This process performs no memory allocation. + * + * @param cc the Skein-384 context (pointer to a + * sph_skein384_context) + */ +void sph_skein384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-384 context + * @param dst the destination buffer + */ +void sph_skein384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Skein-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-512 context. This process performs no memory allocation. + * + * @param cc the Skein-512 context (pointer to a + * sph_skein512_context) + */ +void sph_skein512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-512 context + * @param dst the destination buffer + */ +void sph_skein512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Skein-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#endif diff --git a/sph_types.h b/sph/sph_types.h similarity index 100% rename from sph_types.h rename to sph/sph_types.h
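Headers like these are typically consumed by the miner as a chained construction, each stage hashing the previous stage's 64-byte digest. A purely illustrative sketch in the luffa -> cubehash -> shavite -> simd -> echo order used by X11-style chains; the actual per-algorithm ordering lives in the miner's own code, not in these headers:

#include <stddef.h>
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"

/* Chain five of the 512-bit functions; `out` doubles as the scratch buffer. */
static void chain512(const void *data, size_t len, unsigned char out[64])
{
	sph_luffa512_context lc;
	sph_cubehash512_context cc;
	sph_shavite512_context sh;
	sph_simd512_context sd;
	sph_echo512_context ec;

	sph_luffa512_init(&lc);
	sph_luffa512(&lc, data, len);
	sph_luffa512_close(&lc, out);

	sph_cubehash512_init(&cc);
	sph_cubehash512(&cc, out, 64);
	sph_cubehash512_close(&cc, out);

	sph_shavite512_init(&sh);
	sph_shavite512(&sh, out, 64);
	sph_shavite512_close(&sh, out);

	sph_simd512_init(&sd);
	sph_simd512(&sd, out, 64);
	sph_simd512_close(&sd, out);

	sph_echo512_init(&ec);
	sph_echo512(&ec, out, 64);
	sph_echo512_close(&ec, out);
}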