diff --git a/sph/Makefile.am b/sph/Makefile.am
new file mode 100644
index 00000000..3981e5da
--- /dev/null
+++ b/sph/Makefile.am
@@ -0,0 +1,3 @@
+noinst_LIBRARIES	= libsph.a
+# NOTE(review): aes_helper.c is #included by other sources, not compiled alone, so it is not listed; confirm it and the sph_*.h headers get distributed.
+libsph_a_SOURCES	= bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c
diff --git a/sph/aes_helper.c b/sph/aes_helper.c
new file mode 100644
index 00000000..872c0ab6
--- /dev/null
+++ b/sph/aes_helper.c
@@ -0,0 +1,386 @@
+/* $Id: aes_helper.c 220 2010-06-09 09:21:50Z tp $ */
+/*
+ * AES tables. This file is not meant to be compiled by itself; it
+ * is included by some hash function implementations. It contains
+ * the precomputed tables and helper macros for evaluating an AES
+ * round, optionally with a final XOR with a subkey.
+ *
+ * By default, this file defines the tables and macros for little-endian
+ * processing (i.e. it is assumed that the input bytes have been read
+ * from memory and assembled with the little-endian convention). If
+ * the 'AES_BIG_ENDIAN' macro is defined (to a non-zero integer value)
+ * when this file is included, then the tables and macros for big-endian
+ * processing are defined instead. The big-endian tables and macros have
+ * names distinct from the little-endian tables and macros, hence it is
+ * possible to have both simultaneously, by including this file twice
+ * (with and without the AES_BIG_ENDIAN macro).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include "sph_types.h"
+
+#if AES_BIG_ENDIAN
+/* Big-endian variant: AESx() reverses the byte order of every 32-bit table constant written in this file. */
+#define AESx(x)   ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \
+                  | ((SPH_C32(x) >>  8) & SPH_C32(0x0000FF00)) \
+                  | ((SPH_C32(x) <<  8) & SPH_C32(0x00FF0000)) \
+                  | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+/* The AES0..AES3 table definitions further down take the *_BE names, so both variants can coexist. */
+#define AES0      AES0_BE
+#define AES1      AES1_BE
+#define AES2      AES2_BE
+#define AES3      AES3_BE
+/* One table-driven AES round: X0..X3 in, K0..K3 XORed in, Y0..Y3 out. AES0 takes the top byte, AES3 the low byte. */
+#define AES_ROUND_BE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3)   do { \
+		(Y0) = AES0[((X0) >> 24) & 0xFF] \
+			^ AES1[((X1) >> 16) & 0xFF] \
+			^ AES2[((X2) >> 8) & 0xFF] \
+			^ AES3[(X3) & 0xFF] ^ (K0); \
+		(Y1) = AES0[((X1) >> 24) & 0xFF] \
+			^ AES1[((X2) >> 16) & 0xFF] \
+			^ AES2[((X3) >> 8) & 0xFF] \
+			^ AES3[(X0) & 0xFF] ^ (K1); \
+		(Y2) = AES0[((X2) >> 24) & 0xFF] \
+			^ AES1[((X3) >> 16) & 0xFF] \
+			^ AES2[((X0) >> 8) & 0xFF] \
+			^ AES3[(X1) & 0xFF] ^ (K2); \
+		(Y3) = AES0[((X3) >> 24) & 0xFF] \
+			^ AES1[((X0) >> 16) & 0xFF] \
+			^ AES2[((X1) >> 8) & 0xFF] \
+			^ AES3[(X2) & 0xFF] ^ (K3); \
+	} while (0)  /* each X is read after earlier Y stores: Y* must not alias X*, and arguments are evaluated repeatedly */
+/* Same round with an all-zero subkey (no key XOR). */
+#define AES_ROUND_NOKEY_BE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
+	AES_ROUND_BE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
+
+#else
+/* Little-endian variant (the default): table constants are used exactly as written, under the *_LE names. */
+#define AESx(x)   SPH_C32(x)
+#define AES0      AES0_LE
+#define AES1      AES1_LE
+#define AES2      AES2_LE
+#define AES3      AES3_LE
+/* One table-driven AES round: X0..X3 in, K0..K3 XORed in, Y0..Y3 out. AES0 takes the low byte, AES3 the top byte. */
+#define AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3)   do { \
+		(Y0) = AES0[(X0) & 0xFF] \
+			^ AES1[((X1) >> 8) & 0xFF] \
+			^ AES2[((X2) >> 16) & 0xFF] \
+			^ AES3[((X3) >> 24) & 0xFF] ^ (K0); \
+		(Y1) = AES0[(X1) & 0xFF] \
+			^ AES1[((X2) >> 8) & 0xFF] \
+			^ AES2[((X3) >> 16) & 0xFF] \
+			^ AES3[((X0) >> 24) & 0xFF] ^ (K1); \
+		(Y2) = AES0[(X2) & 0xFF] \
+			^ AES1[((X3) >> 8) & 0xFF] \
+			^ AES2[((X0) >> 16) & 0xFF] \
+			^ AES3[((X1) >> 24) & 0xFF] ^ (K2); \
+		(Y3) = AES0[(X3) & 0xFF] \
+			^ AES1[((X0) >> 8) & 0xFF] \
+			^ AES2[((X1) >> 16) & 0xFF] \
+			^ AES3[((X2) >> 24) & 0xFF] ^ (K3); \
+	} while (0)  /* each X is read after earlier Y stores: Y* must not alias X*, and arguments are evaluated repeatedly */
+/* Same round with an all-zero subkey (no key XOR). */
+#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
+	AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
+
+#endif
+
+/*
+ * The AES*[] tables allow us to perform a fast evaluation of an AES
+ * round; table AESi[] combines SubBytes for a byte at row i, and
+ * MixColumns for the column where that byte goes after ShiftRows.
+ */
+
+static const sph_u32 AES0[256] = {  /* one word per input byte; entry 0 = A5|63|63|C6, presumably the 3x,1x,1x,2x multiples of S-box output 0x63 */
+	AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
+	AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
+	AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
+	AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC),
+	AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA),
+	AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB),
+	AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45),
+	AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B),
+	AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C),
+	AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83),
+	AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9),
+	AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A),
+	AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D),
+	AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F),
+	AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF),
+	AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA),
+	AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34),
+	AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B),
+	AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D),
+	AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413),
+	AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1),
+	AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6),
+	AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972),
+	AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85),
+	AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED),
+	AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511),
+	AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE),
+	AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B),
+	AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05),
+	AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1),
+	AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142),
+	AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF),
+	AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3),
+	AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E),
+	AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A),
+	AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6),
+	AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3),
+	AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B),
+	AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428),
+	AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD),
+	AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14),
+	AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8),
+	AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4),
+	AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2),
+	AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA),
+	AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949),
+	AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF),
+	AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810),
+	AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C),
+	AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697),
+	AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E),
+	AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F),
+	AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC),
+	AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C),
+	AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969),
+	AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27),
+	AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122),
+	AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433),
+	AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9),
+	AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5),
+	AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A),
+	AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0),
+	AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E),
+	AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
+};
+
+static const sph_u32 AES1[256] = {  /* as written, each constant is the matching AES0 constant rotated left by 8 bits (A56363C6 -> 6363C6A5) */
+	AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
+	AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
+	AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
+	AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A),
+	AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87),
+	AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B),
+	AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA),
+	AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B),
+	AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A),
+	AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F),
+	AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908),
+	AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F),
+	AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E),
+	AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5),
+	AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D),
+	AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F),
+	AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E),
+	AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB),
+	AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE),
+	AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397),
+	AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C),
+	AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED),
+	AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B),
+	AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A),
+	AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16),
+	AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194),
+	AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81),
+	AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3),
+	AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A),
+	AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104),
+	AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263),
+	AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D),
+	AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F),
+	AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39),
+	AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47),
+	AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695),
+	AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F),
+	AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83),
+	AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C),
+	AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76),
+	AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E),
+	AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4),
+	AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6),
+	AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B),
+	AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7),
+	AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0),
+	AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25),
+	AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018),
+	AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72),
+	AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751),
+	AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21),
+	AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85),
+	AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA),
+	AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12),
+	AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0),
+	AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9),
+	AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233),
+	AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7),
+	AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920),
+	AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A),
+	AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17),
+	AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8),
+	AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11),
+	AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
+};
+
+static const sph_u32 AES2[256] = {  /* as written, each constant is the matching AES0 constant rotated by 16 bits (A56363C6 -> 63C6A563) */
+	AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
+	AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
+	AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
+	AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76),
+	AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D),
+	AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0),
+	AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF),
+	AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0),
+	AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26),
+	AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC),
+	AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1),
+	AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15),
+	AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3),
+	AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A),
+	AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2),
+	AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75),
+	AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A),
+	AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0),
+	AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3),
+	AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784),
+	AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED),
+	AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B),
+	AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39),
+	AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF),
+	AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB),
+	AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485),
+	AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F),
+	AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8),
+	AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F),
+	AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5),
+	AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321),
+	AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2),
+	AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC),
+	AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917),
+	AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D),
+	AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573),
+	AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC),
+	AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388),
+	AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14),
+	AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB),
+	AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A),
+	AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C),
+	AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662),
+	AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79),
+	AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D),
+	AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9),
+	AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA),
+	AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808),
+	AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E),
+	AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6),
+	AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F),
+	AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A),
+	AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66),
+	AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E),
+	AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9),
+	AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E),
+	AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311),
+	AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794),
+	AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9),
+	AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF),
+	AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D),
+	AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868),
+	AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F),
+	AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
+};
+
+static const sph_u32 AES3[256] = {  /* as written, each constant is the matching AES0 constant rotated left by 24 bits (A56363C6 -> C6A56363) */
+	AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
+	AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
+	AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
+	AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676),
+	AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D),
+	AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0),
+	AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF),
+	AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0),
+	AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626),
+	AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC),
+	AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1),
+	AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515),
+	AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3),
+	AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A),
+	AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2),
+	AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575),
+	AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A),
+	AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0),
+	AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3),
+	AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484),
+	AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED),
+	AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B),
+	AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939),
+	AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF),
+	AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB),
+	AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585),
+	AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F),
+	AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8),
+	AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F),
+	AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5),
+	AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121),
+	AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2),
+	AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC),
+	AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717),
+	AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D),
+	AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373),
+	AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC),
+	AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888),
+	AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414),
+	AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB),
+	AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A),
+	AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C),
+	AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262),
+	AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979),
+	AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D),
+	AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9),
+	AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA),
+	AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808),
+	AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E),
+	AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6),
+	AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F),
+	AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A),
+	AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666),
+	AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E),
+	AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9),
+	AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E),
+	AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111),
+	AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494),
+	AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9),
+	AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF),
+	AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D),
+	AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868),
+	AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F),
+	AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
+};
diff --git a/sph/blake.c b/sph/blake.c
new file mode 100644
index 00000000..672a6a2f
--- /dev/null
+++ b/sph/blake.c
@@ -0,0 +1,1112 @@
+/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
+/*
+ * BLAKE implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_blake.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_SMALL_FOOTPRINT_BLAKE   1  /* follow the global setting unless a BLAKE-specific value was predefined */
+#endif
+/* SPH_COMPACT_BLAKE_32 gates the sigma[] and CS[] lookup tables defined further down in this file. */
+#if SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_COMPACT_BLAKE_32   1
+#endif
+/* SPH_COMPACT_BLAKE_64 gates sigma[] and CB[]; also set when SPH_64_TRUE is zero (defined elsewhere - TODO confirm meaning). */
+#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
+#define SPH_COMPACT_BLAKE_64   1
+#endif
+
+#ifdef _MSC_VER  /* MSVC only: silence C4146, "unary minus operator applied to unsigned type" */
+#pragma warning (disable: 4146)
+#endif
+
+static const sph_u32 IV224[8] = {  /* eight 32-bit words; the name suggests the BLAKE-224 initial state (consumer not in view) */
+	SPH_C32(0xC1059ED8), SPH_C32(0x367CD507),
+	SPH_C32(0x3070DD17), SPH_C32(0xF70E5939),
+	SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
+	SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
+};
+
+static const sph_u32 IV256[8] = {  /* eight 32-bit words; the name suggests the BLAKE-256 initial state (consumer not in view) */
+	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
+	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
+	SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+#if SPH_64
+/* 64-bit tables, compiled only when SPH_64 is non-zero. IV384: presumably the BLAKE-384 initial state (consumer not in view). */
+static const sph_u64 IV384[8] = {
+	SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
+	SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
+	SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
+	SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
+};
+/* IV512: presumably the BLAKE-512 initial state; the top 32 bits of each word equal the matching IV256 word. */
+static const sph_u64 IV512[8] = {
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
+#endif
+
+#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
+/* Index permutations, one row of 16 per round (presumably); rows 10..15 repeat rows 0..5. Built only for compact variants. */
+static const unsigned sigma[16][16] = {
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
+};
+
+/* Reference copy of the ten distinct rows of sigma[]; the Z<r><i> macros below encode the same rows as hex digits:
+  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+ 14 10  4  8  9 15 13  6  1 12  0  2 11  7  5  3
+ 11  8 12  0  5  2 15 13 10 14  3  6  7  1  9  4
+  7  9  3  1 13 12 11 14  2  6  5 10  4  0 15  8
+  9  0  5  7  2  4 10 15 14  1 11 12  6  8  3 13
+  2 12  6 10  0 11  8  3  4 13  7  5 15 14  1  9
+ 12  5  1 15 14 13  4 10  0  7  6  3  9  2  8 11
+ 13 11  7 14 12  1  3  9  5  0 15  4  8  6  2 10
+  6 15 14  9 11  3  0  8 12  2 13  7  1  4 10  5
+ 10  2  8  4  7  6  1  5 15 11  9 14  3 12 13  0
+*/
+#endif
+
+#define Z00   0  /* Z0i: permutation row 0 (identity); each value is one hex digit that Mx/CSx/CBx paste onto M/CS/CB */
+#define Z01   1
+#define Z02   2
+#define Z03   3
+#define Z04   4
+#define Z05   5
+#define Z06   6
+#define Z07   7
+#define Z08   8
+#define Z09   9
+#define Z0A   A
+#define Z0B   B
+#define Z0C   C
+#define Z0D   D
+#define Z0E   E
+#define Z0F   F
+
+#define Z10   E  /* Z1i: permutation row 1 (14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3), one hex digit per entry */
+#define Z11   A
+#define Z12   4
+#define Z13   8
+#define Z14   9
+#define Z15   F
+#define Z16   D
+#define Z17   6
+#define Z18   1
+#define Z19   C
+#define Z1A   0
+#define Z1B   2
+#define Z1C   B
+#define Z1D   7
+#define Z1E   5
+#define Z1F   3
+
+#define Z20   B  /* Z2i: permutation row 2 (11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4), one hex digit per entry */
+#define Z21   8
+#define Z22   C
+#define Z23   0
+#define Z24   5
+#define Z25   2
+#define Z26   F
+#define Z27   D
+#define Z28   A
+#define Z29   E
+#define Z2A   3
+#define Z2B   6
+#define Z2C   7
+#define Z2D   1
+#define Z2E   9
+#define Z2F   4
+
+#define Z30   7  /* Z3i: permutation row 3 (7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8), one hex digit per entry */
+#define Z31   9
+#define Z32   3
+#define Z33   1
+#define Z34   D
+#define Z35   C
+#define Z36   B
+#define Z37   E
+#define Z38   2
+#define Z39   6
+#define Z3A   5
+#define Z3B   A
+#define Z3C   4
+#define Z3D   0
+#define Z3E   F
+#define Z3F   8
+
+#define Z40   9  /* Z4i: permutation row 4 (9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13), one hex digit per entry */
+#define Z41   0
+#define Z42   5
+#define Z43   7
+#define Z44   2
+#define Z45   4
+#define Z46   A
+#define Z47   F
+#define Z48   E
+#define Z49   1
+#define Z4A   B
+#define Z4B   C
+#define Z4C   6
+#define Z4D   8
+#define Z4E   3
+#define Z4F   D
+
+#define Z50   2  /* Z5i: permutation row 5 (2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9), one hex digit per entry */
+#define Z51   C
+#define Z52   6
+#define Z53   A
+#define Z54   0
+#define Z55   B
+#define Z56   8
+#define Z57   3
+#define Z58   4
+#define Z59   D
+#define Z5A   7
+#define Z5B   5
+#define Z5C   F
+#define Z5D   E
+#define Z5E   1
+#define Z5F   9
+
+#define Z60   C  /* Z6i: permutation row 6 (12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11), one hex digit per entry */
+#define Z61   5
+#define Z62   1
+#define Z63   F
+#define Z64   E
+#define Z65   D
+#define Z66   4
+#define Z67   A
+#define Z68   0
+#define Z69   7
+#define Z6A   6
+#define Z6B   3
+#define Z6C   9
+#define Z6D   2
+#define Z6E   8
+#define Z6F   B
+
+#define Z70   D  /* Z7i: permutation row 7 (13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10), one hex digit per entry */
+#define Z71   B
+#define Z72   7
+#define Z73   E
+#define Z74   C
+#define Z75   1
+#define Z76   3
+#define Z77   9
+#define Z78   5
+#define Z79   0
+#define Z7A   F
+#define Z7B   4
+#define Z7C   8
+#define Z7D   6
+#define Z7E   2
+#define Z7F   A
+
+#define Z80   6  /* Z8i: permutation row 8 (6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5), one hex digit per entry */
+#define Z81   F
+#define Z82   E
+#define Z83   9
+#define Z84   B
+#define Z85   3
+#define Z86   0
+#define Z87   8
+#define Z88   C
+#define Z89   2
+#define Z8A   D
+#define Z8B   7
+#define Z8C   1
+#define Z8D   4
+#define Z8E   A
+#define Z8F   5
+
+#define Z90   A  /* Z9i: permutation row 9 (10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0), one hex digit per entry */
+#define Z91   2
+#define Z92   8
+#define Z93   4
+#define Z94   7
+#define Z95   6
+#define Z96   1
+#define Z97   5
+#define Z98   F
+#define Z99   B
+#define Z9A   9
+#define Z9B   E
+#define Z9C   3
+#define Z9D   C
+#define Z9E   D
+#define Z9F   0
+
+#define Mx(r, i)    Mx_(Z ## r ## i)  /* paste Z<r><i>, e.g. Mx(1, 0) -> Mx_(Z10) */
+#define Mx_(n)      Mx__(n)  /* extra level so Z10 is macro-expanded to its digit (E) before pasting */
+#define Mx__(n)     M ## n  /* -> ME; names M0..MF must exist where Mx is used (not defined in view) */
+
+#define CSx(r, i)   CSx_(Z ## r ## i)  /* paste Z<r><i>, e.g. CSx(1, 0) -> CSx_(Z10) */
+#define CSx_(n)     CSx__(n)  /* extra level so Z10 is macro-expanded to its digit (E) before pasting */
+#define CSx__(n)    CS ## n  /* -> CSE, one of the CS0..CSF constants defined in this file */
+
+#define CS0   SPH_C32(0x243F6A88)  /* CS0..CSF: sixteen 32-bit constants picked by CSx(); same values as the CS[] array */
+#define CS1   SPH_C32(0x85A308D3)
+#define CS2   SPH_C32(0x13198A2E)
+#define CS3   SPH_C32(0x03707344)
+#define CS4   SPH_C32(0xA4093822)
+#define CS5   SPH_C32(0x299F31D0)
+#define CS6   SPH_C32(0x082EFA98)
+#define CS7   SPH_C32(0xEC4E6C89)
+#define CS8   SPH_C32(0x452821E6)
+#define CS9   SPH_C32(0x38D01377)
+#define CSA   SPH_C32(0xBE5466CF)
+#define CSB   SPH_C32(0x34E90C6C)
+#define CSC   SPH_C32(0xC0AC29B7)
+#define CSD   SPH_C32(0xC97C50DD)
+#define CSE   SPH_C32(0x3F84D5B5)
+#define CSF   SPH_C32(0xB5470917)
+/* CS[]: the same sixteen values as CS0..CSF in array form; compiled only when SPH_COMPACT_BLAKE_32 is non-zero (indexed by sigma in ROUND_S). */
+#if SPH_COMPACT_BLAKE_32
+
+static const sph_u32 CS[16] = {
+	SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
+	SPH_C32(0x13198A2E), SPH_C32(0x03707344),
+	SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
+	SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89),
+	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
+	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
+	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
+	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
+};
+
+#endif
+
+#if SPH_64
+/* CBx(r, i): 64-bit constant for round r, slot i -- paste Z<r><i>, let it expand to a hex digit, then paste CB<digit>. */
+#define CBx(rnd, pos)   CBx_(Z ## rnd ## pos)
+#define CBx_(zname)     CBx__(zname)
+#define CBx__(digit)    CB ## digit
+/* CB0..CBF: sixteen 64-bit constants named by hex digit for CBx(); CB0..CB7 are the CS0..CSF words taken in pairs. */
+#define CB0   SPH_C64(0x243F6A8885A308D3)
+#define CB1   SPH_C64(0x13198A2E03707344)
+#define CB2   SPH_C64(0xA4093822299F31D0)
+#define CB3   SPH_C64(0x082EFA98EC4E6C89)
+#define CB4   SPH_C64(0x452821E638D01377)
+#define CB5   SPH_C64(0xBE5466CF34E90C6C)
+#define CB6   SPH_C64(0xC0AC29B7C97C50DD)
+#define CB7   SPH_C64(0x3F84D5B5B5470917)
+#define CB8   SPH_C64(0x9216D5D98979FB1B)
+#define CB9   SPH_C64(0xD1310BA698DFB5AC)
+#define CBA   SPH_C64(0x2FFD72DBD01ADFB7)
+#define CBB   SPH_C64(0xB8E1AFED6A267E96)
+#define CBC   SPH_C64(0xBA7C9045F12C7F99)
+#define CBD   SPH_C64(0x24A19947B3916CF7)
+#define CBE   SPH_C64(0x0801F2E2858EFC16)
+#define CBF   SPH_C64(0x636920D871574E69)
+/* CB[]: array form of CB0..CBF; compiled only when SPH_COMPACT_BLAKE_64 is non-zero (indexed by sigma in ROUND_B). */
+#if SPH_COMPACT_BLAKE_64
+
+static const sph_u64 CB[16] = {
+	SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
+	SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
+	SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
+	SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
+	SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
+	SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
+	SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
+	SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
+};
+
+#endif
+
+#endif
+/* GS: 32-bit mixing step, used eight times per ROUND_S; updates state words wa..wd in place from message words x0/x1 and constants k0/k1. */
+#define GS(x0, x1, k0, k1, wa, wb, wc, wd)   do { \
+		wa = SPH_T32(wa + wb + (x0 ^ k1)); /* first half: x0 is paired with k1 */ \
+		wd = SPH_ROTR32(wd ^ wa, 16); \
+		wc = SPH_T32(wc + wd); \
+		wb = SPH_ROTR32(wb ^ wc, 12); \
+		wa = SPH_T32(wa + wb + (x1 ^ k0)); /* second half: x1 is paired with k0 */ \
+		wd = SPH_ROTR32(wd ^ wa, 8); \
+		wc = SPH_T32(wc + wd); \
+		wb = SPH_ROTR32(wb ^ wc, 7); \
+	} while (0)
+/* ROUND_S(r): one 32-bit round = eight GS calls; the first four use (Vi, Vi+4, Vi+8, Vi+12), the last four the shifted groupings. Needs M*/V* locals in scope. */
+#if SPH_COMPACT_BLAKE_32
+
+#define ROUND_S(r)   do { /* table-driven: r may be a run-time value */ \
+		GS(M[sigma[r][0x0]], M[sigma[r][0x1]], \
+			CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
+		GS(M[sigma[r][0x2]], M[sigma[r][0x3]], \
+			CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \
+		GS(M[sigma[r][0x4]], M[sigma[r][0x5]], \
+			CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \
+		GS(M[sigma[r][0x6]], M[sigma[r][0x7]], \
+			CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \
+		GS(M[sigma[r][0x8]], M[sigma[r][0x9]], \
+			CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \
+		GS(M[sigma[r][0xA]], M[sigma[r][0xB]], \
+			CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \
+		GS(M[sigma[r][0xC]], M[sigma[r][0xD]], \
+			CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \
+		GS(M[sigma[r][0xE]], M[sigma[r][0xF]], \
+			CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
+	} while (0)
+
+#else
+
+#define ROUND_S(r)   do { /* unrolled: r must be a literal digit, it is pasted into Z<r><i> */ \
+		GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
+		GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+		GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+		GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+		GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
+		GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+		GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+		GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+	} while (0)
+
+#endif
+
+#if SPH_64
+/* GB: 64-bit mixing step, used eight times per ROUND_B; same shape as GS with rotation counts 32, 25, 16 and 11. */
+#define GB(x0, x1, k0, k1, wa, wb, wc, wd)   do { \
+		wa = SPH_T64(wa + wb + (x0 ^ k1)); /* first half: x0 is paired with k1 */ \
+		wd = SPH_ROTR64(wd ^ wa, 32); \
+		wc = SPH_T64(wc + wd); \
+		wb = SPH_ROTR64(wb ^ wc, 25); \
+		wa = SPH_T64(wa + wb + (x1 ^ k0)); /* second half: x1 is paired with k0 */ \
+		wd = SPH_ROTR64(wd ^ wa, 16); \
+		wc = SPH_T64(wc + wd); \
+		wb = SPH_ROTR64(wb ^ wc, 11); \
+	} while (0)
+/* ROUND_B(r): one 64-bit round = eight GB calls over the same V-word groupings as ROUND_S. Needs M*/V* locals in scope. */
+#if SPH_COMPACT_BLAKE_64
+
+#define ROUND_B(r)   do { /* table-driven: r may be a run-time value */ \
+		GB(M[sigma[r][0x0]], M[sigma[r][0x1]], \
+			CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
+		GB(M[sigma[r][0x2]], M[sigma[r][0x3]], \
+			CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
+		GB(M[sigma[r][0x4]], M[sigma[r][0x5]], \
+			CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
+		GB(M[sigma[r][0x6]], M[sigma[r][0x7]], \
+			CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
+		GB(M[sigma[r][0x8]], M[sigma[r][0x9]], \
+			CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
+		GB(M[sigma[r][0xA]], M[sigma[r][0xB]], \
+			CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
+		GB(M[sigma[r][0xC]], M[sigma[r][0xD]], \
+			CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
+		GB(M[sigma[r][0xE]], M[sigma[r][0xF]], \
+			CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
+	} while (0)
+
+#else
+
+#define ROUND_B(r)   do { /* unrolled: r must be a literal digit, it is pasted into Z<r><i> */ \
+		GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
+		GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
+		GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
+		GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
+		GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
+		GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
+		GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
+		GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
+	} while (0)
+
+#endif
+
+#endif
+/* DECL_STATE32: declares the local working copies of the context words (H0..H7, S0..S3, T0, T1); it carries its own ';'. */
+#define DECL_STATE32 \
+	sph_u32 T0, T1, S0, S1, S2, S3; \
+	sph_u32 H0, H1, H2, H3, H4, H5, H6, H7;
+/* READ_STATE32(ctx): load T0, T1, S[0..3] and H[0..7] from *ctx into the locals declared by DECL_STATE32. */
+#define READ_STATE32(ctx)   do { \
+		T0 = (ctx)->T0; \
+		T1 = (ctx)->T1; \
+		S0 = (ctx)->S[0]; \
+		S1 = (ctx)->S[1]; \
+		S2 = (ctx)->S[2]; \
+		S3 = (ctx)->S[3]; \
+		H0 = (ctx)->H[0]; \
+		H1 = (ctx)->H[1]; \
+		H2 = (ctx)->H[2]; \
+		H3 = (ctx)->H[3]; \
+		H4 = (ctx)->H[4]; \
+		H5 = (ctx)->H[5]; \
+		H6 = (ctx)->H[6]; \
+		H7 = (ctx)->H[7]; \
+	} while (0)
+/* WRITE_STATE32(ctx): store the DECL_STATE32 locals (T0, T1, S0..S3, H0..H7) back into *ctx. */
+#define WRITE_STATE32(ctx)   do { \
+		(ctx)->T0 = T0; \
+		(ctx)->T1 = T1; \
+		(ctx)->S[0] = S0; \
+		(ctx)->S[1] = S1; \
+		(ctx)->S[2] = S2; \
+		(ctx)->S[3] = S3; \
+		(ctx)->H[0] = H0; \
+		(ctx)->H[1] = H1; \
+		(ctx)->H[2] = H2; \
+		(ctx)->H[3] = H3; \
+		(ctx)->H[4] = H4; \
+		(ctx)->H[5] = H5; \
+		(ctx)->H[6] = H6; \
+		(ctx)->H[7] = H7; \
+	} while (0)
+/* COMPRESS32: one compression of the 64-byte block at `buf`; expects `buf` and the DECL_STATE32 locals in scope and updates H0..H7 only. */
+#if SPH_COMPACT_BLAKE_32
+
+#define COMPRESS32   do { /* compact variant: message words in an array, rounds in a loop */ \
+		sph_u32 M[16]; \
+		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
+		unsigned r; \
+		V0 = H0; \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CS0; /* V8..VB: salt words XORed with constants */ \
+		V9 = S1 ^ CS1; \
+		VA = S2 ^ CS2; \
+		VB = S3 ^ CS3; \
+		VC = T0 ^ CS4; /* VC..VF: T0 is used twice, then T1 twice */ \
+		VD = T0 ^ CS5; \
+		VE = T1 ^ CS6; \
+		VF = T1 ^ CS7; \
+		M[0x0] = sph_dec32be_aligned(buf +  0); \
+		M[0x1] = sph_dec32be_aligned(buf +  4); \
+		M[0x2] = sph_dec32be_aligned(buf +  8); \
+		M[0x3] = sph_dec32be_aligned(buf + 12); \
+		M[0x4] = sph_dec32be_aligned(buf + 16); \
+		M[0x5] = sph_dec32be_aligned(buf + 20); \
+		M[0x6] = sph_dec32be_aligned(buf + 24); \
+		M[0x7] = sph_dec32be_aligned(buf + 28); \
+		M[0x8] = sph_dec32be_aligned(buf + 32); \
+		M[0x9] = sph_dec32be_aligned(buf + 36); \
+		M[0xA] = sph_dec32be_aligned(buf + 40); \
+		M[0xB] = sph_dec32be_aligned(buf + 44); \
+		M[0xC] = sph_dec32be_aligned(buf + 48); \
+		M[0xD] = sph_dec32be_aligned(buf + 52); \
+		M[0xE] = sph_dec32be_aligned(buf + 56); \
+		M[0xF] = sph_dec32be_aligned(buf + 60); \
+		for (r = 0; r < 14; r ++) /* 14 rounds; sigma[r] supplies the word order */ \
+			ROUND_S(r); \
+		H0 ^= S0 ^ V0 ^ V8; /* fold both halves of V, plus the salt, back into H */ \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#else
+
+#define COMPRESS32   do { /* unrolled variant: message words in scalars M0..MF */ \
+		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
+		sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
+		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
+		V0 = H0; \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CS0; /* V8..VB: salt words XORed with constants */ \
+		V9 = S1 ^ CS1; \
+		VA = S2 ^ CS2; \
+		VB = S3 ^ CS3; \
+		VC = T0 ^ CS4; /* VC..VF: T0 is used twice, then T1 twice */ \
+		VD = T0 ^ CS5; \
+		VE = T1 ^ CS6; \
+		VF = T1 ^ CS7; \
+		M0 = sph_dec32be_aligned(buf +  0); \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		M8 = sph_dec32be_aligned(buf + 32); \
+		M9 = sph_dec32be_aligned(buf + 36); \
+		MA = sph_dec32be_aligned(buf + 40); \
+		MB = sph_dec32be_aligned(buf + 44); \
+		MC = sph_dec32be_aligned(buf + 48); \
+		MD = sph_dec32be_aligned(buf + 52); \
+		ME = sph_dec32be_aligned(buf + 56); \
+		MF = sph_dec32be_aligned(buf + 60); \
+		ROUND_S(0); \
+		ROUND_S(1); \
+		ROUND_S(2); \
+		ROUND_S(3); \
+		ROUND_S(4); \
+		ROUND_S(5); \
+		ROUND_S(6); \
+		ROUND_S(7); \
+		ROUND_S(8); \
+		ROUND_S(9); \
+		ROUND_S(0); /* rounds 10..13 reuse permutation rows 0..3 */ \
+		ROUND_S(1); \
+		ROUND_S(2); \
+		ROUND_S(3); \
+		H0 ^= S0 ^ V0 ^ V8; /* fold both halves of V, plus the salt, back into H */ \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#endif
+
+#if SPH_64
+/* DECL_STATE64: declares the 64-bit local working copies of the context words (H0..H7, S0..S3, T0, T1); it carries its own ';'. */
+#define DECL_STATE64 \
+	sph_u64 T0, T1, S0, S1, S2, S3; \
+	sph_u64 H0, H1, H2, H3, H4, H5, H6, H7;
+/* READ_STATE64(ctx): load T0, T1, S[0..3] and H[0..7] from *ctx into the locals declared by DECL_STATE64. */
+#define READ_STATE64(ctx)   do { \
+		T0 = (ctx)->T0; \
+		T1 = (ctx)->T1; \
+		S0 = (ctx)->S[0]; \
+		S1 = (ctx)->S[1]; \
+		S2 = (ctx)->S[2]; \
+		S3 = (ctx)->S[3]; \
+		H0 = (ctx)->H[0]; \
+		H1 = (ctx)->H[1]; \
+		H2 = (ctx)->H[2]; \
+		H3 = (ctx)->H[3]; \
+		H4 = (ctx)->H[4]; \
+		H5 = (ctx)->H[5]; \
+		H6 = (ctx)->H[6]; \
+		H7 = (ctx)->H[7]; \
+	} while (0)
+/* WRITE_STATE64(ctx): store the DECL_STATE64 locals (T0, T1, S0..S3, H0..H7) back into *ctx. */
+#define WRITE_STATE64(ctx)   do { \
+		(ctx)->T0 = T0; \
+		(ctx)->T1 = T1; \
+		(ctx)->S[0] = S0; \
+		(ctx)->S[1] = S1; \
+		(ctx)->S[2] = S2; \
+		(ctx)->S[3] = S3; \
+		(ctx)->H[0] = H0; \
+		(ctx)->H[1] = H1; \
+		(ctx)->H[2] = H2; \
+		(ctx)->H[3] = H3; \
+		(ctx)->H[4] = H4; \
+		(ctx)->H[5] = H5; \
+		(ctx)->H[6] = H6; \
+		(ctx)->H[7] = H7; \
+	} while (0)
+/* COMPRESS64: one compression of the 128-byte block at `buf`; expects `buf` and the DECL_STATE64 locals in scope and updates H0..H7 only. */
+#if SPH_COMPACT_BLAKE_64
+
+#define COMPRESS64   do { /* compact variant: message words in an array, rounds in a loop */ \
+		sph_u64 M[16]; \
+		sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
+		unsigned r; \
+		V0 = H0; \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CB0; /* V8..VB: salt words XORed with constants */ \
+		V9 = S1 ^ CB1; \
+		VA = S2 ^ CB2; \
+		VB = S3 ^ CB3; \
+		VC = T0 ^ CB4; /* VC..VF: T0 is used twice, then T1 twice */ \
+		VD = T0 ^ CB5; \
+		VE = T1 ^ CB6; \
+		VF = T1 ^ CB7; \
+		M[0x0] = sph_dec64be_aligned(buf +   0); \
+		M[0x1] = sph_dec64be_aligned(buf +   8); \
+		M[0x2] = sph_dec64be_aligned(buf +  16); \
+		M[0x3] = sph_dec64be_aligned(buf +  24); \
+		M[0x4] = sph_dec64be_aligned(buf +  32); \
+		M[0x5] = sph_dec64be_aligned(buf +  40); \
+		M[0x6] = sph_dec64be_aligned(buf +  48); \
+		M[0x7] = sph_dec64be_aligned(buf +  56); \
+		M[0x8] = sph_dec64be_aligned(buf +  64); \
+		M[0x9] = sph_dec64be_aligned(buf +  72); \
+		M[0xA] = sph_dec64be_aligned(buf +  80); \
+		M[0xB] = sph_dec64be_aligned(buf +  88); \
+		M[0xC] = sph_dec64be_aligned(buf +  96); \
+		M[0xD] = sph_dec64be_aligned(buf + 104); \
+		M[0xE] = sph_dec64be_aligned(buf + 112); \
+		M[0xF] = sph_dec64be_aligned(buf + 120); \
+		for (r = 0; r < 16; r ++) /* 16 rounds; sigma[r] supplies the word order */ \
+			ROUND_B(r); \
+		H0 ^= S0 ^ V0 ^ V8; /* fold both halves of V, plus the salt, back into H */ \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#else
+
+#define COMPRESS64   do { /* unrolled variant: message words in scalars M0..MF */ \
+		sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \
+		sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \
+		sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
+		V0 = H0; \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CB0; /* V8..VB: salt words XORed with constants */ \
+		V9 = S1 ^ CB1; \
+		VA = S2 ^ CB2; \
+		VB = S3 ^ CB3; \
+		VC = T0 ^ CB4; /* VC..VF: T0 is used twice, then T1 twice */ \
+		VD = T0 ^ CB5; \
+		VE = T1 ^ CB6; \
+		VF = T1 ^ CB7; \
+		M0 = sph_dec64be_aligned(buf +   0); \
+		M1 = sph_dec64be_aligned(buf +   8); \
+		M2 = sph_dec64be_aligned(buf +  16); \
+		M3 = sph_dec64be_aligned(buf +  24); \
+		M4 = sph_dec64be_aligned(buf +  32); \
+		M5 = sph_dec64be_aligned(buf +  40); \
+		M6 = sph_dec64be_aligned(buf +  48); \
+		M7 = sph_dec64be_aligned(buf +  56); \
+		M8 = sph_dec64be_aligned(buf +  64); \
+		M9 = sph_dec64be_aligned(buf +  72); \
+		MA = sph_dec64be_aligned(buf +  80); \
+		MB = sph_dec64be_aligned(buf +  88); \
+		MC = sph_dec64be_aligned(buf +  96); \
+		MD = sph_dec64be_aligned(buf + 104); \
+		ME = sph_dec64be_aligned(buf + 112); \
+		MF = sph_dec64be_aligned(buf + 120); \
+		ROUND_B(0); \
+		ROUND_B(1); \
+		ROUND_B(2); \
+		ROUND_B(3); \
+		ROUND_B(4); \
+		ROUND_B(5); \
+		ROUND_B(6); \
+		ROUND_B(7); \
+		ROUND_B(8); \
+		ROUND_B(9); \
+		ROUND_B(0); /* rounds 10..15 reuse permutation rows 0..5 */ \
+		ROUND_B(1); \
+		ROUND_B(2); \
+		ROUND_B(3); \
+		ROUND_B(4); \
+		ROUND_B(5); \
+		H0 ^= S0 ^ V0 ^ V8; /* fold both halves of V, plus the salt, back into H */ \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#endif
+
+#endif
+/* All-zero 4-word salt handed to blake32_init() by the public BLAKE-224/256 init functions. */
+static const sph_u32 salt_zero_small[4] = { 0 };
+/* Start a 32-bit BLAKE computation: install the 8-word IV and the 4-word salt, then clear the counter (T0, T1) and the buffer index. */
+static void
+blake32_init(sph_blake_small_context *sc,
+	const sph_u32 *iv, const sph_u32 *salt)
+{
+	memcpy(sc->H, iv, 8 * sizeof *iv);
+	memcpy(sc->S, salt, 4 * sizeof *salt);
+	sc->ptr = 0;
+	sc->T1 = sc->T0 = 0;
+}
+/* Absorb len bytes from data into a 32-bit BLAKE context, running COMPRESS32 each time the context buffer fills up. */
+static void
+blake32(sph_blake_small_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *buf = sc->buf;	/* COMPRESS32 reads the block through this name */
+	size_t fill = sc->ptr;
+	DECL_STATE32
+
+	/* Not enough input to complete a block: stash it and leave. */
+	if (len < (sizeof sc->buf) - fill) {
+		memcpy(buf + fill, src, len);
+		sc->ptr = fill + len;
+		return;
+	}
+
+	/* Work on local copies of the state; they are written back once at the end. */
+	READ_STATE32(sc);
+	while (len > 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (room > len) ? len : room;
+
+		memcpy(buf + fill, src, take);
+		fill += take;
+		src += take;
+		len -= take;
+		if (fill != sizeof sc->buf)
+			continue;
+		/* Full block: advance the bit counter by 512, carrying from T0 into T1. */
+		T0 = SPH_T32(T0 + 512);
+		if (T0 < 512)
+			T1 = SPH_T32(T1 + 1);
+		COMPRESS32;
+		fill = 0;
+	}
+	WRITE_STATE32(sc);
+	sc->ptr = fill;
+}
+/* Finish a 32-bit BLAKE computation: append the top n bits of ub, the padding and the bit count, then write out_size_w32 words of H to dst. */
+static void
+blake32_close(sph_blake_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	union {
+		unsigned char buf[64];
+		sph_u32 dummy;	/* presumably here only to align buf for the *_aligned encoders -- confirm */
+	} u;
+	size_t ptr, k;
+	unsigned bit_len;	/* bits pending in the last block: 8 * buffered bytes + n */
+	unsigned z;
+	sph_u32 th, tl;	/* counter words encoded at the end of the final block */
+	unsigned char *out;
+
+	ptr = sc->ptr;
+	bit_len = ((unsigned)ptr << 3) + n;
+	z = 0x80 >> n;
+	u.buf[ptr] = ((ub & -z) | z) & 0xFF;	/* keep the top n bits of ub, then a single 1 bit */
+	tl = sc->T0 + bit_len;
+	th = sc->T1;
+	if (ptr == 0 && n == 0) {	/* blake32() adds 512 before compressing: pre-bias T so this block sees counter 0 */
+		sc->T0 = SPH_C32(0xFFFFFE00);
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+	} else if (sc->T0 == 0) {	/* borrow from T1; the +512 in blake32() wraps T0 to bit_len and carries T1 back */
+		sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
+		sc->T1 = SPH_T32(sc->T1 - 1);
+	} else {
+		sc->T0 -= 512 - bit_len;	/* so that the +512 in blake32() yields T0 + bit_len */
+	}
+	if (bit_len <= 446) {	/* padding bit, byte 55 and the 8 counter bytes all fit in this block */
+		memset(u.buf + ptr + 1, 0, 55 - ptr);
+		if (out_size_w32 == 8)	/* low bit of byte 55 is set only for the 8-word output */
+			u.buf[55] |= 1;
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf + ptr, 64 - ptr);	/* completes the buffer, so exactly one compression runs */
+	} else {	/* no room left: flush the padded block, then add one more */
+		memset(u.buf + ptr + 1, 0, 63 - ptr);
+		blake32(sc, u.buf + ptr, 64 - ptr);
+		sc->T0 = SPH_C32(0xFFFFFE00);	/* pre-bias again: the extra block is compressed with counter 0 */
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+		memset(u.buf, 0, 56);
+		if (out_size_w32 == 8)
+			u.buf[55] = 1;
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf, 64);
+	}
+	out = dst;
+	for (k = 0; k < out_size_w32; k ++)	/* emit H[0..out_size_w32-1], 4 bytes each, via sph_enc32be */
+		sph_enc32be(out + (k << 2), sc->H[k]);
+}
+
+#if SPH_64
+/* All-zero 4-word salt handed to blake64_init() by the public BLAKE-384/512 init functions. */
+static const sph_u64 salt_zero_big[4] = { 0 };
+/* Start a 64-bit BLAKE computation: install the 8-word IV and the 4-word salt, then clear the counter (T0, T1) and the buffer index. */
+static void
+blake64_init(sph_blake_big_context *sc,
+	const sph_u64 *iv, const sph_u64 *salt)
+{
+	memcpy(sc->H, iv, 8 * sizeof *iv);
+	memcpy(sc->S, salt, 4 * sizeof *salt);
+	sc->ptr = 0;
+	sc->T1 = sc->T0 = 0;
+}
+/* Absorb len bytes from data into a 64-bit BLAKE context, running COMPRESS64 each time the context buffer fills up. */
+static void
+blake64(sph_blake_big_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *buf = sc->buf;	/* COMPRESS64 reads the block through this name */
+	size_t fill = sc->ptr;
+	DECL_STATE64
+
+	/* Not enough input to complete a block: stash it and leave. */
+	if (len < (sizeof sc->buf) - fill) {
+		memcpy(buf + fill, src, len);
+		sc->ptr = fill + len;
+		return;
+	}
+
+	/* Work on local copies of the state; they are written back once at the end. */
+	READ_STATE64(sc);
+	while (len > 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (room > len) ? len : room;
+
+		memcpy(buf + fill, src, take);
+		fill += take;
+		src += take;
+		len -= take;
+		if (fill != sizeof sc->buf)
+			continue;
+		/* Full block: advance the bit counter by 1024, carrying from T0 into T1. */
+		T0 = SPH_T64(T0 + 1024);
+		if (T0 < 1024)
+			T1 = SPH_T64(T1 + 1);
+		COMPRESS64;
+		fill = 0;
+	}
+	WRITE_STATE64(sc);
+	sc->ptr = fill;
+}
+/* Finish a 64-bit BLAKE computation: append the top n bits of ub, the padding and the bit count, then write out_size_w64 words of H to dst. */
+static void
+blake64_close(sph_blake_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
+{
+	union {
+		unsigned char buf[128];
+		sph_u64 dummy;	/* presumably here only to align buf for the *_aligned encoders -- confirm */
+	} u;
+	size_t ptr, k;
+	unsigned bit_len;	/* bits pending in the last block: 8 * buffered bytes + n */
+	unsigned z;
+	sph_u64 th, tl;	/* counter words encoded at the end of the final block */
+	unsigned char *out;
+
+	ptr = sc->ptr;
+	bit_len = ((unsigned)ptr << 3) + n;
+	z = 0x80 >> n;
+	u.buf[ptr] = ((ub & -z) | z) & 0xFF;	/* keep the top n bits of ub, then a single 1 bit */
+	tl = sc->T0 + bit_len;
+	th = sc->T1;
+	if (ptr == 0 && n == 0) {	/* blake64() adds 1024 before compressing: pre-bias T so this block sees counter 0 */
+		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
+		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	} else if (sc->T0 == 0) {	/* borrow from T1; the +1024 in blake64() wraps T0 to bit_len and carries T1 back */
+		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
+		sc->T1 = SPH_T64(sc->T1 - 1);
+	} else {
+		sc->T0 -= 1024 - bit_len;	/* so that the +1024 in blake64() yields T0 + bit_len */
+	}
+	if (bit_len <= 894) {	/* padding bit, byte 111 and the 16 counter bytes all fit in this block */
+		memset(u.buf + ptr + 1, 0, 111 - ptr);
+		if (out_size_w64 == 8)	/* low bit of byte 111 is set only for the 8-word output */
+			u.buf[111] |= 1;
+		sph_enc64be_aligned(u.buf + 112, th);
+		sph_enc64be_aligned(u.buf + 120, tl);
+		blake64(sc, u.buf + ptr, 128 - ptr);	/* completes the buffer, so exactly one compression runs */
+	} else {	/* no room left: flush the padded block, then add one more */
+		memset(u.buf + ptr + 1, 0, 127 - ptr);
+		blake64(sc, u.buf + ptr, 128 - ptr);
+		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);	/* pre-bias again: the extra block is compressed with counter 0 */
+		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+		memset(u.buf, 0, 112);
+		if (out_size_w64 == 8)
+			u.buf[111] = 1;
+		sph_enc64be_aligned(u.buf + 112, th);
+		sph_enc64be_aligned(u.buf + 120, tl);
+		blake64(sc, u.buf, 128);
+	}
+	out = dst;
+	for (k = 0; k < out_size_w64; k ++)	/* emit H[0..out_size_w64-1], 8 bytes each, via sph_enc64be */
+		sph_enc64be(out + (k << 3), sc->H[k]);
+}
+
+#endif
+
+/* see sph_blake.h -- (re)initialises the context at cc with IV224 and the all-zero salt */
+void
+sph_blake224_init(void *cc)
+{
+	blake32_init(cc, IV224, salt_zero_small);
+}
+
+/* see sph_blake.h -- feeds len bytes at data into the context at cc through blake32() */
+void
+sph_blake224(void *cc, const void *data, size_t len)
+{
+	blake32(cc, data, len);
+}
+
+/* see sph_blake.h -- closes with no extra bits: same as sph_blake224_addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake224_close(void *cc, void *dst)
+{
+	sph_blake224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- finishes via blake32_close(), writing 7 32-bit words to dst, then re-initialises cc */
+void
+sph_blake224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake32_close(cc, ub, n, dst, 7);
+	sph_blake224_init(cc);
+}
+
+/* see sph_blake.h -- (re)initialises the context at cc with IV256 and the all-zero salt */
+void
+sph_blake256_init(void *cc)
+{
+	blake32_init(cc, IV256, salt_zero_small);
+}
+
+/* see sph_blake.h -- feeds len bytes at data into the context at cc through blake32() */
+void
+sph_blake256(void *cc, const void *data, size_t len)
+{
+	blake32(cc, data, len);
+}
+
+/* see sph_blake.h -- closes with no extra bits: same as sph_blake256_addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake256_close(void *cc, void *dst)
+{
+	sph_blake256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- finishes via blake32_close(), writing 8 32-bit words to dst, then re-initialises cc */
+void
+sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake32_close(cc, ub, n, dst, 8);
+	sph_blake256_init(cc);
+}
+
+#if SPH_64
+
+/* see sph_blake.h -- (re)initialises the context at cc with IV384 and the all-zero salt */
+void
+sph_blake384_init(void *cc)
+{
+	blake64_init(cc, IV384, salt_zero_big);
+}
+
+/* see sph_blake.h -- feeds len bytes at data into the context at cc through blake64() */
+void
+sph_blake384(void *cc, const void *data, size_t len)
+{
+	blake64(cc, data, len);
+}
+
+/* see sph_blake.h -- closes with no extra bits: same as sph_blake384_addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake384_close(void *cc, void *dst)
+{
+	sph_blake384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- finishes via blake64_close(), writing 6 64-bit words to dst, then re-initialises cc */
+void
+sph_blake384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake64_close(cc, ub, n, dst, 6);
+	sph_blake384_init(cc);
+}
+
+/* see sph_blake.h -- (re)initialises the context at cc with IV512 and the all-zero salt */
+void
+sph_blake512_init(void *cc)
+{
+	blake64_init(cc, IV512, salt_zero_big);
+}
+
+/* see sph_blake.h -- feeds len bytes at data into the context at cc through blake64() */
+void
+sph_blake512(void *cc, const void *data, size_t len)
+{
+	blake64(cc, data, len);
+}
+
+/* see sph_blake.h -- closes with no extra bits: same as sph_blake512_addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake512_close(void *cc, void *dst)
+{
+	sph_blake512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- finishes via blake64_close(), writing 8 64-bit words to dst, then re-initialises cc */
+void
+sph_blake512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake64_close(cc, ub, n, dst, 8);
+	sph_blake512_init(cc);
+}
+
+#endif
diff --git a/sph/bmw.c b/sph/bmw.c
new file mode 100644
index 00000000..718191d0
--- /dev/null
+++ b/sph/bmw.c
@@ -0,0 +1,957 @@
+/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * BMW implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_bmw.h"
+/* A non-zero SPH_SMALL_FOOTPRINT turns on the compact BMW code, unless SPH_SMALL_FOOTPRINT_BMW was already defined by the builder. */
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
+#define SPH_SMALL_FOOTPRINT_BMW   1
+#endif
+/* MSVC only: silence warning 4146 (per Microsoft's documentation: unary minus applied to an unsigned type). */
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+/* IV224: sixteen 32-bit initial words; read as bytes they are the consecutive values 0x00 .. 0x3F. */
+static const sph_u32 IV224[] = {
+	SPH_C32(0x00010203), SPH_C32(0x04050607),
+	SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
+	SPH_C32(0x10111213), SPH_C32(0x14151617),
+	SPH_C32(0x18191A1B), SPH_C32(0x1C1D1E1F),
+	SPH_C32(0x20212223), SPH_C32(0x24252627),
+	SPH_C32(0x28292A2B), SPH_C32(0x2C2D2E2F),
+	SPH_C32(0x30313233), SPH_C32(0x34353637),
+	SPH_C32(0x38393A3B), SPH_C32(0x3C3D3E3F)
+};
+/* IV256: sixteen 32-bit initial words; read as bytes they are the consecutive values 0x40 .. 0x7F. */
+static const sph_u32 IV256[] = {
+	SPH_C32(0x40414243), SPH_C32(0x44454647),
+	SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
+	SPH_C32(0x50515253), SPH_C32(0x54555657),
+	SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
+	SPH_C32(0x60616263), SPH_C32(0x64656667),
+	SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
+	SPH_C32(0x70717273), SPH_C32(0x74757677),
+	SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
+};
+
+#if SPH_64
+/* IV384: sixteen 64-bit initial words; read as bytes they are the consecutive values 0x00 .. 0x7F. */
+static const sph_u64 IV384[] = {
+	SPH_C64(0x0001020304050607), SPH_C64(0x08090A0B0C0D0E0F),
+	SPH_C64(0x1011121314151617), SPH_C64(0x18191A1B1C1D1E1F),
+	SPH_C64(0x2021222324252627), SPH_C64(0x28292A2B2C2D2E2F),
+	SPH_C64(0x3031323334353637), SPH_C64(0x38393A3B3C3D3E3F),
+	SPH_C64(0x4041424344454647), SPH_C64(0x48494A4B4C4D4E4F),
+	SPH_C64(0x5051525354555657), SPH_C64(0x58595A5B5C5D5E5F),
+	SPH_C64(0x6061626364656667), SPH_C64(0x68696A6B6C6D6E6F),
+	SPH_C64(0x7071727374757677), SPH_C64(0x78797A7B7C7D7E7F)
+};
+/* IV512: sixteen 64-bit initial words; read as bytes they are the consecutive values 0x80 .. 0xFF. */
+static const sph_u64 IV512[] = {
+	SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
+	SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
+	SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
+	SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
+	SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
+	SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
+	SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
+	SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
+};
+
+#endif
+/* XCAT(a, b): paste two tokens after both have been macro-expanded; XCAT_ performs the raw paste. */
+#define XCAT(lhs, rhs)    XCAT_(lhs, rhs)
+#define XCAT_(lhs, rhs)   lhs ## rhs
+/* LPAR: a deferred '(' -- it lets the expand*_ macros expand their list arguments before the *_inner macro is invoked. */
+#define LPAR   (
+/* I16_n: the sixteen indices n-16 .. n-1, spliced into parameters i0..i15 of the expand*_inner macros. */
+#define I16_16    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+#define I16_17    1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
+#define I16_18    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17
+#define I16_19    3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+#define I16_20    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
+#define I16_21    5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
+#define I16_22    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+#define I16_23    7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+#define I16_24    8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+#define I16_25    9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+#define I16_26   10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+#define I16_27   11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+#define I16_28   12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+#define I16_29   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
+#define I16_30   14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+#define I16_31   15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+/* M16_n (j = n-16): the add_elt arguments j0m, j1m, j3m, j4m, j7m, j10m, j11m. Items 1, 3, 5, 6 are indices mod 16; items 2, 4, 7 are rotation counts (index + 1). */
+#define M16_16    0,  1,  3,  4,  7, 10, 11
+#define M16_17    1,  2,  4,  5,  8, 11, 12
+#define M16_18    2,  3,  5,  6,  9, 12, 13
+#define M16_19    3,  4,  6,  7, 10, 13, 14
+#define M16_20    4,  5,  7,  8, 11, 14, 15
+#define M16_21    5,  6,  8,  9, 12, 15, 16
+#define M16_22    6,  7,  9, 10, 13,  0,  1
+#define M16_23    7,  8, 10, 11, 14,  1,  2
+#define M16_24    8,  9, 11, 12, 15,  2,  3
+#define M16_25    9, 10, 12, 13,  0,  3,  4
+#define M16_26   10, 11, 13, 14,  1,  4,  5
+#define M16_27   11, 12, 14, 15,  2,  5,  6
+#define M16_28   12, 13, 15, 16,  3,  6,  7
+#define M16_29   13, 14,  0,  1,  4,  7,  8
+#define M16_30   14, 15,  1,  2,  5,  8,  9
+#define M16_31   15, 16,  2,  3,  6,  9, 10
+/* 32-bit helpers: ss0..ss3 XOR a right shift, a left shift and two rotations of x; ss4/ss5 XOR x with x>>1 / x>>2; rs1..rs7 are left rotations. x is evaluated several times. */
+#define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
+                  ^ SPH_ROTL32(x,  4) ^ SPH_ROTL32(x, 19))
+#define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x,  8) ^ SPH_ROTL32(x, 23))
+#define ss2(x)    (((x) >> 2) ^ SPH_T32((x) << 1) \
+                  ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
+#define ss3(x)    (((x) >> 2) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
+#define ss4(x)    (((x) >> 1) ^ (x))
+#define ss5(x)    (((x) >> 2) ^ (x))
+#define rs1(x)    SPH_ROTL32(x,  3)
+#define rs2(x)    SPH_ROTL32(x,  7)
+#define rs3(x)    SPH_ROTL32(x, 13)
+#define rs4(x)    SPH_ROTL32(x, 16)
+#define rs5(x)    SPH_ROTL32(x, 19)
+#define rs6(x)    SPH_ROTL32(x, 23)
+#define rs7(x)    SPH_ROTL32(x, 27)
+/* Ks(j): j multiplied by 0x05555555 and reduced with SPH_T32; used as the additive constant in add_elt_s. */
+#define Ks(j)   SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
+/* add_elt_s: rotl(mf(j0m), j1m) + rotl(mf(j3m), j4m) - rotl(mf(j10m), j11m) + Ks(j16), reduced with SPH_T32, then XORed with hf(j7m). mf/hf are accessor macros. */
+#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
+		- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
+/* expand1s_inner: sum of ss1, ss2, ss3, ss0 (in that repeating order) over qf(i0)..qf(i15), plus the add_elt_s term, reduced with SPH_T32. */
+#define expand1s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+		+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+		+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+		+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+/* expand1s(qf, mf, hf, n): n must be a literal 16..31; it selects I16_n / M16_n, which expand to lists before expand1s_inner is applied via LPAR. */
+#define expand1s(qf, mf, hf, i16) \
+	expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1s_(qf, mf, hf, i16, ix, iy) \
+	expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
+/* expand2s_inner: qf(even slots) added as is, qf(i1..i13 odd slots) through rs1..rs7, then ss4(qf(i14)) + ss5(qf(i15)) and the add_elt_s term; reduced with SPH_T32. */
+#define expand2s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+		+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+		+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+		+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+/* expand2s(qf, mf, hf, n): n must be a literal 16..31; it selects I16_n / M16_n, which expand to lists before expand2s_inner is applied via LPAR. */
+#define expand2s(qf, mf, hf, i16) \
+	expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2s_(qf, mf, hf, i16, ix, iy) \
+	expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#if SPH_64
+/* 64-bit helpers: sb0..sb3 XOR a right shift, a left shift and two rotations of x; sb4/sb5 XOR x with x>>1 / x>>2; rb1..rb7 are left rotations. x is evaluated several times. */
+#define sb0(x)    (((x) >> 1) ^ SPH_T64((x) << 3) \
+                  ^ SPH_ROTL64(x,  4) ^ SPH_ROTL64(x, 37))
+#define sb1(x)    (((x) >> 1) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43))
+#define sb2(x)    (((x) >> 2) ^ SPH_T64((x) << 1) \
+                  ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53))
+#define sb3(x)    (((x) >> 2) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59))
+#define sb4(x)    (((x) >> 1) ^ (x))
+#define sb5(x)    (((x) >> 2) ^ (x))
+#define rb1(x)    SPH_ROTL64(x,  5)
+#define rb2(x)    SPH_ROTL64(x, 11)
+#define rb3(x)    SPH_ROTL64(x, 27)
+#define rb4(x)    SPH_ROTL64(x, 32)
+#define rb5(x)    SPH_ROTL64(x, 37)
+#define rb6(x)    SPH_ROTL64(x, 43)
+#define rb7(x)    SPH_ROTL64(x, 53)
+/* Kb(j): j multiplied by 0x0555555555555555 and reduced with SPH_T64; the 64-bit additive constant of add_elt_b. */
+#define Kb(j)   SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555))
+
+#if SPH_SMALL_FOOTPRINT_BMW
+/* Kb_tab[k] = Kb(16 + k) for k = 0..15; the small-footprint add_elt_b indexes it with j = i - 16. */
+static const sph_u64 Kb_tab[] = {
+	Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23),
+	Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31)
+};
+/* rol_off(mf, j, off): with k = (j + off) mod 16, rotate mf(k) left by k + 1 bits. */
+#define rol_off(mf, j, off) \
+	SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1)
+/* add_elt_b (small footprint, j may be a run-time value 0..15): rol_off terms at offsets 0, 3 (added) and 10 (subtracted) plus Kb_tab[j], XORed with hf((j + 7) mod 16). */
+#define add_elt_b(mf, hf, j) \
+	(SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \
+		- rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15))
+/* expand1b (small footprint): sum of sb1, sb2, sb3, sb0 (repeating) over qf(i-16)..qf(i-1), plus add_elt_b for j = i - 16; i may be a run-time value. */
+#define expand1b(qf, mf, hf, i) \
+	SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \
+		+ sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \
+		+ sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \
+		+ sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \
+		+ sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \
+		+ sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \
+		+ sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \
+		+ sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+/* expand2b (small footprint): qf(i-16), qf(i-14), .. added as is, the slots in between through rb1..rb7, then sb4(qf(i-2)) + sb5(qf(i-1)) and add_elt_b for j = i - 16. */
+#define expand2b(qf, mf, hf, i) \
+	SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \
+		+ qf((i) - 14) + rb2(qf((i) - 13)) \
+		+ qf((i) - 12) + rb3(qf((i) - 11)) \
+		+ qf((i) - 10) + rb4(qf((i) - 9)) \
+		+ qf((i) - 8) + rb5(qf((i) - 7)) \
+		+ qf((i) - 6) + rb6(qf((i) - 5)) \
+		+ qf((i) - 4) + rb7(qf((i) - 3)) \
+		+ sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+
+#else
+/* add_elt_b (unrolled): rotl(mf(j0m), j1m) + rotl(mf(j3m), j4m) - rotl(mf(j10m), j11m) + Kb(j16), reduced with SPH_T64, then XORed with hf(j7m). */
+#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \
+		- SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m))
+/* expand1b_inner (unrolled): sum of sb1, sb2, sb3, sb0 (repeating) over qf(i0)..qf(i15), plus the add_elt_b term, reduced with SPH_T64. */
+#define expand1b_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \
+		+ sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \
+		+ sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \
+		+ sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+/* expand1b(qf, mf, hf, n): n must be a literal 16..31; it selects I16_n / M16_n, which expand to lists before expand1b_inner is applied via LPAR. */
+#define expand1b(qf, mf, hf, i16) \
+	expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1b_(qf, mf, hf, i16, ix, iy) \
+	expand1b_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#define expand2b_inner(qf, mf, hf, i16, /* i0..i15: the sixteen qf indices; i0m..i11m and i16 are forwarded to add_elt_b */ \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \
+		+ qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \
+		+ qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \
+		+ qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand2b(qf, mf, hf, i16) /* pastes i16 onto I16_ and M16_ to select two index lists (defined outside this excerpt), so i16 must be a literal token */ \
+	expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2b_(qf, mf, hf, i16, ix, iy) /* second step: LPAR (presumably "(" -- defined elsewhere, confirm) delays the call so that ix and iy can spread into separate arguments */ \
+	expand2b_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#endif
+
+#endif
+
+#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) /* tt( five (M(i) ^ H(i)) terms joined by the operators op01..op34 ); M and H are resolved where the macro is expanded */ \
+	tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
+	op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
+
+#define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14) /* Ws0..Ws15: the sixteen W terms of the 32-bit variant; each line is one index/sign pattern for MAKE_W */
+#define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
+#define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
+#define Ws3    MAKE_W(SPH_T32,  0, -,  1, +,  8, -, 10, +, 13)
+#define Ws4    MAKE_W(SPH_T32,  1, +,  2, +,  9, -, 11, -, 14)
+#define Ws5    MAKE_W(SPH_T32,  3, -,  2, +, 10, -, 12, +, 15)
+#define Ws6    MAKE_W(SPH_T32,  4, -,  0, -,  3, -, 11, +, 13)
+#define Ws7    MAKE_W(SPH_T32,  1, -,  4, -,  5, -, 12, -, 14)
+#define Ws8    MAKE_W(SPH_T32,  2, -,  5, -,  6, +, 13, -, 15)
+#define Ws9    MAKE_W(SPH_T32,  0, -,  3, +,  6, -,  7, +, 14)
+#define Ws10   MAKE_W(SPH_T32,  8, -,  1, -,  4, -,  7, +, 15)
+#define Ws11   MAKE_W(SPH_T32,  8, -,  0, -,  2, -,  5, +,  9)
+#define Ws12   MAKE_W(SPH_T32,  1, +,  3, -,  6, -,  9, +, 10)
+#define Ws13   MAKE_W(SPH_T32,  2, +,  4, +,  7, +, 10, +, 11)
+#define Ws14   MAKE_W(SPH_T32,  3, -,  5, +,  8, -, 11, -, 12)
+#define Ws15   MAKE_W(SPH_T32, 12, -,  4, -,  6, -,  9, +, 13)
+
+#if SPH_SMALL_FOOTPRINT_BMW
+
+#define MAKE_Qas   do { /* small-footprint form: fills qt[0..15]; needs qt, M and H in scope */ \
+		unsigned u; \
+		sph_u32 Ws[16]; /* the sixteen W terms, computed once into an array */ \
+		Ws[ 0] = Ws0; \
+		Ws[ 1] = Ws1; \
+		Ws[ 2] = Ws2; \
+		Ws[ 3] = Ws3; \
+		Ws[ 4] = Ws4; \
+		Ws[ 5] = Ws5; \
+		Ws[ 6] = Ws6; \
+		Ws[ 7] = Ws7; \
+		Ws[ 8] = Ws8; \
+		Ws[ 9] = Ws9; \
+		Ws[10] = Ws10; \
+		Ws[11] = Ws11; \
+		Ws[12] = Ws12; \
+		Ws[13] = Ws13; \
+		Ws[14] = Ws14; \
+		Ws[15] = Ws15; \
+		for (u = 0; u < 15; u += 5) { /* u = 0, 5, 10: five entries per pass, ss0..ss4 in turn */ \
+			qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
+			qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
+			qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
+			qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
+			qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
+		} \
+		qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); /* sixteenth entry: the H index wraps around to 0 */ \
+	} while (0)
+
+#define MAKE_Qbs   do { /* fills qt[16..31]: two expand1s entries, then fourteen expand2s entries; each one reads entries written before it */ \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+#else
+
+#define MAKE_Qas   do { /* unrolled form: qt[i] = SPH_T32(ss<i mod 5>(Ws<i>) + H((i + 1) mod 16)) for i = 0..15; needs qt, M and H in scope */ \
+		qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
+		qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
+		qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
+		qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
+		qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
+		qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
+		qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
+		qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
+		qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
+		qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
+		qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
+		qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
+		qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
+		qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
+		qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
+		qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
+	} while (0)
+
+#define MAKE_Qbs   do { /* fills qt[16..31]: two expand1s entries, then fourteen expand2s entries; each one reads entries written before it */ \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+#endif
+
+#define MAKE_Qs   do { /* builds the complete qt[0..31] for the 32-bit variant: first half, then second half */ \
+		MAKE_Qas; \
+		MAKE_Qbs; \
+	} while (0)
+
+#define Qs(j)   (qt[j]) /* accessor passed as qf to the expand macros and to FOLD */
+
+#if SPH_64
+
+#define Wb0    MAKE_W(SPH_T64,  5, -,  7, +, 10, +, 13, +, 14) /* Wb0..Wb15: the sixteen W terms of the 64-bit variant; same index/sign patterns as Ws0..Ws15, wrapped in SPH_T64 */
+#define Wb1    MAKE_W(SPH_T64,  6, -,  8, +, 11, +, 14, -, 15)
+#define Wb2    MAKE_W(SPH_T64,  0, +,  7, +,  9, -, 12, +, 15)
+#define Wb3    MAKE_W(SPH_T64,  0, -,  1, +,  8, -, 10, +, 13)
+#define Wb4    MAKE_W(SPH_T64,  1, +,  2, +,  9, -, 11, -, 14)
+#define Wb5    MAKE_W(SPH_T64,  3, -,  2, +, 10, -, 12, +, 15)
+#define Wb6    MAKE_W(SPH_T64,  4, -,  0, -,  3, -, 11, +, 13)
+#define Wb7    MAKE_W(SPH_T64,  1, -,  4, -,  5, -, 12, -, 14)
+#define Wb8    MAKE_W(SPH_T64,  2, -,  5, -,  6, +, 13, -, 15)
+#define Wb9    MAKE_W(SPH_T64,  0, -,  3, +,  6, -,  7, +, 14)
+#define Wb10   MAKE_W(SPH_T64,  8, -,  1, -,  4, -,  7, +, 15)
+#define Wb11   MAKE_W(SPH_T64,  8, -,  0, -,  2, -,  5, +,  9)
+#define Wb12   MAKE_W(SPH_T64,  1, +,  3, -,  6, -,  9, +, 10)
+#define Wb13   MAKE_W(SPH_T64,  2, +,  4, +,  7, +, 10, +, 11)
+#define Wb14   MAKE_W(SPH_T64,  3, -,  5, +,  8, -, 11, -, 12)
+#define Wb15   MAKE_W(SPH_T64, 12, -,  4, -,  6, -,  9, +, 13)
+
+#if SPH_SMALL_FOOTPRINT_BMW
+
+#define MAKE_Qab   do { /* small-footprint form: fills qt[0..15]; needs qt, M and H in scope */ \
+		unsigned u; \
+		sph_u64 Wb[16]; /* the sixteen W terms, computed once into an array */ \
+		Wb[ 0] = Wb0; \
+		Wb[ 1] = Wb1; \
+		Wb[ 2] = Wb2; \
+		Wb[ 3] = Wb3; \
+		Wb[ 4] = Wb4; \
+		Wb[ 5] = Wb5; \
+		Wb[ 6] = Wb6; \
+		Wb[ 7] = Wb7; \
+		Wb[ 8] = Wb8; \
+		Wb[ 9] = Wb9; \
+		Wb[10] = Wb10; \
+		Wb[11] = Wb11; \
+		Wb[12] = Wb12; \
+		Wb[13] = Wb13; \
+		Wb[14] = Wb14; \
+		Wb[15] = Wb15; \
+		for (u = 0; u < 15; u += 5) { /* u = 0, 5, 10: five entries per pass, sb0..sb4 in turn */ \
+			qt[u + 0] = SPH_T64(sb0(Wb[u + 0]) + H(u + 1)); \
+			qt[u + 1] = SPH_T64(sb1(Wb[u + 1]) + H(u + 2)); \
+			qt[u + 2] = SPH_T64(sb2(Wb[u + 2]) + H(u + 3)); \
+			qt[u + 3] = SPH_T64(sb3(Wb[u + 3]) + H(u + 4)); \
+			qt[u + 4] = SPH_T64(sb4(Wb[u + 4]) + H(u + 5)); \
+		} \
+		qt[15] = SPH_T64(sb0(Wb[15]) + H(0)); /* sixteenth entry: the H index wraps around to 0 */ \
+	} while (0)
+
+#define MAKE_Qbb   do { /* small-footprint form: fills qt[16..31] with loops; relies on the expand macros accepting a run-time index */ \
+		unsigned u; \
+		for (u = 16; u < 18; u ++) /* entries 16 and 17 */ \
+			qt[u] = expand1b(Qb, M, H, u); \
+		for (u = 18; u < 32; u ++) /* entries 18 to 31 */ \
+			qt[u] = expand2b(Qb, M, H, u); \
+	} while (0)
+
+#else
+
+#define MAKE_Qab   do { /* unrolled form: qt[i] = SPH_T64(sb<i mod 5>(Wb<i>) + H((i + 1) mod 16)) for i = 0..15; needs qt, M and H in scope */ \
+		qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \
+		qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \
+		qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \
+		qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \
+		qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \
+		qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \
+		qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \
+		qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \
+		qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \
+		qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \
+		qt[10] = SPH_T64(sb0(Wb10) + H(11)); \
+		qt[11] = SPH_T64(sb1(Wb11) + H(12)); \
+		qt[12] = SPH_T64(sb2(Wb12) + H(13)); \
+		qt[13] = SPH_T64(sb3(Wb13) + H(14)); \
+		qt[14] = SPH_T64(sb4(Wb14) + H(15)); \
+		qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \
+	} while (0)
+
+#define MAKE_Qbb   do { /* unrolled form: fills qt[16..31] with two expand1b entries, then fourteen expand2b entries; indices are literals because the expand macros paste them into names */ \
+		qt[16] = expand1b(Qb, M, H, 16); \
+		qt[17] = expand1b(Qb, M, H, 17); \
+		qt[18] = expand2b(Qb, M, H, 18); \
+		qt[19] = expand2b(Qb, M, H, 19); \
+		qt[20] = expand2b(Qb, M, H, 20); \
+		qt[21] = expand2b(Qb, M, H, 21); \
+		qt[22] = expand2b(Qb, M, H, 22); \
+		qt[23] = expand2b(Qb, M, H, 23); \
+		qt[24] = expand2b(Qb, M, H, 24); \
+		qt[25] = expand2b(Qb, M, H, 25); \
+		qt[26] = expand2b(Qb, M, H, 26); \
+		qt[27] = expand2b(Qb, M, H, 27); \
+		qt[28] = expand2b(Qb, M, H, 28); \
+		qt[29] = expand2b(Qb, M, H, 29); \
+		qt[30] = expand2b(Qb, M, H, 30); \
+		qt[31] = expand2b(Qb, M, H, 31); \
+	} while (0)
+
+#endif
+
+#define MAKE_Qb   do { /* builds the complete qt[0..31] for the 64-bit variant: first half, then second half */ \
+		MAKE_Qab; \
+		MAKE_Qbb; \
+	} while (0)
+
+#define Qb(j)   (qt[j]) /* accessor passed as qf to the expand macros and to FOLD */
+
+#endif
+
+#define FOLD(type, mkQ, tt, rol, mf, qf, dhf)   do { /* type: word type; mkQ: statement that fills qt[]; tt: truncation; rol: rotate-left; mf, qf, dhf: message, qt and output accessors */ \
+		type qt[32], xl, xh; \
+		mkQ; \
+		xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) /* xl: XOR of qt[16..23] */ \
+			^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \
+		xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) /* xh: xl XOR qt[24..31], i.e. XOR of qt[16..31] */ \
+			^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \
+		dhf( 0) = tt(((xh <<  5) ^ (qf(16) >>  5) ^ mf( 0)) /* outputs 0..7: shifted xh and qt[16+i] mixed with the message, plus xl ^ qt[24+i] ^ qt[i] */ \
+			+ (xl ^ qf(24) ^ qf( 0))); \
+		dhf( 1) = tt(((xh >>  7) ^ (qf(17) <<  8) ^ mf( 1)) \
+			+ (xl ^ qf(25) ^ qf( 1))); \
+		dhf( 2) = tt(((xh >>  5) ^ (qf(18) <<  5) ^ mf( 2)) \
+			+ (xl ^ qf(26) ^ qf( 2))); \
+		dhf( 3) = tt(((xh >>  1) ^ (qf(19) <<  5) ^ mf( 3)) \
+			+ (xl ^ qf(27) ^ qf( 3))); \
+		dhf( 4) = tt(((xh >>  3) ^ (qf(20) <<  0) ^ mf( 4)) \
+			+ (xl ^ qf(28) ^ qf( 4))); \
+		dhf( 5) = tt(((xh <<  6) ^ (qf(21) >>  6) ^ mf( 5)) \
+			+ (xl ^ qf(29) ^ qf( 5))); \
+		dhf( 6) = tt(((xh >>  4) ^ (qf(22) <<  6) ^ mf( 6)) \
+			+ (xl ^ qf(30) ^ qf( 6))); \
+		dhf( 7) = tt(((xh >> 11) ^ (qf(23) <<  2) ^ mf( 7)) \
+			+ (xl ^ qf(31) ^ qf( 7))); \
+		dhf( 8) = tt(rol(dhf(4),  9) + (xh ^ qf(24) ^ mf( 8)) /* outputs 8..15 read back outputs 4..7 then 0..3, so the order of these statements matters */ \
+			+ ((xl << 8) ^ qf(23) ^ qf( 8))); \
+		dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \
+			+ ((xl >> 6) ^ qf(16) ^ qf( 9))); \
+		dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \
+			+ ((xl << 6) ^ qf(17) ^ qf(10))); \
+		dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \
+			+ ((xl << 4) ^ qf(18) ^ qf(11))); \
+		dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \
+			+ ((xl >> 3) ^ qf(19) ^ qf(12))); \
+		dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \
+			+ ((xl >> 4) ^ qf(20) ^ qf(13))); \
+		dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \
+			+ ((xl >> 7) ^ qf(21) ^ qf(14))); \
+		dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \
+			+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
+	} while (0)
+
+#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH) /* 32-bit instance; expects M, H and dH to be defined at the point of use */
+
+#if SPH_64
+
+#define FOLDb   FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH) /* 64-bit instance, only available when SPH_64 is set */
+
+#endif
+
+/*
+ * Compression function shared by BMW-224 and BMW-256.
+ *
+ *   data  64-byte message block, read with sph_dec32le_aligned()
+ *   h     current chaining value (16 words), only read
+ *   dh    receives the new chaining value (16 words)
+ *
+ * The computation itself is the FOLDs macro, which reaches the message
+ * and both states through the local M(), H() and dH() accessors.
+ */
+static void
+compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
+{
+#if SPH_LITTLE_FAST
+	/* Message words are decoded in place each time they are used. */
+#define M(x)    sph_dec32le_aligned(data + 4 * (x))
+#else
+	/* Decode the sixteen message words once, up front. */
+	sph_u32 mv[16];
+	unsigned idx;
+
+	for (idx = 0; idx < 16; idx ++)
+		mv[idx] = sph_dec32le_aligned(data + 4 * idx);
+#define M(x)    (mv[x])
+#endif
+#define H(x)    (h[x])
+#define dH(x)   (dh[x])
+
+	/* Builds qt[0..31], then folds it with the message into dh[]. */
+	FOLDs;
+
+#undef M
+#undef H
+#undef dH
+}
+
+static const sph_u32 final_s[16] = { /* chaining value used by bmw32_close() for its extra, last compression */
+	SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
+	SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
+	SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
+	SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
+	SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
+	SPH_C32(0xaaaaaaaf)
+};
+
+static void
+bmw32_init(sph_bmw_small_context *sc, const sph_u32 *iv)
+{
+	/* Start from iv with an empty buffer and a zero bit count. */
+#if !SPH_64
+	sc->bit_count_high = sc->bit_count_low = 0;
+#else
+	sc->bit_count = 0;
+#endif
+	sc->ptr = 0;
+	memcpy(sc->H, iv, sizeof sc->H);
+}
+
+/*
+ * Feed len bytes from data into a BMW-224/256 context. The running bit
+ * count is updated first; input is then staged in sc->buf and every
+ * completed block goes through compress_small(). Chaining values
+ * alternate between sc->H and a local array, so the last one is copied
+ * back into sc->H when it ended up in the local array.
+ */
+static void
+bmw32(sph_bmw_small_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *block = sc->buf;
+	size_t fill = sc->ptr;
+	sph_u32 scratch[16];
+	sph_u32 *cur = sc->H, *nxt = scratch;
+#if !SPH_64
+	sph_u32 old_low = sc->bit_count_low;
+
+	/* Two-word counter: add len * 8, carrying out of the low word. */
+	sc->bit_count_low = SPH_T32(old_low + ((sph_u32)len << 3));
+	if (sc->bit_count_low < old_low)
+		sc->bit_count_high ++;
+	sc->bit_count_high += len >> 29;
+#else
+	sc->bit_count += (sph_u64)len << 3;
+#endif
+	while (len != 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (room < len) ? room : len;
+		sph_u32 *swap;
+
+		memcpy(block + fill, src, take);
+		src += take;
+		len -= take;
+		fill += take;
+		if (fill != sizeof sc->buf)
+			continue;
+		/* Block complete: compress it, then swap input/output roles. */
+		compress_small(block, cur, nxt);
+		swap = cur;
+		cur = nxt;
+		nxt = swap;
+		fill = 0;
+	}
+	sc->ptr = fill;
+	if (cur != sc->H)
+		memcpy(sc->H, cur, sizeof sc->H);
+}
+
+/*
+ * Finish a BMW-224/256 computation: append the top n bits of ub, a 1 bit
+ * and zero padding, store the bit count in the last 8 bytes of the final
+ * block, then compress once more with final_s as chaining value and
+ * write the last out_size_w32 words of that output to dst.
+ */
+static void
+bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32)
+{
+	unsigned char *block = sc->buf, *out = dst;
+	size_t fill = sc->ptr, k;
+	unsigned mark = 0x80 >> n;
+	sph_u32 tmp[16], last[16], *chain = sc->H;
+	const size_t len_at = (sizeof sc->buf) - 8;
+
+	block[fill ++] = ((ub & -mark) | mark) & 0xFF;
+	if (fill > len_at) {
+		/* No room left for the length field: flush an extra block. */
+		memset(block + fill, 0, (sizeof sc->buf) - fill);
+		compress_small(block, chain, tmp);
+		chain = tmp;
+		fill = 0;
+	}
+	memset(block + fill, 0, len_at - fill);
+#if SPH_64
+	sph_enc64le_aligned(block + len_at, SPH_T64(sc->bit_count + n));
+#else
+	sph_enc32le_aligned(block + len_at, sc->bit_count_low + n);
+	sph_enc32le_aligned(block + len_at + 4, SPH_T32(sc->bit_count_high));
+#endif
+	compress_small(block, chain, last);
+	for (k = 0; k < 16; k ++)
+		sph_enc32le_aligned(block + 4 * k, last[k]);
+	compress_small(block, final_s, tmp);
+	for (k = 0; k < out_size_w32; k ++)
+		sph_enc32le(out + 4 * k, tmp[16 - out_size_w32 + k]);
+}
+
+#if SPH_64
+
+/*
+ * Compression function shared by BMW-384 and BMW-512.
+ *
+ *   data  128-byte message block, read with sph_dec64le_aligned()
+ *   h     current chaining value (16 words), only read
+ *   dh    receives the new chaining value (16 words)
+ *
+ * The computation itself is the FOLDb macro, which reaches the message
+ * and both states through the local M(), H() and dH() accessors.
+ */
+static void
+compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16])
+{
+#if SPH_LITTLE_FAST
+	/* Message words are decoded in place each time they are used. */
+#define M(x)    sph_dec64le_aligned(data + 8 * (x))
+#else
+	/* Decode the sixteen message words once, up front. */
+	sph_u64 mv[16];
+	unsigned idx;
+
+	for (idx = 0; idx < 16; idx ++)
+		mv[idx] = sph_dec64le_aligned(data + 8 * idx);
+#define M(x)    (mv[x])
+#endif
+#define H(x)    (h[x])
+#define dH(x)   (dh[x])
+
+	/* Builds qt[0..31], then folds it with the message into dh[]. */
+	FOLDb;
+
+#undef M
+#undef H
+#undef dH
+}
+
+static const sph_u64 final_b[16] = { /* chaining value used by bmw64_close() for its extra, last compression */
+	SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1),
+	SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3),
+	SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5),
+	SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7),
+	SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9),
+	SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab),
+	SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad),
+	SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf)
+};
+
+static void
+bmw64_init(sph_bmw_big_context *sc, const sph_u64 *iv)
+{
+	/* Start from iv with an empty buffer and a zero bit count. */
+	sc->bit_count = sc->ptr = 0;
+	memcpy(sc->H, iv, sizeof sc->H);
+}
+
+/*
+ * Feed len bytes from data into a BMW-384/512 context: bump the bit
+ * count, stage input in sc->buf and pass every completed block to
+ * compress_big(). Chaining values alternate between sc->H and a local
+ * array; the last one is copied back to sc->H if it is in the local.
+ */
+static void
+bmw64(sph_bmw_big_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *block = sc->buf;
+	size_t fill = sc->ptr;
+	sph_u64 scratch[16];
+	sph_u64 *cur = sc->H, *nxt = scratch;
+
+	sc->bit_count += (sph_u64)len << 3;
+	while (len != 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (room < len) ? room : len;
+		sph_u64 *swap;
+
+		memcpy(block + fill, src, take);
+		src += take;
+		len -= take;
+		fill += take;
+		if (fill != sizeof sc->buf)
+			continue;
+		compress_big(block, cur, nxt);
+		swap = cur;
+		cur = nxt;
+		nxt = swap;
+		fill = 0;
+	}
+	sc->ptr = fill;
+	if (cur != sc->H)
+		memcpy(sc->H, cur, sizeof sc->H);
+}
+
+/*
+ * Finish a BMW-384/512 hash: append the top n bits of ub, a 1 bit and
+ * zero padding, put the bit count in the last 8 bytes, then compress
+ * again under final_b and write the last out_size_w64 words to dst.
+ */
+static void
+bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w64)
+{
+	unsigned char *block = sc->buf, *out = dst;
+	size_t fill = sc->ptr, k;
+	unsigned mark = 0x80 >> n;
+	sph_u64 tmp[16], last[16], *chain = sc->H;
+	const size_t len_at = (sizeof sc->buf) - 8;
+
+	block[fill ++] = ((ub & -mark) | mark) & 0xFF;
+	if (fill > len_at) {
+		memset(block + fill, 0, (sizeof sc->buf) - fill);
+		compress_big(block, chain, tmp);
+		chain = tmp;
+		fill = 0;
+	}
+	memset(block + fill, 0, len_at - fill);
+	sph_enc64le_aligned(block + len_at, SPH_T64(sc->bit_count + n));
+	compress_big(block, chain, last);
+	for (k = 0; k < 16; k ++)
+		sph_enc64le_aligned(block + 8 * k, last[k]);
+	compress_big(block, final_b, tmp);
+	for (k = 0; k < out_size_w64; k ++)
+		sph_enc64le(out + 8 * k, tmp[16 - out_size_w64 + k]);
+}
+
+#endif
+
+/* see sph_bmw.h -- (re)initialises the context at cc for BMW-224, starting from IV224 */
+void
+sph_bmw224_init(void *cc)
+{
+	bmw32_init(cc, IV224);
+}
+
+/* see sph_bmw.h -- absorbs len bytes of data into cc; thin wrapper over bmw32() */
+void
+sph_bmw224(void *cc, const void *data, size_t len)
+{
+	bmw32(cc, data, len);
+}
+
+/* see sph_bmw.h -- closes with no extra bits (ub = 0, n = 0) */
+void
+sph_bmw224_close(void *cc, void *dst)
+{
+	sph_bmw224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_bmw.h -- writes 7 output words (28 bytes) to dst, then re-initialises cc for BMW-224 */
+void
+sph_bmw224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw32_close(cc, ub, n, dst, 7);
+	sph_bmw224_init(cc);
+}
+
+/* see sph_bmw.h -- (re)initialises the context at cc for BMW-256, starting from IV256 */
+void
+sph_bmw256_init(void *cc)
+{
+	bmw32_init(cc, IV256);
+}
+
+/* see sph_bmw.h -- absorbs len bytes of data into cc; thin wrapper over bmw32() */
+void
+sph_bmw256(void *cc, const void *data, size_t len)
+{
+	bmw32(cc, data, len);
+}
+
+/* see sph_bmw.h -- closes with no extra bits (ub = 0, n = 0) */
+void
+sph_bmw256_close(void *cc, void *dst)
+{
+	sph_bmw256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_bmw.h -- writes 8 output words (32 bytes) to dst, then re-initialises cc for BMW-256 */
+void
+sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw32_close(cc, ub, n, dst, 8);
+	sph_bmw256_init(cc);
+}
+
+#if SPH_64
+
+/* see sph_bmw.h -- (re)initialises the context at cc for BMW-384, starting from IV384 */
+void
+sph_bmw384_init(void *cc)
+{
+	bmw64_init(cc, IV384);
+}
+
+/* see sph_bmw.h -- absorbs len bytes of data into cc; thin wrapper over bmw64() */
+void
+sph_bmw384(void *cc, const void *data, size_t len)
+{
+	bmw64(cc, data, len);
+}
+
+/* see sph_bmw.h -- closes with no extra bits (ub = 0, n = 0) */
+void
+sph_bmw384_close(void *cc, void *dst)
+{
+	sph_bmw384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_bmw.h -- writes 6 output words (48 bytes) to dst, then re-initialises cc for BMW-384 */
+void
+sph_bmw384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw64_close(cc, ub, n, dst, 6);
+	sph_bmw384_init(cc);
+}
+
+/* see sph_bmw.h -- (re)initialises the context at cc for BMW-512, starting from IV512 */
+void
+sph_bmw512_init(void *cc)
+{
+	bmw64_init(cc, IV512);
+}
+
+/* see sph_bmw.h -- absorbs len bytes of data into cc; thin wrapper over bmw64() */
+void
+sph_bmw512(void *cc, const void *data, size_t len)
+{
+	bmw64(cc, data, len);
+}
+
+/* see sph_bmw.h -- closes with no extra bits (ub = 0, n = 0) */
+void
+sph_bmw512_close(void *cc, void *dst)
+{
+	sph_bmw512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_bmw.h -- writes 8 output words (64 bytes) to dst, then re-initialises cc for BMW-512 */
+void
+sph_bmw512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw64_close(cc, ub, n, dst, 8);
+	sph_bmw512_init(cc);
+}
+
+#endif
diff --git a/sph/cubehash.c b/sph/cubehash.c
new file mode 100644
index 00000000..f993c05b
--- /dev/null
+++ b/sph/cubehash.c
@@ -0,0 +1,717 @@
+/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * CubeHash implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_cubehash.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_CUBEHASH /* follow the global SPH_SMALL_FOOTPRINT switch unless the CubeHash-specific one is already set */
+#define SPH_SMALL_FOOTPRINT_CUBEHASH   1
+#endif
+
+/*
+ * Tests were run on an Intel Core2 Q6600 (32-bit and 64-bit mode), a
+ * PowerPC G3 and a MIPS-compatible CPU (Broadcom BCM3302). Reported best:
+ * full unroll, no state copy on "big" systems (x86, PowerPC); unroll to
+ * 4 or 8, state copy on the "small" one (MIPS). NOTE(review): the defaults
+ * below copy state when big (NOCOPY 0), not when small (NOCOPY 1); confirm.
+ */
+
+#if SPH_SMALL_FOOTPRINT_CUBEHASH /* defaults only: either macro can be preset by the build */
+
+#if !defined SPH_CUBEHASH_UNROLL
+#define SPH_CUBEHASH_UNROLL   4 /* loop over groups of four rounds */
+#endif
+#if !defined SPH_CUBEHASH_NOCOPY
+#define SPH_CUBEHASH_NOCOPY   1 /* x0..xv alias the context state directly */
+#endif
+
+#else
+
+#if !defined SPH_CUBEHASH_UNROLL
+#define SPH_CUBEHASH_UNROLL   0 /* not 2, 4 or 8: selects the fully unrolled SIXTEEN_ROUNDS */
+#endif
+#if !defined SPH_CUBEHASH_NOCOPY
+#define SPH_CUBEHASH_NOCOPY   0 /* x0..xv are locals loaded and stored by READ_STATE / WRITE_STATE */
+#endif
+
+#endif
+
+#ifdef _MSC_VER /* MSVC only */
+#pragma warning (disable: 4146) /* C4146 is MSVC's "unary minus applied to unsigned type" warning -- NOTE(review): the triggering expression is outside this excerpt */
+#endif
+
+static const sph_u32 IV224[] = { /* 32 initial state words, in the layout cubehash_init() copies into sc->state; presumably for the 224-bit output (callers not shown) */
+	SPH_C32(0xB0FC8217), SPH_C32(0x1BEE1A90), SPH_C32(0x829E1A22),
+	SPH_C32(0x6362C342), SPH_C32(0x24D91C30), SPH_C32(0x03A7AA24),
+	SPH_C32(0xA63721C8), SPH_C32(0x85B0E2EF), SPH_C32(0xF35D13F3),
+	SPH_C32(0x41DA807D), SPH_C32(0x21A70CA6), SPH_C32(0x1F4E9774),
+	SPH_C32(0xB3E1C932), SPH_C32(0xEB0A79A8), SPH_C32(0xCDDAAA66),
+	SPH_C32(0xE2F6ECAA), SPH_C32(0x0A713362), SPH_C32(0xAA3080E0),
+	SPH_C32(0xD8F23A32), SPH_C32(0xCEF15E28), SPH_C32(0xDB086314),
+	SPH_C32(0x7F709DF7), SPH_C32(0xACD228A4), SPH_C32(0x704D6ECE),
+	SPH_C32(0xAA3EC95F), SPH_C32(0xE387C214), SPH_C32(0x3A6445FF),
+	SPH_C32(0x9CAB81C3), SPH_C32(0xC73D4B98), SPH_C32(0xD277AEBE),
+	SPH_C32(0xFD20151C), SPH_C32(0x00CB573E)
+};
+
+static const sph_u32 IV256[] = { /* 32 initial state words, in the layout cubehash_init() copies into sc->state; presumably for the 256-bit output (callers not shown) */
+	SPH_C32(0xEA2BD4B4), SPH_C32(0xCCD6F29F), SPH_C32(0x63117E71),
+	SPH_C32(0x35481EAE), SPH_C32(0x22512D5B), SPH_C32(0xE5D94E63),
+	SPH_C32(0x7E624131), SPH_C32(0xF4CC12BE), SPH_C32(0xC2D0B696),
+	SPH_C32(0x42AF2070), SPH_C32(0xD0720C35), SPH_C32(0x3361DA8C),
+	SPH_C32(0x28CCECA4), SPH_C32(0x8EF8AD83), SPH_C32(0x4680AC00),
+	SPH_C32(0x40E5FBAB), SPH_C32(0xD89041C3), SPH_C32(0x6107FBD5),
+	SPH_C32(0x6C859D41), SPH_C32(0xF0B26679), SPH_C32(0x09392549),
+	SPH_C32(0x5FA25603), SPH_C32(0x65C892FD), SPH_C32(0x93CB6285),
+	SPH_C32(0x2AF2B5AE), SPH_C32(0x9E4B4E60), SPH_C32(0x774ABFDD),
+	SPH_C32(0x85254725), SPH_C32(0x15815AEB), SPH_C32(0x4AB6AAD6),
+	SPH_C32(0x9CDAF8AF), SPH_C32(0xD6032C0A)
+};
+
+static const sph_u32 IV384[] = { /* 32 initial state words, in the layout cubehash_init() copies into sc->state; presumably for the 384-bit output (callers not shown) */
+	SPH_C32(0xE623087E), SPH_C32(0x04C00C87), SPH_C32(0x5EF46453),
+	SPH_C32(0x69524B13), SPH_C32(0x1A05C7A9), SPH_C32(0x3528DF88),
+	SPH_C32(0x6BDD01B5), SPH_C32(0x5057B792), SPH_C32(0x6AA7A922),
+	SPH_C32(0x649C7EEE), SPH_C32(0xF426309F), SPH_C32(0xCB629052),
+	SPH_C32(0xFC8E20ED), SPH_C32(0xB3482BAB), SPH_C32(0xF89E5E7E),
+	SPH_C32(0xD83D4DE4), SPH_C32(0x44BFC10D), SPH_C32(0x5FC1E63D),
+	SPH_C32(0x2104E6CB), SPH_C32(0x17958F7F), SPH_C32(0xDBEAEF70),
+	SPH_C32(0xB4B97E1E), SPH_C32(0x32C195F6), SPH_C32(0x6184A8E4),
+	SPH_C32(0x796C2543), SPH_C32(0x23DE176D), SPH_C32(0xD33BBAEC),
+	SPH_C32(0x0C12E5D2), SPH_C32(0x4EB95A7B), SPH_C32(0x2D18BA01),
+	SPH_C32(0x04EE475F), SPH_C32(0x1FC5F22E)
+};
+
+static const sph_u32 IV512[] = { /* 32 initial state words, in the layout cubehash_init() copies into sc->state; presumably for the 512-bit output (callers not shown) */
+	SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B),
+	SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C),
+	SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787),
+	SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537),
+	SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33),
+	SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485),
+	SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159),
+	SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9),
+	SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456),
+	SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1),
+	SPH_C32(0x7795D246), SPH_C32(0xD43E3B44)
+};
+
+#define T32      SPH_T32 /* short alias used by the round macros */
+#define ROTL32   SPH_ROTL32 /* short alias used by the round macros */
+
+#if SPH_CUBEHASH_NOCOPY
+
+#define DECL_STATE /* no-copy mode: nothing to declare */
+#define READ_STATE(cc) /* no-copy mode: nothing to load */
+#define WRITE_STATE(cc) /* no-copy mode: nothing to store */
+
+#define x0   ((sc)->state[ 0]) /* x0..xv name the 32 state words in place; they expect a variable called sc at the point of use, whatever is passed as cc above */
+#define x1   ((sc)->state[ 1])
+#define x2   ((sc)->state[ 2])
+#define x3   ((sc)->state[ 3])
+#define x4   ((sc)->state[ 4])
+#define x5   ((sc)->state[ 5])
+#define x6   ((sc)->state[ 6])
+#define x7   ((sc)->state[ 7])
+#define x8   ((sc)->state[ 8])
+#define x9   ((sc)->state[ 9])
+#define xa   ((sc)->state[10])
+#define xb   ((sc)->state[11])
+#define xc   ((sc)->state[12])
+#define xd   ((sc)->state[13])
+#define xe   ((sc)->state[14])
+#define xf   ((sc)->state[15])
+#define xg   ((sc)->state[16])
+#define xh   ((sc)->state[17])
+#define xi   ((sc)->state[18])
+#define xj   ((sc)->state[19])
+#define xk   ((sc)->state[20])
+#define xl   ((sc)->state[21])
+#define xm   ((sc)->state[22])
+#define xn   ((sc)->state[23])
+#define xo   ((sc)->state[24])
+#define xp   ((sc)->state[25])
+#define xq   ((sc)->state[26])
+#define xr   ((sc)->state[27])
+#define xs   ((sc)->state[28])
+#define xt   ((sc)->state[29])
+#define xu   ((sc)->state[30])
+#define xv   ((sc)->state[31])
+
+#else
+
+#define DECL_STATE /* copy mode: 32 local words x0..xv; carries its own trailing semicolon, so it is used without one */ \
+	sph_u32 x0, x1, x2, x3, x4, x5, x6, x7; \
+	sph_u32 x8, x9, xa, xb, xc, xd, xe, xf; \
+	sph_u32 xg, xh, xi, xj, xk, xl, xm, xn; \
+	sph_u32 xo, xp, xq, xr, xs, xt, xu, xv;
+
+#define READ_STATE(cc)   do { /* copy mode: load x0..xv from (cc)->state[0..31] */ \
+		x0 = (cc)->state[ 0]; \
+		x1 = (cc)->state[ 1]; \
+		x2 = (cc)->state[ 2]; \
+		x3 = (cc)->state[ 3]; \
+		x4 = (cc)->state[ 4]; \
+		x5 = (cc)->state[ 5]; \
+		x6 = (cc)->state[ 6]; \
+		x7 = (cc)->state[ 7]; \
+		x8 = (cc)->state[ 8]; \
+		x9 = (cc)->state[ 9]; \
+		xa = (cc)->state[10]; \
+		xb = (cc)->state[11]; \
+		xc = (cc)->state[12]; \
+		xd = (cc)->state[13]; \
+		xe = (cc)->state[14]; \
+		xf = (cc)->state[15]; \
+		xg = (cc)->state[16]; \
+		xh = (cc)->state[17]; \
+		xi = (cc)->state[18]; \
+		xj = (cc)->state[19]; \
+		xk = (cc)->state[20]; \
+		xl = (cc)->state[21]; \
+		xm = (cc)->state[22]; \
+		xn = (cc)->state[23]; \
+		xo = (cc)->state[24]; \
+		xp = (cc)->state[25]; \
+		xq = (cc)->state[26]; \
+		xr = (cc)->state[27]; \
+		xs = (cc)->state[28]; \
+		xt = (cc)->state[29]; \
+		xu = (cc)->state[30]; \
+		xv = (cc)->state[31]; \
+	} while (0)
+
+#define WRITE_STATE(cc)   do { /* copy mode: store x0..xv back into (cc)->state[0..31] */ \
+		(cc)->state[ 0] = x0; \
+		(cc)->state[ 1] = x1; \
+		(cc)->state[ 2] = x2; \
+		(cc)->state[ 3] = x3; \
+		(cc)->state[ 4] = x4; \
+		(cc)->state[ 5] = x5; \
+		(cc)->state[ 6] = x6; \
+		(cc)->state[ 7] = x7; \
+		(cc)->state[ 8] = x8; \
+		(cc)->state[ 9] = x9; \
+		(cc)->state[10] = xa; \
+		(cc)->state[11] = xb; \
+		(cc)->state[12] = xc; \
+		(cc)->state[13] = xd; \
+		(cc)->state[14] = xe; \
+		(cc)->state[15] = xf; \
+		(cc)->state[16] = xg; \
+		(cc)->state[17] = xh; \
+		(cc)->state[18] = xi; \
+		(cc)->state[19] = xj; \
+		(cc)->state[20] = xk; \
+		(cc)->state[21] = xl; \
+		(cc)->state[22] = xm; \
+		(cc)->state[23] = xn; \
+		(cc)->state[24] = xo; \
+		(cc)->state[25] = xp; \
+		(cc)->state[26] = xq; \
+		(cc)->state[27] = xr; \
+		(cc)->state[28] = xs; \
+		(cc)->state[29] = xt; \
+		(cc)->state[30] = xu; \
+		(cc)->state[31] = xv; \
+	} while (0)
+
+#endif
+
+#define INPUT_BLOCK   do { /* XORs the 32 bytes at buf, read as eight words, into x0..x7; expects buf in scope */ \
+		x0 ^= sph_dec32le_aligned(buf +  0); \
+		x1 ^= sph_dec32le_aligned(buf +  4); \
+		x2 ^= sph_dec32le_aligned(buf +  8); \
+		x3 ^= sph_dec32le_aligned(buf + 12); \
+		x4 ^= sph_dec32le_aligned(buf + 16); \
+		x5 ^= sph_dec32le_aligned(buf + 20); \
+		x6 ^= sph_dec32le_aligned(buf + 24); \
+		x7 ^= sph_dec32le_aligned(buf + 28); \
+	} while (0)
+
+#define ROUND_EVEN   do { /* one round on x0..xv; always paired with ROUND_ODD by SIXTEEN_ROUNDS */ \
+		xg = T32(x0 + xg); /* phase 1: add x0..xf into xg..xv, rotating each of x0..xf left by 7 */ \
+		x0 = ROTL32(x0, 7); \
+		xh = T32(x1 + xh); \
+		x1 = ROTL32(x1, 7); \
+		xi = T32(x2 + xi); \
+		x2 = ROTL32(x2, 7); \
+		xj = T32(x3 + xj); \
+		x3 = ROTL32(x3, 7); \
+		xk = T32(x4 + xk); \
+		x4 = ROTL32(x4, 7); \
+		xl = T32(x5 + xl); \
+		x5 = ROTL32(x5, 7); \
+		xm = T32(x6 + xm); \
+		x6 = ROTL32(x6, 7); \
+		xn = T32(x7 + xn); \
+		x7 = ROTL32(x7, 7); \
+		xo = T32(x8 + xo); \
+		x8 = ROTL32(x8, 7); \
+		xp = T32(x9 + xp); \
+		x9 = ROTL32(x9, 7); \
+		xq = T32(xa + xq); \
+		xa = ROTL32(xa, 7); \
+		xr = T32(xb + xr); \
+		xb = ROTL32(xb, 7); \
+		xs = T32(xc + xs); \
+		xc = ROTL32(xc, 7); \
+		xt = T32(xd + xt); \
+		xd = ROTL32(xd, 7); \
+		xu = T32(xe + xu); \
+		xe = ROTL32(xe, 7); \
+		xv = T32(xf + xv); \
+		xf = ROTL32(xf, 7); \
+		x8 ^= xg; /* phase 2: XOR with the halves crossed -- x8..xf take xg..xn, x0..x7 take xo..xv */ \
+		x9 ^= xh; \
+		xa ^= xi; \
+		xb ^= xj; \
+		xc ^= xk; \
+		xd ^= xl; \
+		xe ^= xm; \
+		xf ^= xn; \
+		x0 ^= xo; \
+		x1 ^= xp; \
+		x2 ^= xq; \
+		x3 ^= xr; \
+		x4 ^= xs; \
+		x5 ^= xt; \
+		x6 ^= xu; \
+		x7 ^= xv; \
+		xi = T32(x8 + xi); /* phase 3: second add, targets taken in swapped pairs, rotating by 11 */ \
+		x8 = ROTL32(x8, 11); \
+		xj = T32(x9 + xj); \
+		x9 = ROTL32(x9, 11); \
+		xg = T32(xa + xg); \
+		xa = ROTL32(xa, 11); \
+		xh = T32(xb + xh); \
+		xb = ROTL32(xb, 11); \
+		xm = T32(xc + xm); \
+		xc = ROTL32(xc, 11); \
+		xn = T32(xd + xn); \
+		xd = ROTL32(xd, 11); \
+		xk = T32(xe + xk); \
+		xe = ROTL32(xe, 11); \
+		xl = T32(xf + xl); \
+		xf = ROTL32(xf, 11); \
+		xq = T32(x0 + xq); \
+		x0 = ROTL32(x0, 11); \
+		xr = T32(x1 + xr); \
+		x1 = ROTL32(x1, 11); \
+		xo = T32(x2 + xo); \
+		x2 = ROTL32(x2, 11); \
+		xp = T32(x3 + xp); \
+		x3 = ROTL32(x3, 11); \
+		xu = T32(x4 + xu); \
+		x4 = ROTL32(x4, 11); \
+		xv = T32(x5 + xv); \
+		x5 = ROTL32(x5, 11); \
+		xs = T32(x6 + xs); \
+		x6 = ROTL32(x6, 11); \
+		xt = T32(x7 + xt); \
+		x7 = ROTL32(x7, 11); \
+		xc ^= xi; /* phase 4: final XOR, again with a crossed pairing */ \
+		xd ^= xj; \
+		xe ^= xg; \
+		xf ^= xh; \
+		x8 ^= xm; \
+		x9 ^= xn; \
+		xa ^= xk; \
+		xb ^= xl; \
+		x4 ^= xq; \
+		x5 ^= xr; \
+		x6 ^= xo; \
+		x7 ^= xp; \
+		x0 ^= xu; \
+		x1 ^= xv; \
+		x2 ^= xs; \
+		x3 ^= xt; \
+	} while (0)
+
+#define ROUND_ODD   do { /* companion round: same four phases as ROUND_EVEN but with a different variable pairing; SIXTEEN_ROUNDS always runs it right after ROUND_EVEN */ \
+		xj = T32(xc + xj); /* phase 1: add into xg..xv, rotating each of x0..xf left by 7 */ \
+		xc = ROTL32(xc, 7); \
+		xi = T32(xd + xi); \
+		xd = ROTL32(xd, 7); \
+		xh = T32(xe + xh); \
+		xe = ROTL32(xe, 7); \
+		xg = T32(xf + xg); \
+		xf = ROTL32(xf, 7); \
+		xn = T32(x8 + xn); \
+		x8 = ROTL32(x8, 7); \
+		xm = T32(x9 + xm); \
+		x9 = ROTL32(x9, 7); \
+		xl = T32(xa + xl); \
+		xa = ROTL32(xa, 7); \
+		xk = T32(xb + xk); \
+		xb = ROTL32(xb, 7); \
+		xr = T32(x4 + xr); \
+		x4 = ROTL32(x4, 7); \
+		xq = T32(x5 + xq); \
+		x5 = ROTL32(x5, 7); \
+		xp = T32(x6 + xp); \
+		x6 = ROTL32(x6, 7); \
+		xo = T32(x7 + xo); \
+		x7 = ROTL32(x7, 7); \
+		xv = T32(x0 + xv); \
+		x0 = ROTL32(x0, 7); \
+		xu = T32(x1 + xu); \
+		x1 = ROTL32(x1, 7); \
+		xt = T32(x2 + xt); \
+		x2 = ROTL32(x2, 7); \
+		xs = T32(x3 + xs); \
+		x3 = ROTL32(x3, 7); \
+		x4 ^= xj; /* phase 2: XOR xg..xv into x0..xf */ \
+		x5 ^= xi; \
+		x6 ^= xh; \
+		x7 ^= xg; \
+		x0 ^= xn; \
+		x1 ^= xm; \
+		x2 ^= xl; \
+		x3 ^= xk; \
+		xc ^= xr; \
+		xd ^= xq; \
+		xe ^= xp; \
+		xf ^= xo; \
+		x8 ^= xv; \
+		x9 ^= xu; \
+		xa ^= xt; \
+		xb ^= xs; \
+		xh = T32(x4 + xh); /* phase 3: second add, rotating by 11 */ \
+		x4 = ROTL32(x4, 11); \
+		xg = T32(x5 + xg); \
+		x5 = ROTL32(x5, 11); \
+		xj = T32(x6 + xj); \
+		x6 = ROTL32(x6, 11); \
+		xi = T32(x7 + xi); \
+		x7 = ROTL32(x7, 11); \
+		xl = T32(x0 + xl); \
+		x0 = ROTL32(x0, 11); \
+		xk = T32(x1 + xk); \
+		x1 = ROTL32(x1, 11); \
+		xn = T32(x2 + xn); \
+		x2 = ROTL32(x2, 11); \
+		xm = T32(x3 + xm); \
+		x3 = ROTL32(x3, 11); \
+		xp = T32(xc + xp); \
+		xc = ROTL32(xc, 11); \
+		xo = T32(xd + xo); \
+		xd = ROTL32(xd, 11); \
+		xr = T32(xe + xr); \
+		xe = ROTL32(xe, 11); \
+		xq = T32(xf + xq); \
+		xf = ROTL32(xf, 11); \
+		xt = T32(x8 + xt); \
+		x8 = ROTL32(x8, 11); \
+		xs = T32(x9 + xs); \
+		x9 = ROTL32(x9, 11); \
+		xv = T32(xa + xv); \
+		xa = ROTL32(xa, 11); \
+		xu = T32(xb + xu); \
+		xb = ROTL32(xb, 11); \
+		x0 ^= xh; /* phase 4: final XOR */ \
+		x1 ^= xg; \
+		x2 ^= xj; \
+		x3 ^= xi; \
+		x4 ^= xl; \
+		x5 ^= xk; \
+		x6 ^= xn; \
+		x7 ^= xm; \
+		x8 ^= xp; \
+		x9 ^= xo; \
+		xa ^= xr; \
+		xb ^= xq; \
+		xc ^= xt; \
+		xd ^= xs; \
+		xe ^= xv; \
+		xf ^= xu; \
+	} while (0)
+
+/*
+ * There is no need to unroll all 16 rounds. The word-swapping permutation
+ * is an involution, so we need to unroll an even number of rounds. On
+ * "big" systems, unrolling 4 rounds yields about 97% of the speed
+ * achieved with full unrolling; and it keeps the code more compact
+ * for small architectures.
+ */
+
+#if SPH_CUBEHASH_UNROLL == 2 /* 8 iterations of one EVEN/ODD pair */
+
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 8; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#elif SPH_CUBEHASH_UNROLL == 4 /* 4 iterations of two pairs */
+
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 4; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#elif SPH_CUBEHASH_UNROLL == 8 /* 2 iterations of four pairs */
+
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 2; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#else /* any other setting: all eight pairs written out, no loop variable */
+
+#define SIXTEEN_ROUNDS   do { \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+	} while (0)
+
+#endif
+
+static void	/* start a fresh computation: empty buffer, state := iv */
+cubehash_init(sph_cubehash_context *sc, const sph_u32 *iv)
+{
+	sc->ptr = 0;
+	memcpy(sc->state, iv, sizeof sc->state);
+}
+
+static void
+cubehash_core(sph_cubehash_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE
+	/* Absorb len bytes: buffer them, processing each block that fills up. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) { /* block stays incomplete: store only */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+	/* NOTE(review): READ_STATE/WRITE_STATE presumably load/store x0..xv. */
+	READ_STATE(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr; /* room left in the buffer */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) { /* full block: mix it in, 16 rounds */
+			INPUT_BLOCK;
+			SIXTEEN_ROUNDS;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE(sc);
+	sc->ptr = ptr;
+}
+
+static void
+cubehash_close(sph_cubehash_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32)
+{
+	unsigned char *buf, *out;
+	size_t ptr;
+	unsigned z;
+	int i;
+	DECL_STATE
+	/* Pad the last block, run the closing rounds, emit out_size_w32 words. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	z = 0x80 >> n; /* position of the padding bit, just below the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF; /* keep top n bits of ub, set bit z */
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	READ_STATE(sc);
+	INPUT_BLOCK;
+	for (i = 0; i < 11; i ++) { /* 16 rounds for the block, then 10 x 16 more */
+		SIXTEEN_ROUNDS;
+		if (i == 0)
+			xv ^= SPH_C32(1); /* flip one bit of xv before the last 160 rounds */
+	}
+	WRITE_STATE(sc);
+	out = dst;
+	for (z = 0; z < out_size_w32; z ++) /* 4 output bytes per state word */
+		sph_enc32le(out + (z << 2), sc->state[z]);
+}
+
+/* see sph_cubehash.h -- begin a new computation from IV224 */
+void
+sph_cubehash224_init(void *cc)
+{
+	cubehash_init(cc, IV224);
+}
+
+/* see sph_cubehash.h -- feed len bytes of data; all work is in cubehash_core */
+void
+sph_cubehash224(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h -- finish with no extra bits (ub = 0, n = 0) */
+void
+sph_cubehash224_close(void *cc, void *dst)
+{
+	sph_cubehash224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h -- write 7 words (28 bytes) to dst, then reset cc */
+void
+sph_cubehash224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 7);
+	sph_cubehash224_init(cc);
+}
+
+/* see sph_cubehash.h -- begin a new computation from IV256 */
+void
+sph_cubehash256_init(void *cc)
+{
+	cubehash_init(cc, IV256);
+}
+
+/* see sph_cubehash.h -- feed len bytes of data; all work is in cubehash_core */
+void
+sph_cubehash256(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h -- finish with no extra bits (ub = 0, n = 0) */
+void
+sph_cubehash256_close(void *cc, void *dst)
+{
+	sph_cubehash256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h -- write 8 words (32 bytes) to dst, then reset cc */
+void
+sph_cubehash256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 8);
+	sph_cubehash256_init(cc);
+}
+
+/* see sph_cubehash.h -- begin a new computation from IV384 */
+void
+sph_cubehash384_init(void *cc)
+{
+	cubehash_init(cc, IV384);
+}
+
+/* see sph_cubehash.h -- feed len bytes of data; all work is in cubehash_core */
+void
+sph_cubehash384(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h -- finish with no extra bits (ub = 0, n = 0) */
+void
+sph_cubehash384_close(void *cc, void *dst)
+{
+	sph_cubehash384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h -- write 12 words (48 bytes) to dst, then reset cc */
+void
+sph_cubehash384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 12);
+	sph_cubehash384_init(cc);
+}
+
+/* see sph_cubehash.h -- begin a new computation from IV512 */
+void
+sph_cubehash512_init(void *cc)
+{
+	cubehash_init(cc, IV512);
+}
+
+/* see sph_cubehash.h -- feed len bytes of data; all work is in cubehash_core */
+void
+sph_cubehash512(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h -- finish with no extra bits (ub = 0, n = 0) */
+void
+sph_cubehash512_close(void *cc, void *dst)
+{
+	sph_cubehash512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h -- write 16 words (64 bytes) to dst, then reset cc */
+void
+sph_cubehash512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 16);
+	sph_cubehash512_init(cc);
+}
diff --git a/sph/echo.c b/sph/echo.c
new file mode 100644
index 00000000..de2f9040
--- /dev/null
+++ b/sph/echo.c
@@ -0,0 +1,1024 @@
+/* $Id: echo.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * ECHO implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_echo.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_ECHO
+#define SPH_SMALL_FOOTPRINT_ECHO   1
+#endif
+
+/*
+ * Some measurements tend to show that the 64-bit implementation offers
+ * better performance only on "64-bit architectures", i.e. those which
+ * have actual 64-bit registers.
+ */
+#if !defined SPH_ECHO_64 && SPH_64_TRUE
+#define SPH_ECHO_64   1
+#endif
+
+/*
+ * We can use a 64-bit implementation only if a 64-bit type is available.
+ */
+#if !SPH_64
+#undef SPH_ECHO_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#define T32   SPH_T32
+#define C32   SPH_C32
+#if SPH_64
+#define C64   SPH_C64
+#endif
+
+#define AES_BIG_ENDIAN   0
+#include "aes_helper.c"
+
+#if SPH_ECHO_64
+
+#define DECL_STATE_SMALL   \
+	sph_u64 W[16][2]; /* 16 rows, each held as two 64-bit halves */
+
+#define DECL_STATE_BIG   \
+	sph_u64 W[16][2]; /* same working array as the small variant */
+
+#define INPUT_BLOCK_SMALL(sc)   do { /* W[0..3] := state, W[4..15] := buffer */ \
+		unsigned u; \
+		memcpy(W, sc->u.Vb, 8 * sizeof(sph_u64)); \
+		for (u = 0; u < 12; u ++) { \
+			W[u + 4][0] = sph_dec64le_aligned( \
+				sc->buf + 16 * u); \
+			W[u + 4][1] = sph_dec64le_aligned( \
+				sc->buf + 16 * u + 8); \
+		} \
+	} while (0)
+
+#define INPUT_BLOCK_BIG(sc)   do { /* W[0..7] := state, W[8..15] := buffer */ \
+		unsigned u; \
+		memcpy(W, sc->u.Vb, 16 * sizeof(sph_u64)); \
+		for (u = 0; u < 8; u ++) { \
+			W[u + 8][0] = sph_dec64le_aligned( \
+				sc->buf + 16 * u); \
+			W[u + 8][1] = sph_dec64le_aligned( \
+				sc->buf + 16 * u + 8); \
+		} \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_ECHO
+
+static void
+aes_2rounds_all(sph_u64 W[16][2],
+	sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3)
+{
+	int n;
+	sph_u32 K0 = *pK0;
+	sph_u32 K1 = *pK1;
+	sph_u32 K2 = *pK2;
+	sph_u32 K3 = *pK3;
+	/* Backs BIG_SUB_WORDS: two AES rounds on each W[n]; K3:K2:K1:K0 counts up. */
+	for (n = 0; n < 16; n ++) {
+		sph_u64 Wl = W[n][0];
+		sph_u64 Wh = W[n][1];
+		sph_u32 X0 = (sph_u32)Wl; /* split the row into four 32-bit words */
+		sph_u32 X1 = (sph_u32)(Wl >> 32);
+		sph_u32 X2 = (sph_u32)Wh;
+		sph_u32 X3 = (sph_u32)(Wh >> 32);
+		sph_u32 Y0, Y1, Y2, Y3;
+		AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3);
+		AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
+		W[n][0] = (sph_u64)X0 | ((sph_u64)X1 << 32);
+		W[n][1] = (sph_u64)X2 | ((sph_u64)X3 << 32);
+		if ((K0 = T32(K0 + 1)) == 0) { /* +1 with carry into K1, K2, K3 */
+			if ((K1 = T32(K1 + 1)) == 0)
+				if ((K2 = T32(K2 + 1)) == 0)
+					K3 = T32(K3 + 1);
+		}
+	}
+	*pK0 = K0; /* hand the advanced counter back to the caller */
+	*pK1 = K1;
+	*pK2 = K2;
+	*pK3 = K3;
+}
+
+#define BIG_SUB_WORDS   do { /* small footprint: one call handles all 16 rows */ \
+		aes_2rounds_all(W, &K0, &K1, &K2, &K3); \
+	} while (0)
+
+#else
+
+#define AES_2ROUNDS(X)   do { /* two AES rounds on row X, then K3:K2:K1:K0 += 1 */ \
+		sph_u32 X0 = (sph_u32)(X[0]); \
+		sph_u32 X1 = (sph_u32)(X[0] >> 32); \
+		sph_u32 X2 = (sph_u32)(X[1]); \
+		sph_u32 X3 = (sph_u32)(X[1] >> 32); \
+		sph_u32 Y0, Y1, Y2, Y3; \
+		AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); \
+		AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); \
+		X[0] = (sph_u64)X0 | ((sph_u64)X1 << 32); \
+		X[1] = (sph_u64)X2 | ((sph_u64)X3 << 32); \
+		if ((K0 = T32(K0 + 1)) == 0) { /* carry chain of the counter */ \
+			if ((K1 = T32(K1 + 1)) == 0) \
+				if ((K2 = T32(K2 + 1)) == 0) \
+					K3 = T32(K3 + 1); \
+		} \
+	} while (0)
+
+#define BIG_SUB_WORDS   do { /* unrolled: AES_2ROUNDS on W[0] through W[15] */ \
+		AES_2ROUNDS(W[ 0]); \
+		AES_2ROUNDS(W[ 1]); \
+		AES_2ROUNDS(W[ 2]); \
+		AES_2ROUNDS(W[ 3]); \
+		AES_2ROUNDS(W[ 4]); \
+		AES_2ROUNDS(W[ 5]); \
+		AES_2ROUNDS(W[ 6]); \
+		AES_2ROUNDS(W[ 7]); \
+		AES_2ROUNDS(W[ 8]); \
+		AES_2ROUNDS(W[ 9]); \
+		AES_2ROUNDS(W[10]); \
+		AES_2ROUNDS(W[11]); \
+		AES_2ROUNDS(W[12]); \
+		AES_2ROUNDS(W[13]); \
+		AES_2ROUNDS(W[14]); \
+		AES_2ROUNDS(W[15]); \
+	} while (0)
+
+#endif
+
+#define SHIFT_ROW1(a, b, c, d)   do { /* W[a]<-W[b]<-W[c]<-W[d]<-old W[a] */ \
+		sph_u64 tmp; \
+		tmp = W[a][0]; \
+		W[a][0] = W[b][0]; \
+		W[b][0] = W[c][0]; \
+		W[c][0] = W[d][0]; \
+		W[d][0] = tmp; \
+		tmp = W[a][1]; \
+		W[a][1] = W[b][1]; \
+		W[b][1] = W[c][1]; \
+		W[c][1] = W[d][1]; \
+		W[d][1] = tmp; \
+	} while (0)
+
+#define SHIFT_ROW2(a, b, c, d)   do { /* swap W[a] with W[c], W[b] with W[d] */ \
+		sph_u64 tmp; \
+		tmp = W[a][0]; \
+		W[a][0] = W[c][0]; \
+		W[c][0] = tmp; \
+		tmp = W[b][0]; \
+		W[b][0] = W[d][0]; \
+		W[d][0] = tmp; \
+		tmp = W[a][1]; \
+		W[a][1] = W[c][1]; \
+		W[c][1] = tmp; \
+		tmp = W[b][1]; \
+		W[b][1] = W[d][1]; \
+		W[d][1] = tmp; \
+	} while (0)
+
+#define SHIFT_ROW3(a, b, c, d)   SHIFT_ROW1(d, c, b, a) /* rotate the other way */
+
+#define BIG_SHIFT_ROWS   do { /* rows 0, 4, 8, 12 are left in place */ \
+		SHIFT_ROW1(1, 5, 9, 13); \
+		SHIFT_ROW2(2, 6, 10, 14); \
+		SHIFT_ROW3(3, 7, 11, 15); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_ECHO
+
+static void
+mix_column(sph_u64 W[16][2], int ia, int ib, int ic, int id)
+{
+	/*
+	 * Mix the four rows W[ia], W[ib], W[ic], W[id], one 64-bit half
+	 * at a time. All four inputs are read before any output is stored.
+	 */
+	const sph_u64 msb = C64(0x8080808080808080);
+	const sph_u64 low7 = C64(0x7F7F7F7F7F7F7F7F);
+	int half;
+	for (half = 0; half < 2; half ++) {
+		sph_u64 *pa = &W[ia][half], *pb = &W[ib][half];
+		sph_u64 *pc = &W[ic][half], *pd = &W[id][half];
+		sph_u64 wa = *pa, wb = *pb, wc = *pc, wd = *pd;
+		sph_u64 s01 = wa ^ wb, s12 = wb ^ wc, s23 = wc ^ wd;
+		/* tNN: each byte of sNN shifted left by one, XOR 27 if its top bit was set */
+		sph_u64 t01 = ((s01 & msb) >> 7) * 27U ^ ((s01 & low7) << 1);
+		sph_u64 t12 = ((s12 & msb) >> 7) * 27U ^ ((s12 & low7) << 1);
+		sph_u64 t23 = ((s23 & msb) >> 7) * 27U ^ ((s23 & low7) << 1);
+		*pa = t01 ^ s12 ^ wd;
+		*pb = t12 ^ wa ^ s23;
+		*pc = t23 ^ s01 ^ wd;
+		*pd = t01 ^ t12 ^ t23 ^ s01 ^ wc;
+	}
+}
+
+#define MIX_COLUMN(a, b, c, d)   mix_column(W, a, b, c, d) /* small footprint: function call */
+
+#else
+
+#define MIX_COLUMN1(ia, ib, ic, id, n)   do { /* mix half n of four rows */ \
+		sph_u64 a = W[ia][n]; \
+		sph_u64 b = W[ib][n]; \
+		sph_u64 c = W[ic][n]; \
+		sph_u64 d = W[id][n]; \
+		sph_u64 ab = a ^ b; \
+		sph_u64 bc = b ^ c; \
+		sph_u64 cd = c ^ d; \
+		sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U \
+			^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); /* per-byte: shift left, XOR 27 on carry */ \
+		sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U \
+			^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); \
+		sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U \
+			^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); \
+		W[ia][n] = abx ^ bc ^ d; \
+		W[ib][n] = bcx ^ a ^ cd; \
+		W[ic][n] = cdx ^ ab ^ d; \
+		W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \
+	} while (0)
+
+#define MIX_COLUMN(a, b, c, d)   do { /* both 64-bit halves of the rows */ \
+		MIX_COLUMN1(a, b, c, d, 0); \
+		MIX_COLUMN1(a, b, c, d, 1); \
+	} while (0)
+
+#endif
+
+#define BIG_MIX_COLUMNS   do { /* four groups of four consecutive rows */ \
+		MIX_COLUMN(0, 1, 2, 3); \
+		MIX_COLUMN(4, 5, 6, 7); \
+		MIX_COLUMN(8, 9, 10, 11); \
+		MIX_COLUMN(12, 13, 14, 15); \
+	} while (0)
+
+#define BIG_ROUND   do { /* one full round on W; also advances K0..K3 */ \
+		BIG_SUB_WORDS; \
+		BIG_SHIFT_ROWS; \
+		BIG_MIX_COLUMNS; \
+	} while (0)
+
+#define FINAL_SMALL   do { /* fold three 64-byte buffer slices and all of W into V */ \
+		unsigned u; \
+		sph_u64 *VV = &sc->u.Vb[0][0]; \
+		sph_u64 *WW = &W[0][0]; \
+		for (u = 0; u < 8; u ++) { \
+			VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \
+				^ sph_dec64le_aligned(sc->buf + (u * 8) + 64) \
+				^ sph_dec64le_aligned(sc->buf + (u * 8) + 128) \
+				^ WW[u] ^ WW[u + 8] \
+				^ WW[u + 16] ^ WW[u + 24]; \
+		} \
+	} while (0)
+
+#define FINAL_BIG   do { /* fold the first 128 buffer bytes and both halves of W into V */ \
+		unsigned u; \
+		sph_u64 *VV = &sc->u.Vb[0][0]; \
+		sph_u64 *WW = &W[0][0]; \
+		for (u = 0; u < 16; u ++) { \
+			VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \
+				^ WW[u] ^ WW[u + 16]; \
+		} \
+	} while (0)
+
+#define COMPRESS_SMALL(sc)   do { /* load W, 8 rounds, fold back into the state */ \
+		sph_u32 K0 = sc->C0; /* round counter starts at the context counter */ \
+		sph_u32 K1 = sc->C1; \
+		sph_u32 K2 = sc->C2; \
+		sph_u32 K3 = sc->C3; \
+		unsigned u; \
+		INPUT_BLOCK_SMALL(sc); \
+		for (u = 0; u < 8; u ++) { \
+			BIG_ROUND; \
+		} \
+		FINAL_SMALL; \
+	} while (0)
+
+#define COMPRESS_BIG(sc)   do { /* load W, 10 rounds, fold back into the state */ \
+		sph_u32 K0 = sc->C0; \
+		sph_u32 K1 = sc->C1; \
+		sph_u32 K2 = sc->C2; \
+		sph_u32 K3 = sc->C3; \
+		unsigned u; \
+		INPUT_BLOCK_BIG(sc); \
+		for (u = 0; u < 10; u ++) { \
+			BIG_ROUND; \
+		} \
+		FINAL_BIG; \
+	} while (0)
+
+#else
+
+#define DECL_STATE_SMALL   \
+	sph_u32 W[16][4]; /* 16 rows, each held as four 32-bit words */
+
+#define DECL_STATE_BIG   \
+	sph_u32 W[16][4]; /* same working array as the small variant */
+
+#define INPUT_BLOCK_SMALL(sc)   do { /* W[0..3] := state, W[4..15] := buffer */ \
+		unsigned u; \
+		memcpy(W, sc->u.Vs, 16 * sizeof(sph_u32)); \
+		for (u = 0; u < 12; u ++) { \
+			W[u + 4][0] = sph_dec32le_aligned( \
+				sc->buf + 16 * u); \
+			W[u + 4][1] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 4); \
+			W[u + 4][2] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 8); \
+			W[u + 4][3] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 12); \
+		} \
+	} while (0)
+
+#define INPUT_BLOCK_BIG(sc)   do { /* W[0..7] := state, W[8..15] := buffer */ \
+		unsigned u; \
+		memcpy(W, sc->u.Vs, 32 * sizeof(sph_u32)); \
+		for (u = 0; u < 8; u ++) { \
+			W[u + 8][0] = sph_dec32le_aligned( \
+				sc->buf + 16 * u); \
+			W[u + 8][1] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 4); \
+			W[u + 8][2] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 8); \
+			W[u + 8][3] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 12); \
+		} \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_ECHO
+
+static void
+aes_2rounds_all(sph_u32 W[16][4],
+	sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3)
+{
+	int n;
+	sph_u32 K0 = *pK0;
+	sph_u32 K1 = *pK1;
+	sph_u32 K2 = *pK2;
+	sph_u32 K3 = *pK3;
+	/* Backs BIG_SUB_WORDS: two AES rounds on each W[n]; K3:K2:K1:K0 counts up. */
+	for (n = 0; n < 16; n ++) {
+		sph_u32 *X = W[n]; /* the row is updated in place */
+		sph_u32 Y0, Y1, Y2, Y3;
+		AES_ROUND_LE(X[0], X[1], X[2], X[3],
+			K0, K1, K2, K3, Y0, Y1, Y2, Y3);
+		AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]);
+		if ((K0 = T32(K0 + 1)) == 0) { /* +1 with carry into K1, K2, K3 */
+			if ((K1 = T32(K1 + 1)) == 0)
+				if ((K2 = T32(K2 + 1)) == 0)
+					K3 = T32(K3 + 1);
+		}
+	}
+	*pK0 = K0; /* hand the advanced counter back to the caller */
+	*pK1 = K1;
+	*pK2 = K2;
+	*pK3 = K3;
+}
+
+#define BIG_SUB_WORDS   do { /* small footprint: one call handles all 16 rows */ \
+		aes_2rounds_all(W, &K0, &K1, &K2, &K3); \
+	} while (0)
+
+#else
+
+#define AES_2ROUNDS(X)   do { /* two AES rounds on row X, then K3:K2:K1:K0 += 1 */ \
+		sph_u32 Y0, Y1, Y2, Y3; \
+		AES_ROUND_LE(X[0], X[1], X[2], X[3], \
+			K0, K1, K2, K3, Y0, Y1, Y2, Y3); \
+		AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); \
+		if ((K0 = T32(K0 + 1)) == 0) { /* carry chain of the counter */ \
+			if ((K1 = T32(K1 + 1)) == 0) \
+				if ((K2 = T32(K2 + 1)) == 0) \
+					K3 = T32(K3 + 1); \
+		} \
+	} while (0)
+
+#define BIG_SUB_WORDS   do { /* unrolled: AES_2ROUNDS on W[0] through W[15] */ \
+		AES_2ROUNDS(W[ 0]); \
+		AES_2ROUNDS(W[ 1]); \
+		AES_2ROUNDS(W[ 2]); \
+		AES_2ROUNDS(W[ 3]); \
+		AES_2ROUNDS(W[ 4]); \
+		AES_2ROUNDS(W[ 5]); \
+		AES_2ROUNDS(W[ 6]); \
+		AES_2ROUNDS(W[ 7]); \
+		AES_2ROUNDS(W[ 8]); \
+		AES_2ROUNDS(W[ 9]); \
+		AES_2ROUNDS(W[10]); \
+		AES_2ROUNDS(W[11]); \
+		AES_2ROUNDS(W[12]); \
+		AES_2ROUNDS(W[13]); \
+		AES_2ROUNDS(W[14]); \
+		AES_2ROUNDS(W[15]); \
+	} while (0)
+
+#endif
+
+#define SHIFT_ROW1(a, b, c, d)   do { /* W[a]<-W[b]<-W[c]<-W[d]<-old W[a] */ \
+		sph_u32 tmp; \
+		tmp = W[a][0]; \
+		W[a][0] = W[b][0]; \
+		W[b][0] = W[c][0]; \
+		W[c][0] = W[d][0]; \
+		W[d][0] = tmp; \
+		tmp = W[a][1]; \
+		W[a][1] = W[b][1]; \
+		W[b][1] = W[c][1]; \
+		W[c][1] = W[d][1]; \
+		W[d][1] = tmp; \
+		tmp = W[a][2]; \
+		W[a][2] = W[b][2]; \
+		W[b][2] = W[c][2]; \
+		W[c][2] = W[d][2]; \
+		W[d][2] = tmp; \
+		tmp = W[a][3]; \
+		W[a][3] = W[b][3]; \
+		W[b][3] = W[c][3]; \
+		W[c][3] = W[d][3]; \
+		W[d][3] = tmp; \
+	} while (0)
+
+#define SHIFT_ROW2(a, b, c, d)   do { /* swap W[a] with W[c], W[b] with W[d] */ \
+		sph_u32 tmp; \
+		tmp = W[a][0]; \
+		W[a][0] = W[c][0]; \
+		W[c][0] = tmp; \
+		tmp = W[b][0]; \
+		W[b][0] = W[d][0]; \
+		W[d][0] = tmp; \
+		tmp = W[a][1]; \
+		W[a][1] = W[c][1]; \
+		W[c][1] = tmp; \
+		tmp = W[b][1]; \
+		W[b][1] = W[d][1]; \
+		W[d][1] = tmp; \
+		tmp = W[a][2]; \
+		W[a][2] = W[c][2]; \
+		W[c][2] = tmp; \
+		tmp = W[b][2]; \
+		W[b][2] = W[d][2]; \
+		W[d][2] = tmp; \
+		tmp = W[a][3]; \
+		W[a][3] = W[c][3]; \
+		W[c][3] = tmp; \
+		tmp = W[b][3]; \
+		W[b][3] = W[d][3]; \
+		W[d][3] = tmp; \
+	} while (0)
+
+#define SHIFT_ROW3(a, b, c, d)   SHIFT_ROW1(d, c, b, a) /* rotate the other way */
+
+#define BIG_SHIFT_ROWS   do { /* rows 0, 4, 8, 12 are left in place */ \
+		SHIFT_ROW1(1, 5, 9, 13); \
+		SHIFT_ROW2(2, 6, 10, 14); \
+		SHIFT_ROW3(3, 7, 11, 15); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_ECHO
+
+static void
+mix_column(sph_u32 W[16][4], int ia, int ib, int ic, int id)
+{
+	/*
+	 * Mix the four rows W[ia], W[ib], W[ic], W[id], one 32-bit lane
+	 * at a time. All four inputs are read before any output is stored.
+	 */
+	const sph_u32 msb = C32(0x80808080);
+	const sph_u32 low7 = C32(0x7F7F7F7F);
+	int lane;
+	for (lane = 0; lane < 4; lane ++) {
+		sph_u32 *pa = &W[ia][lane], *pb = &W[ib][lane];
+		sph_u32 *pc = &W[ic][lane], *pd = &W[id][lane];
+		sph_u32 wa = *pa, wb = *pb, wc = *pc, wd = *pd;
+		sph_u32 s01 = wa ^ wb, s12 = wb ^ wc, s23 = wc ^ wd;
+		/* tNN: each byte of sNN shifted left by one, XOR 27 if its top bit was set */
+		sph_u32 t01 = ((s01 & msb) >> 7) * 27U ^ ((s01 & low7) << 1);
+		sph_u32 t12 = ((s12 & msb) >> 7) * 27U ^ ((s12 & low7) << 1);
+		sph_u32 t23 = ((s23 & msb) >> 7) * 27U ^ ((s23 & low7) << 1);
+		*pa = t01 ^ s12 ^ wd;
+		*pb = t12 ^ wa ^ s23;
+		*pc = t23 ^ s01 ^ wd;
+		*pd = t01 ^ t12 ^ t23 ^ s01 ^ wc;
+	}
+}
+
+#define MIX_COLUMN(a, b, c, d)   mix_column(W, a, b, c, d) /* small footprint: function call */
+
+#else
+
+#define MIX_COLUMN1(ia, ib, ic, id, n)   do { /* mix lane n of four rows */ \
+		sph_u32 a = W[ia][n]; \
+		sph_u32 b = W[ib][n]; \
+		sph_u32 c = W[ic][n]; \
+		sph_u32 d = W[id][n]; \
+		sph_u32 ab = a ^ b; \
+		sph_u32 bc = b ^ c; \
+		sph_u32 cd = c ^ d; \
+		sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U \
+			^ ((ab & C32(0x7F7F7F7F)) << 1); /* per-byte: shift left, XOR 27 on carry */ \
+		sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U \
+			^ ((bc & C32(0x7F7F7F7F)) << 1); \
+		sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U \
+			^ ((cd & C32(0x7F7F7F7F)) << 1); \
+		W[ia][n] = abx ^ bc ^ d; \
+		W[ib][n] = bcx ^ a ^ cd; \
+		W[ic][n] = cdx ^ ab ^ d; \
+		W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \
+	} while (0)
+
+#define MIX_COLUMN(a, b, c, d)   do { /* all four 32-bit lanes of the rows */ \
+		MIX_COLUMN1(a, b, c, d, 0); \
+		MIX_COLUMN1(a, b, c, d, 1); \
+		MIX_COLUMN1(a, b, c, d, 2); \
+		MIX_COLUMN1(a, b, c, d, 3); \
+	} while (0)
+
+#endif
+
+#define BIG_MIX_COLUMNS   do { /* four groups of four consecutive rows */ \
+		MIX_COLUMN(0, 1, 2, 3); \
+		MIX_COLUMN(4, 5, 6, 7); \
+		MIX_COLUMN(8, 9, 10, 11); \
+		MIX_COLUMN(12, 13, 14, 15); \
+	} while (0)
+
+#define BIG_ROUND   do { /* one full round on W; also advances K0..K3 */ \
+		BIG_SUB_WORDS; \
+		BIG_SHIFT_ROWS; \
+		BIG_MIX_COLUMNS; \
+	} while (0)
+
+#define FINAL_SMALL   do { /* fold three 64-byte buffer slices and all of W into V */ \
+		unsigned u; \
+		sph_u32 *VV = &sc->u.Vs[0][0]; \
+		sph_u32 *WW = &W[0][0]; \
+		for (u = 0; u < 16; u ++) { \
+			VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \
+				^ sph_dec32le_aligned(sc->buf + (u * 4) + 64) \
+				^ sph_dec32le_aligned(sc->buf + (u * 4) + 128) \
+				^ WW[u] ^ WW[u + 16] \
+				^ WW[u + 32] ^ WW[u + 48]; \
+		} \
+	} while (0)
+
+#define FINAL_BIG   do { /* fold the first 128 buffer bytes and both halves of W into V */ \
+		unsigned u; \
+		sph_u32 *VV = &sc->u.Vs[0][0]; \
+		sph_u32 *WW = &W[0][0]; \
+		for (u = 0; u < 32; u ++) { \
+			VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \
+				^ WW[u] ^ WW[u + 32]; \
+		} \
+	} while (0)
+
+#define COMPRESS_SMALL(sc)   do { /* load W, 8 rounds, fold back into the state */ \
+		sph_u32 K0 = sc->C0; /* round counter starts at the context counter */ \
+		sph_u32 K1 = sc->C1; \
+		sph_u32 K2 = sc->C2; \
+		sph_u32 K3 = sc->C3; \
+		unsigned u; \
+		INPUT_BLOCK_SMALL(sc); \
+		for (u = 0; u < 8; u ++) { \
+			BIG_ROUND; \
+		} \
+		FINAL_SMALL; \
+	} while (0)
+
+#define COMPRESS_BIG(sc)   do { /* load W, 10 rounds, fold back into the state */ \
+		sph_u32 K0 = sc->C0; \
+		sph_u32 K1 = sc->C1; \
+		sph_u32 K2 = sc->C2; \
+		sph_u32 K3 = sc->C3; \
+		unsigned u; \
+		INPUT_BLOCK_BIG(sc); \
+		for (u = 0; u < 10; u ++) { \
+			BIG_ROUND; \
+		} \
+		FINAL_BIG; \
+	} while (0)
+
+#endif
+
+#define INCR_COUNTER(sc, val)   do { /* C3:C2:C1:C0 += val; note val is evaluated twice */ \
+		sc->C0 = T32(sc->C0 + (sph_u32)(val)); \
+		if (sc->C0 < (sph_u32)(val)) { /* C0 wrapped around: carry upwards */ \
+			if ((sc->C1 = T32(sc->C1 + 1)) == 0) \
+				if ((sc->C2 = T32(sc->C2 + 1)) == 0) \
+					sc->C3 = T32(sc->C3 + 1); \
+		} \
+	} while (0)
+
+static void
+echo_small_init(sph_echo_small_context *sc, unsigned out_len)
+{ /* Reset sc for a new computation; callers pass out_len in bits. */
+#if SPH_ECHO_64
+	sc->u.Vb[0][0] = (sph_u64)out_len; /* each of the 4 state rows := out_len */
+	sc->u.Vb[0][1] = 0;
+	sc->u.Vb[1][0] = (sph_u64)out_len;
+	sc->u.Vb[1][1] = 0;
+	sc->u.Vb[2][0] = (sph_u64)out_len;
+	sc->u.Vb[2][1] = 0;
+	sc->u.Vb[3][0] = (sph_u64)out_len;
+	sc->u.Vb[3][1] = 0;
+#else
+	sc->u.Vs[0][0] = (sph_u32)out_len; /* same values, stored as 4 x 32 bits per row */
+	sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0;
+	sc->u.Vs[1][0] = (sph_u32)out_len;
+	sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0;
+	sc->u.Vs[2][0] = (sph_u32)out_len;
+	sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0;
+	sc->u.Vs[3][0] = (sph_u32)out_len;
+	sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0;
+#endif
+	sc->ptr = 0; /* no buffered input */
+	sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; /* bit counter back to zero */
+}
+
+static void
+echo_big_init(sph_echo_big_context *sc, unsigned out_len)
+{ /* Reset sc for a new computation; callers pass out_len in bits. */
+#if SPH_ECHO_64
+	sc->u.Vb[0][0] = (sph_u64)out_len; /* each of the 8 state rows := out_len */
+	sc->u.Vb[0][1] = 0;
+	sc->u.Vb[1][0] = (sph_u64)out_len;
+	sc->u.Vb[1][1] = 0;
+	sc->u.Vb[2][0] = (sph_u64)out_len;
+	sc->u.Vb[2][1] = 0;
+	sc->u.Vb[3][0] = (sph_u64)out_len;
+	sc->u.Vb[3][1] = 0;
+	sc->u.Vb[4][0] = (sph_u64)out_len;
+	sc->u.Vb[4][1] = 0;
+	sc->u.Vb[5][0] = (sph_u64)out_len;
+	sc->u.Vb[5][1] = 0;
+	sc->u.Vb[6][0] = (sph_u64)out_len;
+	sc->u.Vb[6][1] = 0;
+	sc->u.Vb[7][0] = (sph_u64)out_len;
+	sc->u.Vb[7][1] = 0;
+#else
+	sc->u.Vs[0][0] = (sph_u32)out_len; /* same values, stored as 4 x 32 bits per row */
+	sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0;
+	sc->u.Vs[1][0] = (sph_u32)out_len;
+	sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0;
+	sc->u.Vs[2][0] = (sph_u32)out_len;
+	sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0;
+	sc->u.Vs[3][0] = (sph_u32)out_len;
+	sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0;
+	sc->u.Vs[4][0] = (sph_u32)out_len;
+	sc->u.Vs[4][1] = sc->u.Vs[4][2] = sc->u.Vs[4][3] = 0;
+	sc->u.Vs[5][0] = (sph_u32)out_len;
+	sc->u.Vs[5][1] = sc->u.Vs[5][2] = sc->u.Vs[5][3] = 0;
+	sc->u.Vs[6][0] = (sph_u32)out_len;
+	sc->u.Vs[6][1] = sc->u.Vs[6][2] = sc->u.Vs[6][3] = 0;
+	sc->u.Vs[7][0] = (sph_u32)out_len;
+	sc->u.Vs[7][1] = sc->u.Vs[7][2] = sc->u.Vs[7][3] = 0;
+#endif
+	sc->ptr = 0; /* no buffered input */
+	sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; /* bit counter back to zero */
+}
+
+static void
+echo_small_compress(sph_echo_small_context *sc)
+{
+	DECL_STATE_SMALL
+	/* Compress sc->buf into the state; W above is the scratch array used. */
+	COMPRESS_SMALL(sc);
+}
+
+static void
+echo_big_compress(sph_echo_big_context *sc)
+{
+	DECL_STATE_BIG
+	/* Compress sc->buf into the state; W above is the scratch array used. */
+	COMPRESS_BIG(sc);
+}
+
+static void
+echo_small_core(sph_echo_small_context *sc,
+	const unsigned char *data, size_t len)
+{
+	const size_t block_len = sizeof sc->buf;
+	size_t fill = sc->ptr;
+
+	/* Not enough input to complete the block: just stash the bytes. */
+	if (len < block_len - fill) {
+		memcpy(sc->buf + fill, data, len);
+		sc->ptr = fill + len;
+		return;
+	}
+
+	/*
+	 * Top up the buffer chunk by chunk; each time it fills, advance
+	 * the counter by 1536 and compress the block.
+	 */
+	while (len > 0) {
+		size_t room = block_len - fill;
+		size_t take = (room > len) ? len : room;
+
+		memcpy(sc->buf + fill, data, take);
+		fill += take;
+		data += take;
+		len -= take;
+		if (fill == block_len) {
+			INCR_COUNTER(sc, 1536);
+			echo_small_compress(sc);
+			fill = 0;
+		}
+	}
+	sc->ptr = fill;
+}
+
+static void
+echo_big_core(sph_echo_big_context *sc,
+	const unsigned char *data, size_t len)
+{
+	const size_t block_len = sizeof sc->buf;
+	size_t fill = sc->ptr;
+
+	/* Not enough input to complete the block: just stash the bytes. */
+	if (len < block_len - fill) {
+		memcpy(sc->buf + fill, data, len);
+		sc->ptr = fill + len;
+		return;
+	}
+
+	/*
+	 * Top up the buffer chunk by chunk; each time it fills, advance
+	 * the counter by 1024 and compress the block.
+	 */
+	while (len > 0) {
+		size_t room = block_len - fill;
+		size_t take = (room > len) ? len : room;
+
+		memcpy(sc->buf + fill, data, take);
+		fill += take;
+		data += take;
+		len -= take;
+		if (fill == block_len) {
+			INCR_COUNTER(sc, 1024);
+			echo_big_compress(sc);
+			fill = 0;
+		}
+	}
+	sc->ptr = fill;
+}
+
+static void
+echo_small_close(sph_echo_small_context *sc, unsigned ub, unsigned n,
+	void *dst, unsigned out_size_w32)
+{
+	unsigned char *buf;
+	size_t ptr;
+	unsigned z;
+	unsigned elen;
+	union { /* NOTE(review): dummy members presumably only force alignment of tmp */
+		unsigned char tmp[32];
+		sph_u32 dummy;
+#if SPH_ECHO_64
+		sph_u64 dummy2;
+#endif
+	} u;
+#if SPH_ECHO_64
+	sph_u64 *VV;
+#else
+	sph_u32 *VV;
+#endif
+	unsigned k;
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	elen = ((unsigned)ptr << 3) + n; /* message bits held in the last block */
+	INCR_COUNTER(sc, elen);
+	sph_enc32le_aligned(u.tmp, sc->C0); /* save the total before any reset below */
+	sph_enc32le_aligned(u.tmp + 4, sc->C1);
+	sph_enc32le_aligned(u.tmp + 8, sc->C2);
+	sph_enc32le_aligned(u.tmp + 12, sc->C3);
+	/*
+	 * If elen is zero, then this block actually contains no message
+	 * bit, only the first padding bit: it is compressed with a zero
+	 * counter (the real total was already saved in u.tmp above).
+	 */
+	if (elen == 0) {
+		sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0;
+	}
+	z = 0x80 >> n; /* padding bit sits just below the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	if (ptr > ((sizeof sc->buf) - 18)) { /* no room for the 18-byte trailer */
+		echo_small_compress(sc);
+		sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0;
+		memset(buf, 0, sizeof sc->buf);
+	}
+	sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); /* output bits */
+	memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); /* saved 128-bit counter */
+	echo_small_compress(sc);
+#if SPH_ECHO_64
+	for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++)
+		sph_enc64le_aligned(u.tmp + (k << 3), VV[k]);
+#else
+	for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++)
+		sph_enc32le_aligned(u.tmp + (k << 2), VV[k]);
+#endif
+	memcpy(dst, u.tmp, out_size_w32 << 2); /* 4 output bytes per word */
+	echo_small_init(sc, out_size_w32 << 5); /* leave the context ready for reuse */
+}
+
+static void
+echo_big_close(sph_echo_big_context *sc, unsigned ub, unsigned n,
+	void *dst, unsigned out_size_w32)
+{
+	unsigned char *buf;
+	size_t ptr;
+	unsigned z;
+	unsigned elen;
+	union { /* NOTE(review): dummy members presumably only force alignment of tmp */
+		unsigned char tmp[64];
+		sph_u32 dummy;
+#if SPH_ECHO_64
+		sph_u64 dummy2;
+#endif
+	} u;
+#if SPH_ECHO_64
+	sph_u64 *VV;
+#else
+	sph_u32 *VV;
+#endif
+	unsigned k;
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	elen = ((unsigned)ptr << 3) + n; /* message bits held in the last block */
+	INCR_COUNTER(sc, elen);
+	sph_enc32le_aligned(u.tmp, sc->C0); /* save the total before any reset below */
+	sph_enc32le_aligned(u.tmp + 4, sc->C1);
+	sph_enc32le_aligned(u.tmp + 8, sc->C2);
+	sph_enc32le_aligned(u.tmp + 12, sc->C3);
+	/*
+	 * If elen is zero, then this block actually contains no message
+	 * bit, only the first padding bit: it is compressed with a zero
+	 * counter (the real total was already saved in u.tmp above).
+	 */
+	if (elen == 0) {
+		sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0;
+	}
+	z = 0x80 >> n; /* padding bit sits just below the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	if (ptr > ((sizeof sc->buf) - 18)) { /* no room for the 18-byte trailer */
+		echo_big_compress(sc);
+		sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0;
+		memset(buf, 0, sizeof sc->buf);
+	}
+	sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); /* output bits */
+	memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); /* saved 128-bit counter */
+	echo_big_compress(sc);
+#if SPH_ECHO_64
+	for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++)
+		sph_enc64le_aligned(u.tmp + (k << 3), VV[k]);
+#else
+	for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++)
+		sph_enc32le_aligned(u.tmp + (k << 2), VV[k]);
+#endif
+	memcpy(dst, u.tmp, out_size_w32 << 2); /* 4 output bytes per word */
+	echo_big_init(sc, out_size_w32 << 5); /* leave the context ready for reuse */
+}
+
+/* see sph_echo.h -- reset cc for a 224-bit output */
+void
+sph_echo224_init(void *cc)
+{
+	echo_small_init(cc, 224);
+}
+
+/* see sph_echo.h -- feed len bytes of data; all work is in echo_small_core */
+void
+sph_echo224(void *cc, const void *data, size_t len)
+{
+	echo_small_core(cc, data, len);
+}
+
+/* see sph_echo.h -- finish with no extra bits; writes 28 bytes and resets cc */
+void
+sph_echo224_close(void *cc, void *dst)
+{
+	echo_small_close(cc, 0, 0, dst, 7);
+}
+
+/* see sph_echo.h -- finish after n extra bits from ub; writes 28 bytes */
+void
+sph_echo224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_small_close(cc, ub, n, dst, 7);
+}
+
+/* see sph_echo.h -- reset cc for a 256-bit output */
+void
+sph_echo256_init(void *cc)
+{
+	echo_small_init(cc, 256);
+}
+
+/* see sph_echo.h -- feed len bytes of data; all work is in echo_small_core */
+void
+sph_echo256(void *cc, const void *data, size_t len)
+{
+	echo_small_core(cc, data, len);
+}
+
+/* see sph_echo.h -- finish with no extra bits; writes 32 bytes and resets cc */
+void
+sph_echo256_close(void *cc, void *dst)
+{
+	echo_small_close(cc, 0, 0, dst, 8);
+}
+
+/* see sph_echo.h -- finish after n extra bits from ub; writes 32 bytes */
+void
+sph_echo256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_small_close(cc, ub, n, dst, 8);
+}
+
+/* see sph_echo.h -- reset cc for a 384-bit output */
+void
+sph_echo384_init(void *cc)
+{
+	echo_big_init(cc, 384);
+}
+
+/* see sph_echo.h -- feed len bytes of data; all work is in echo_big_core */
+void
+sph_echo384(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h -- finish with no extra bits; writes 48 bytes and resets cc */
+void
+sph_echo384_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 12);
+}
+
+/* see sph_echo.h -- finish after n extra bits from ub; writes 48 bytes */
+void
+sph_echo384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 12);
+}
+
+/* see sph_echo.h -- reset cc for a 512-bit output */
+void
+sph_echo512_init(void *cc)
+{
+	echo_big_init(cc, 512);
+}
+
+/* see sph_echo.h -- feed len bytes of data; all work is in echo_big_core */
+void
+sph_echo512(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h -- finish with no extra bits; writes 64 bytes and resets cc */
+void
+sph_echo512_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 16);
+}
+
+/* see sph_echo.h -- finish after n extra bits from ub; writes 64 bytes */
+void
+sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 16);
+}
diff --git a/sph/groestl.c b/sph/groestl.c
new file mode 100644
index 00000000..3e83961d
--- /dev/null
+++ b/sph/groestl.c
@@ -0,0 +1,3115 @@
+/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */
+/*
+ * Groestl implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_groestl.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_GROESTL
+#define SPH_SMALL_FOOTPRINT_GROESTL   1
+#endif /* default: follow SPH_SMALL_FOOTPRINT unless the Groestl-specific macro is already defined */
+
+/*
+ * Apparently, the 32-bit-only version is not faster than the 64-bit
+ * version unless using the "small footprint" code on a 32-bit machine.
+ */
+#if !defined SPH_GROESTL_64 /* a value predefined by the build is kept as is */
+#if SPH_SMALL_FOOTPRINT_GROESTL && !SPH_64_TRUE
+#define SPH_GROESTL_64   0
+#else
+#define SPH_GROESTL_64   1
+#endif
+#endif /* !defined SPH_GROESTL_64 */
+
+#if !SPH_64
+#undef SPH_GROESTL_64
+#endif /* SPH_64 false: SPH_GROESTL_64 ends up undefined, so "#if SPH_GROESTL_64" is false */
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif /* _MSC_VER: silences C4146 (unary minus applied to an unsigned type) */
+
+/*
+ * The internal representation may use either big-endian or
+ * little-endian. Using the platform default representation speeds up
+ * encoding and decoding between bytes and the matrix columns.
+ */
+/* USE_LE: SPH_GROESTL_LITTLE_ENDIAN, then SPH_GROESTL_BIG_ENDIAN, are tested before SPH_LITTLE_ENDIAN. */
+#undef USE_LE
+#if SPH_GROESTL_LITTLE_ENDIAN
+#define USE_LE   1
+#elif SPH_GROESTL_BIG_ENDIAN
+#define USE_LE   0
+#elif SPH_LITTLE_ENDIAN
+#define USE_LE   1
+#endif /* none set: USE_LE stays undefined, so "#if USE_LE" selects the big-endian macros */
+
+#if USE_LE
+/* Little-endian flavour: C32e() byte-swaps a 32-bit constant; B32_n() is byte n from the low end. */
+#define C32e(x)     ((SPH_C32(x) >> 24) \
+                    | ((SPH_C32(x) >>  8) & SPH_C32(0x0000FF00)) \
+                    | ((SPH_C32(x) <<  8) & SPH_C32(0x00FF0000)) \
+                    | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+#define dec32e_aligned   sph_dec32le_aligned
+#define enc32e           sph_enc32le
+#define B32_0(x)    ((x) & 0xFF)
+#define B32_1(x)    (((x) >> 8) & 0xFF)
+#define B32_2(x)    (((x) >> 16) & 0xFF)
+#define B32_3(x)    ((x) >> 24)
+/* R32u/R32d: assemble one 32-bit word from 16-bit halves of u and d. */
+#define R32u(u, d)   SPH_T32(((u) << 16) | ((d) >> 16))
+#define R32d(u, d)   SPH_T32(((u) >> 16) | ((d) << 16))
+/* Presumably the P/Q round constants for column term j and round r -- confirm at the call sites. */
+#define PC32up(j, r)   ((sph_u32)((j) + (r)))
+#define PC32dn(j, r)   0
+#define QC32up(j, r)   SPH_C32(0xFFFFFFFF)
+#define QC32dn(j, r)   (((sph_u32)(r) << 24) ^ SPH_T32(~((sph_u32)(j) << 24)))
+/* 64-bit counterparts, defined only when SPH_64 is true; R64 is a left rotation in this flavour. */
+#if SPH_64
+#define C64e(x)     ((SPH_C64(x) >> 56) \
+                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
+                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
+                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
+                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
+                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
+                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
+                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
+#define dec64e_aligned   sph_dec64le_aligned
+#define enc64e           sph_enc64le
+#define B64_0(x)    ((x) & 0xFF)
+#define B64_1(x)    (((x) >> 8) & 0xFF)
+#define B64_2(x)    (((x) >> 16) & 0xFF)
+#define B64_3(x)    (((x) >> 24) & 0xFF)
+#define B64_4(x)    (((x) >> 32) & 0xFF)
+#define B64_5(x)    (((x) >> 40) & 0xFF)
+#define B64_6(x)    (((x) >> 48) & 0xFF)
+#define B64_7(x)    ((x) >> 56)
+#define R64         SPH_ROTL64
+#define PC64(j, r)  ((sph_u64)((j) + (r)))
+#define QC64(j, r)  (((sph_u64)(r) << 56) ^ SPH_T64(~((sph_u64)(j) << 56)))
+#endif /* SPH_64 */
+
+#else /* !USE_LE */
+/* Big-endian flavour: C32e() leaves a constant as written; B32_n() is byte n from the high end. */
+#define C32e(x)     SPH_C32(x)
+#define dec32e_aligned   sph_dec32be_aligned
+#define enc32e           sph_enc32be
+#define B32_0(x)    ((x) >> 24)
+#define B32_1(x)    (((x) >> 16) & 0xFF)
+#define B32_2(x)    (((x) >> 8) & 0xFF)
+#define B32_3(x)    ((x) & 0xFF)
+/* R32u/R32d: same role as in the little-endian flavour, with the shift directions exchanged. */
+#define R32u(u, d)   SPH_T32(((u) >> 16) | ((d) << 16))
+#define R32d(u, d)   SPH_T32(((u) << 16) | ((d) >> 16))
+/* Presumably the P/Q round constants for column term j and round r -- confirm at the call sites. */
+#define PC32up(j, r)   ((sph_u32)((j) + (r)) << 24)
+#define PC32dn(j, r)   0
+#define QC32up(j, r)   SPH_C32(0xFFFFFFFF)
+#define QC32dn(j, r)   ((sph_u32)(r) ^ SPH_T32(~(sph_u32)(j)))
+/* 64-bit counterparts, defined only when SPH_64 is true; R64 is a right rotation in this flavour. */
+#if SPH_64
+#define C64e(x)     SPH_C64(x)
+#define dec64e_aligned   sph_dec64be_aligned
+#define enc64e           sph_enc64be
+#define B64_0(x)    ((x) >> 56)
+#define B64_1(x)    (((x) >> 48) & 0xFF)
+#define B64_2(x)    (((x) >> 40) & 0xFF)
+#define B64_3(x)    (((x) >> 32) & 0xFF)
+#define B64_4(x)    (((x) >> 24) & 0xFF)
+#define B64_5(x)    (((x) >> 16) & 0xFF)
+#define B64_6(x)    (((x) >> 8) & 0xFF)
+#define B64_7(x)    ((x) & 0xFF)
+#define R64         SPH_ROTR64
+#define PC64(j, r)  ((sph_u64)((j) + (r)) << 56)
+#define QC64(j, r)  ((sph_u64)(r) ^ SPH_T64(~(sph_u64)(j)))
+#endif /* SPH_64 */
+
+#endif /* USE_LE */
+
+#if SPH_GROESTL_64
+
+static const sph_u64 T0[] = { /* 256 precomputed 64-bit constants; C64e() byte-swaps each one when USE_LE is set */
+	C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8),
+	C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6),
+	C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6),
+	C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491),
+	C64e(0x6090f050f0c05060), C64e(0x0207050305040302),
+	C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56),
+	C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5),
+	C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec),
+	C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f),
+	C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa),
+	C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2),
+	C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb),
+	C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3),
+	C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45),
+	C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753),
+	C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b),
+	C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1),
+	C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c),
+	C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e),
+	C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83),
+	C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451),
+	C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9),
+	C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab),
+	C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a),
+	C64e(0x081c140c14100c08), C64e(0x9563f652f6315295),
+	C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d),
+	C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137),
+	C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f),
+	C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624),
+	C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf),
+	C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e),
+	C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea),
+	C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d),
+	C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34),
+	C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc),
+	C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b),
+	C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76),
+	C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d),
+	C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd),
+	C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713),
+	C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9),
+	C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1),
+	C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3),
+	C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6),
+	C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d),
+	C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72),
+	C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498),
+	C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85),
+	C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5),
+	C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed),
+	C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a),
+	C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411),
+	C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9),
+	C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe),
+	C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478),
+	C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b),
+	C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d),
+	C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05),
+	C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21),
+	C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1),
+	C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177),
+	C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342),
+	C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5),
+	C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf),
+	C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418),
+	C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3),
+	C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235),
+	C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e),
+	C64e(0x936af957f93d5793), C64e(0x55580df20daaf255),
+	C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a),
+	C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba),
+	C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6),
+	C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819),
+	C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3),
+	C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54),
+	C64e(0x3bdde6abe676ab3b), C64e(0x0b959e839e16830b),
+	C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7),
+	C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28),
+	C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc),
+	C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad),
+	C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664),
+	C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14),
+	C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c),
+	C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8),
+	C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd),
+	C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4),
+	C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431),
+	C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2),
+	C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b),
+	C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da),
+	C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1),
+	C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049),
+	C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac),
+	C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf),
+	C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4),
+	C64e(0x476720e9208ee947), C64e(0x1038281828201810),
+	C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0),
+	C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c),
+	C64e(0x38546c246c702438), C64e(0x575f08f108aef157),
+	C64e(0x732152c752e6c773), C64e(0x9764f351f3355197),
+	C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1),
+	C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e),
+	C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61),
+	C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f),
+	C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c),
+	C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc),
+	C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506),
+	C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c),
+	C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a),
+	C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069),
+	C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899),
+	C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927),
+	C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb),
+	C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322),
+	C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9),
+	C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733),
+	C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c),
+	C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9),
+	C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa),
+	C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5),
+	C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859),
+	C64e(0x09929b809b128009), C64e(0x1a2339173934171a),
+	C64e(0x651075da75cada65), C64e(0xd784533153b531d7),
+	C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0),
+	C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029),
+	C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e),
+	C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8),
+	C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c)
+};
+
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+
+static const sph_u64 T1[] = { /* 256 constants; as written, entries appear to be T0's rotated right by 8 bits */
+	C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84),
+	C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d),
+	C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd),
+	C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954),
+	C64e(0x606090f050f0c050), C64e(0x0202070503050403),
+	C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d),
+	C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162),
+	C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a),
+	C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d),
+	C64e(0x898949c040c00940), C64e(0xfafa68928792ef87),
+	C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb),
+	C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b),
+	C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67),
+	C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea),
+	C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7),
+	C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b),
+	C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c),
+	C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a),
+	C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41),
+	C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f),
+	C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4),
+	C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908),
+	C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73),
+	C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f),
+	C64e(0x08081c140c14100c), C64e(0x959563f652f63152),
+	C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e),
+	C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1),
+	C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5),
+	C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836),
+	C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d),
+	C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69),
+	C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f),
+	C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e),
+	C64e(0x5858c49c749cb074), C64e(0x343446722e72682e),
+	C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2),
+	C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb),
+	C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d),
+	C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face),
+	C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e),
+	C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697),
+	C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968),
+	C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c),
+	C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f),
+	C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed),
+	C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146),
+	C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b),
+	C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4),
+	C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a),
+	C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a),
+	C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116),
+	C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7),
+	C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294),
+	C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910),
+	C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781),
+	C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044),
+	C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3),
+	C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe),
+	C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a),
+	C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc),
+	C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904),
+	C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1),
+	C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463),
+	C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a),
+	C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d),
+	C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014),
+	C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f),
+	C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2),
+	C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39),
+	C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2),
+	C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447),
+	C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7),
+	C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795),
+	C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298),
+	C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f),
+	C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e),
+	C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683),
+	C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529),
+	C64e(0x6b6b056ed36ed6d3), C64e(0x28286c443c44503c),
+	C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2),
+	C64e(0x161631271d272c1d), C64e(0xadad379a769a4176),
+	C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856),
+	C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e),
+	C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a),
+	C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4),
+	C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e),
+	C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6),
+	C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4),
+	C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b),
+	C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43),
+	C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7),
+	C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964),
+	C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0),
+	C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa),
+	C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525),
+	C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e),
+	C64e(0x47476720e9208ee9), C64e(0x1010382818282018),
+	C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88),
+	C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872),
+	C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1),
+	C64e(0x73732152c752e6c7), C64e(0x979764f351f33551),
+	C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c),
+	C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21),
+	C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc),
+	C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85),
+	C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842),
+	C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa),
+	C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05),
+	C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812),
+	C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f),
+	C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0),
+	C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958),
+	C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9),
+	C64e(0xd9d991483848a938), C64e(0xebebde351335cd13),
+	C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433),
+	C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970),
+	C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7),
+	C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822),
+	C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920),
+	C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff),
+	C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a),
+	C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8),
+	C64e(0x0909929b809b1280), C64e(0x1a1a233917393417),
+	C64e(0x65651075da75cada), C64e(0xd7d784533153b531),
+	C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8),
+	C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0),
+	C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11),
+	C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc),
+	C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a)
+};
+
+static const sph_u64 T2[] = { /* 256 constants; as written, entries appear to be T0's rotated right by 16 bits */
+	C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb),
+	C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7),
+	C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7),
+	C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39),
+	C64e(0x50606090f050f0c0), C64e(0x0302020705030504),
+	C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac),
+	C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671),
+	C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3),
+	C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e),
+	C64e(0x40898949c040c009), C64e(0x87fafa68928792ef),
+	C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f),
+	C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded),
+	C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d),
+	C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a),
+	C64e(0xbf2323f9dabfda46), C64e(0xf753535102f702a6),
+	C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d),
+	C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9),
+	C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98),
+	C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc),
+	C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d),
+	C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2),
+	C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9),
+	C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d),
+	C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154),
+	C64e(0x0c08081c140c1410), C64e(0x52959563f652f631),
+	C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221),
+	C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e),
+	C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e),
+	C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48),
+	C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5),
+	C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c),
+	C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf),
+	C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a),
+	C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268),
+	C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3),
+	C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6),
+	C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec),
+	C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa),
+	C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1),
+	C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226),
+	C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869),
+	C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499),
+	C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd),
+	C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77),
+	C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01),
+	C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4),
+	C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b),
+	C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11),
+	C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91),
+	C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1),
+	C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f),
+	C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722),
+	C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9),
+	C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7),
+	C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0),
+	C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96),
+	C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba),
+	C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a),
+	C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42),
+	C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9),
+	C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee),
+	C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584),
+	C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1),
+	C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765),
+	C64e(0x4c818155d44cd419), C64e(0x141818243c143c30),
+	C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d),
+	C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a),
+	C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c),
+	C64e(0x5793936af957f93d), C64e(0xf25555580df20daa),
+	C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4),
+	C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f),
+	C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7),
+	C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332),
+	C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d),
+	C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8),
+	C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16),
+	C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95),
+	C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450),
+	C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63),
+	C64e(0x1d161631271d272c), C64e(0x76adad379a769a41),
+	C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8),
+	C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228),
+	C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18),
+	C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b),
+	C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261),
+	C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193),
+	C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762),
+	C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff),
+	C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d),
+	C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af),
+	C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79),
+	C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92),
+	C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543),
+	C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85),
+	C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3),
+	C64e(0xe947476720e9208e), C64e(0x1810103828182820),
+	C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb),
+	C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8),
+	C64e(0x243838546c246c70), C64e(0xf157575f08f108ae),
+	C64e(0xc773732152c752e6), C64e(0x51979764f351f335),
+	C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459),
+	C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c),
+	C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2),
+	C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e),
+	C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8),
+	C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583),
+	C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c),
+	C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638),
+	C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4),
+	C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2),
+	C64e(0x911717bfa891a82e), C64e(0x58999971e858e829),
+	C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e),
+	C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd),
+	C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544),
+	C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049),
+	C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266),
+	C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678),
+	C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089),
+	C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f),
+	C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51),
+	C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2),
+	C64e(0x800909929b809b12), C64e(0x171a1a2339173934),
+	C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5),
+	C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb),
+	C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52),
+	C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c),
+	C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b),
+	C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58)
+};
+
+static const sph_u64 T3[] = { /* 256 constants; as written, entries appear to be T0's rotated right by 24 bits */
+	C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497),
+	C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c),
+	C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc),
+	C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc),
+	C64e(0xc050606090f050f0), C64e(0x0403020207050305),
+	C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87),
+	C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6),
+	C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5),
+	C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc),
+	C64e(0x0940898949c040c0), C64e(0xef87fafa68928792),
+	C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26),
+	C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d),
+	C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9),
+	C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25),
+	C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702),
+	C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed),
+	C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24),
+	C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe),
+	C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3),
+	C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1),
+	C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407),
+	C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818),
+	C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395),
+	C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41),
+	C64e(0x100c08081c140c14), C64e(0x3152959563f652f6),
+	C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2),
+	C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8),
+	C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4),
+	C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a),
+	C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47),
+	C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb),
+	C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba),
+	C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9),
+	C64e(0xb0745858c49c749c), C64e(0x682e343446722e72),
+	C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd),
+	C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16),
+	C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7),
+	C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49),
+	C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42),
+	C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2),
+	C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8),
+	C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74),
+	C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21),
+	C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c),
+	C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca),
+	C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd),
+	C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467),
+	C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade),
+	C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e),
+	C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a),
+	C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762),
+	C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7),
+	C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030),
+	C64e(0x080604040e0a060a), C64e(0xe781fefe66988198),
+	C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc),
+	C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e),
+	C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19),
+	C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85),
+	C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf),
+	C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c),
+	C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158),
+	C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5),
+	C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e),
+	C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7),
+	C64e(0x194c818155d44cd4), C64e(0x30141818243c143c),
+	C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71),
+	C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd),
+	C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b),
+	C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d),
+	C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9),
+	C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732),
+	C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4),
+	C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3),
+	C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81),
+	C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82),
+	C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e),
+	C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b),
+	C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44),
+	C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d),
+	C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a),
+	C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa),
+	C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22),
+	C64e(0x3fdb9292e476db76), C64e(0x180a0c0c121e0a1e),
+	C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437),
+	C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2),
+	C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1),
+	C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7),
+	C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86),
+	C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5),
+	C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2),
+	C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac),
+	C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b),
+	C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15),
+	C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f),
+	C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89),
+	C64e(0x8ee947476720e920), C64e(0x2018101038281828),
+	C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883),
+	C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296),
+	C64e(0x70243838546c246c), C64e(0xaef157575f08f108),
+	C64e(0xe6c773732152c752), C64e(0x3551979764f351f3),
+	C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84),
+	C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163),
+	C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f),
+	C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594),
+	C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6),
+	C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5),
+	C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f),
+	C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236),
+	C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1),
+	C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b),
+	C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8),
+	C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0),
+	C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335),
+	C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355),
+	C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090),
+	C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2),
+	C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266),
+	C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060),
+	C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a),
+	C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e),
+	C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813),
+	C64e(0x12800909929b809b), C64e(0x34171a1a23391739),
+	C64e(0xcada65651075da75), C64e(0xb531d7d784533153),
+	C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3),
+	C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb),
+	C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133),
+	C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f),
+	C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e)
+};
+
+#endif
+
+static const sph_u64 T4[] = { /* 256 entries; looked up with B64_4() in RSTT/RBTT. Sits outside the !SPH_SMALL_FOOTPRINT_GROESTL section holding T5..T7; small-footprint builds reuse it through R64() where T5..T7 would appear. */
+	C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784),
+	C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d),
+	C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd),
+	C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54),
+	C64e(0xf0c050606090f050), C64e(0x0504030202070503),
+	C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d),
+	C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662),
+	C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a),
+	C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d),
+	C64e(0xc00940898949c040), C64e(0x92ef87fafa689287),
+	C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb),
+	C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b),
+	C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967),
+	C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea),
+	C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7),
+	C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b),
+	C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c),
+	C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a),
+	C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341),
+	C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f),
+	C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4),
+	C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808),
+	C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573),
+	C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f),
+	C64e(0x14100c08081c140c), C64e(0xf63152959563f652),
+	C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e),
+	C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1),
+	C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5),
+	C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36),
+	C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d),
+	C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69),
+	C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f),
+	C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e),
+	C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e),
+	C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2),
+	C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb),
+	C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d),
+	C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce),
+	C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e),
+	C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297),
+	C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868),
+	C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c),
+	C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f),
+	C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced),
+	C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46),
+	C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b),
+	C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4),
+	C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a),
+	C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a),
+	C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16),
+	C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7),
+	C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794),
+	C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010),
+	C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881),
+	C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44),
+	C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3),
+	C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe),
+	C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a),
+	C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc),
+	C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04),
+	C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1),
+	C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563),
+	C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a),
+	C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d),
+	C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14),
+	C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f),
+	C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2),
+	C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39),
+	C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2),
+	C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947),
+	C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7),
+	C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495),
+	C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398),
+	C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f),
+	C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e),
+	C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83),
+	C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29),
+	C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c),
+	C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2),
+	C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76),
+	C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56),
+	C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e),
+	C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a),
+	C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4),
+	C64e(0xe7255d9f9f78e75d), C64e(0xb2616ebdbd0fb26e),
+	C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6),
+	C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4),
+	C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b),
+	C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543),
+	C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7),
+	C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64),
+	C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0),
+	C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa),
+	C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25),
+	C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e),
+	C64e(0x208ee947476720e9), C64e(0x2820181010382818),
+	C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388),
+	C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672),
+	C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1),
+	C64e(0x52e6c773732152c7), C64e(0xf33551979764f351),
+	C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c),
+	C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321),
+	C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc),
+	C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485),
+	C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642),
+	C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa),
+	C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05),
+	C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612),
+	C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f),
+	C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0),
+	C64e(0xa82e911717bfa891), C64e(0xe82958999971e858),
+	C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9),
+	C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513),
+	C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533),
+	C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070),
+	C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7),
+	C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622),
+	C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020),
+	C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff),
+	C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a),
+	C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8),
+	C64e(0x9b12800909929b80), C64e(0x3934171a1a233917),
+	C64e(0x75cada65651075da), C64e(0x53b531d7d7845331),
+	C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8),
+	C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0),
+	C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311),
+	C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc),
+	C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a)
+};
+
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+
+static const sph_u64 T5[] = { /* 256 entries; looked up with B64_5() in RSTT/RBTT. Only compiled when SPH_SMALL_FOOTPRINT_GROESTL is not set; small-footprint builds use R64(T4[...], 8) in its place. */
+	C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97),
+	C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c),
+	C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc),
+	C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc),
+	C64e(0x50f0c050606090f0), C64e(0x0305040302020705),
+	C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187),
+	C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6),
+	C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5),
+	C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc),
+	C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892),
+	C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426),
+	C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d),
+	C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9),
+	C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025),
+	C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102),
+	C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed),
+	C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524),
+	C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be),
+	C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3),
+	C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1),
+	C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607),
+	C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118),
+	C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95),
+	C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41),
+	C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6),
+	C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2),
+	C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8),
+	C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4),
+	C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a),
+	C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847),
+	C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb),
+	C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba),
+	C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9),
+	C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672),
+	C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd),
+	C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16),
+	C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7),
+	C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449),
+	C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42),
+	C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2),
+	C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8),
+	C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574),
+	C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221),
+	C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c),
+	C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca),
+	C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd),
+	C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67),
+	C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde),
+	C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e),
+	C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a),
+	C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862),
+	C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7),
+	C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930),
+	C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698),
+	C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc),
+	C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e),
+	C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419),
+	C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085),
+	C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf),
+	C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c),
+	C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58),
+	C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5),
+	C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e),
+	C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7),
+	C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c),
+	C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271),
+	C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd),
+	C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b),
+	C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d),
+	C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9),
+	C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832),
+	C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4),
+	C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3),
+	C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281),
+	C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682),
+	C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e),
+	C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b),
+	C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44),
+	C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d),
+	C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a),
+	C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa),
+	C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622),
+	C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e),
+	C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37),
+	C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2),
+	C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1),
+	C64e(0xa8e372a83939dae3), C64e(0xa4f762a43131c6f7),
+	C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486),
+	C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5),
+	C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2),
+	C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac),
+	C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b),
+	C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915),
+	C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f),
+	C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89),
+	C64e(0xe9208ee947476720), C64e(0x1828201810103828),
+	C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383),
+	C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96),
+	C64e(0x246c70243838546c), C64e(0xf108aef157575f08),
+	C64e(0xc752e6c773732152), C64e(0x51f33551979764f3),
+	C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584),
+	C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63),
+	C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f),
+	C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94),
+	C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6),
+	C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5),
+	C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f),
+	C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36),
+	C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1),
+	C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b),
+	C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8),
+	C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0),
+	C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35),
+	C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755),
+	C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990),
+	C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2),
+	C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66),
+	C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960),
+	C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a),
+	C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e),
+	C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13),
+	C64e(0x809b12800909929b), C64e(0x173934171a1a2339),
+	C64e(0xda75cada65651075), C64e(0x3153b531d7d78453),
+	C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3),
+	C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb),
+	C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33),
+	C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f),
+	C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e)
+};
+
+static const sph_u64 T6[] = { /* 256 entries; looked up with B64_6() in RSTT/RBTT. Only compiled when SPH_SMALL_FOOTPRINT_GROESTL is not set; small-footprint builds use R64(T4[...], 16) in its place. */
+	C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f),
+	C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a),
+	C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a),
+	C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d),
+	C64e(0xf050f0c050606090), C64e(0x0503050403020207),
+	C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1),
+	C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513),
+	C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59),
+	C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3),
+	C64e(0xc040c00940898949), C64e(0x928792ef87fafa68),
+	C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294),
+	C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6),
+	C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a),
+	C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560),
+	C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351),
+	C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76),
+	C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5),
+	C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2),
+	C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd),
+	C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352),
+	C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156),
+	C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1),
+	C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e),
+	C64e(0xf553f5c453626297), C64e(0x413f41543f2a2a6b),
+	C64e(0x140c14100c08081c), C64e(0xf652f63152959563),
+	C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f),
+	C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf),
+	C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb),
+	C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e),
+	C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98),
+	C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5),
+	C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50),
+	C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4),
+	C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446),
+	C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11),
+	C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d),
+	C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1),
+	C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34),
+	C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f),
+	C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1),
+	C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901),
+	C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5),
+	C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2),
+	C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a),
+	C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47),
+	C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af),
+	C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff),
+	C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b),
+	C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb),
+	C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7),
+	C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8),
+	C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6),
+	C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9),
+	C64e(0x0a060a080604040e), C64e(0x988198e781fefe66),
+	C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4),
+	C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75),
+	C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44),
+	C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580),
+	C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe),
+	C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd),
+	C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f),
+	C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7),
+	C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb),
+	C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08),
+	C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824),
+	C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2),
+	C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8),
+	C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65),
+	C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558),
+	C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3),
+	C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88),
+	C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642),
+	C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa),
+	C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322),
+	C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6),
+	C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95),
+	C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc),
+	C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c),
+	C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81),
+	C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37),
+	C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e),
+	C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436),
+	C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12),
+	C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f),
+	C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f),
+	C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435),
+	C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6),
+	C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274),
+	C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e),
+	C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18),
+	C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d),
+	C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972),
+	C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9),
+	C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0),
+	C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d),
+	C64e(0x20e9208ee9474767), C64e(0x2818282018101038),
+	C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073),
+	C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca),
+	C64e(0x6c246c7024383854), C64e(0x08f108aef157575f),
+	C64e(0x52c752e6c7737321), C64e(0xf351f33551979764),
+	C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125),
+	C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d),
+	C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e),
+	C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b),
+	C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba),
+	C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29),
+	C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609),
+	C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a),
+	C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b),
+	C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902),
+	C64e(0xa891a82e911717bf), C64e(0xe858e82958999971),
+	C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7),
+	C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde),
+	C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277),
+	C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939),
+	C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1),
+	C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a),
+	C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9),
+	C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0),
+	C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b),
+	C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a),
+	C64e(0x9b809b1280090992), C64e(0x39173934171a1a23),
+	C64e(0x75da75cada656510), C64e(0x533153b531d7d784),
+	C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003),
+	C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2),
+	C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d),
+	C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7),
+	C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62)
+};
+
+static const sph_u64 T7[] = { /* 256 entries; looked up with B64_7() in RSTT/RBTT. Only compiled when SPH_SMALL_FOOTPRINT_GROESTL is not set; small-footprint builds use R64(T4[...], 24) in its place. */
+	C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8),
+	C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6),
+	C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6),
+	C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191),
+	C64e(0x90f050f0c0506060), C64e(0x0705030504030202),
+	C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656),
+	C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5),
+	C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec),
+	C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f),
+	C64e(0x49c040c009408989), C64e(0x68928792ef87fafa),
+	C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2),
+	C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb),
+	C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3),
+	C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545),
+	C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353),
+	C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b),
+	C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1),
+	C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c),
+	C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e),
+	C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383),
+	C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151),
+	C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9),
+	C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab),
+	C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a),
+	C64e(0x1c140c14100c0808), C64e(0x63f652f631529595),
+	C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d),
+	C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737),
+	C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f),
+	C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424),
+	C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf),
+	C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e),
+	C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea),
+	C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d),
+	C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434),
+	C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc),
+	C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b),
+	C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676),
+	C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d),
+	C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd),
+	C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313),
+	C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9),
+	C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1),
+	C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3),
+	C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6),
+	C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d),
+	C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272),
+	C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898),
+	C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585),
+	C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5),
+	C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded),
+	C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a),
+	C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111),
+	C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9),
+	C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe),
+	C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878),
+	C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b),
+	C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d),
+	C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505),
+	C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121),
+	C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1),
+	C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777),
+	C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242),
+	C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5),
+	C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf),
+	C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818),
+	C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3),
+	C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535),
+	C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e),
+	C64e(0x6af957f93d579393), C64e(0x580df20daaf25555),
+	C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a),
+	C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba),
+	C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6),
+	C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919),
+	C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3),
+	C64e(0xeeaa66a88664444), C64e(0xd6827e82a87e5454),
+	C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b),
+	C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7),
+	C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828),
+	C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc),
+	C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad),
+	C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464),
+	C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414),
+	C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c),
+	C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8),
+	C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd),
+	C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4),
+	C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131),
+	C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2),
+	C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b),
+	C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada),
+	C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1),
+	C64e(0xf16dd26d23d29c9c), C64e(0x723be03b92e04949),
+	C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac),
+	C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf),
+	C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4),
+	C64e(0x6720e9208ee94747), C64e(0x3828182820181010),
+	C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0),
+	C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c),
+	C64e(0x546c246c70243838), C64e(0x5f08f108aef15757),
+	C64e(0x2152c752e6c77373), C64e(0x64f351f335519797),
+	C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1),
+	C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e),
+	C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161),
+	C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f),
+	C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c),
+	C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc),
+	C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606),
+	C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c),
+	C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a),
+	C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969),
+	C64e(0xbfa891a82e911717), C64e(0x71e858e829589999),
+	C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727),
+	C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb),
+	C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222),
+	C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9),
+	C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333),
+	C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c),
+	C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9),
+	C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa),
+	C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5),
+	C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959),
+	C64e(0x929b809b12800909), C64e(0x2339173934171a1a),
+	C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7),
+	C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0),
+	C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929),
+	C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e),
+	C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8),
+	C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c)
+};
+
+#endif
+
+#define DECL_STATE_SMALL /* declares H, the local 8-word working copy of the state used by the *_SMALL macros */ \
+	sph_u64 H[8];
+
+#define READ_STATE_SMALL(sc)   do { /* load the state held in context sc into the local H */ \
+		memcpy(H, (sc)->state.wide, sizeof H); /* byte count comes from the local H array */ \
+	} while (0)
+
+#define WRITE_STATE_SMALL(sc)   do { /* store the local H back into the state held in context sc */ \
+		memcpy((sc)->state.wide, H, sizeof H); /* byte count comes from the local H array */ \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL /* small footprint: only T0 and T4 are used, the other six terms go through R64() */
+
+#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* t[d] = XOR of eight table terms; term k is selected by B64_k(a[bk]) */ \
+		t[d] = T0[B64_0(a[b0])] \
+			^ R64(T0[B64_1(a[b1])],  8) /* stands in for the T1 lookup of the other variant */ \
+			^ R64(T0[B64_2(a[b2])], 16) /* stands in for T2 */ \
+			^ R64(T0[B64_3(a[b3])], 24) /* stands in for T3 */ \
+			^ T4[B64_4(a[b4])] \
+			^ R64(T4[B64_5(a[b5])],  8) /* stands in for T5 */ \
+			^ R64(T4[B64_6(a[b6])], 16) /* stands in for T6 */ \
+			^ R64(T4[B64_7(a[b7])], 24); /* stands in for T7 */ \
+	} while (0)
+
+#else /* default: one dedicated table per term */
+
+#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* t[d] = XOR of eight table terms; writes into a caller-scope array named t */ \
+		t[d] = T0[B64_0(a[b0])] \
+			^ T1[B64_1(a[b1])] \
+			^ T2[B64_2(a[b2])] \
+			^ T3[B64_3(a[b3])] \
+			^ T4[B64_4(a[b4])] \
+			^ T5[B64_5(a[b5])] \
+			^ T6[B64_6(a[b6])] \
+			^ T7[B64_7(a[b7])]; \
+	} while (0)
+
+#endif /* SPH_SMALL_FOOTPRINT_GROESTL */
+
+#define ROUND_SMALL_P(a, r)   do { /* one P round applied in place to the 8-word array a; r is the round index supplied by PERM_SMALL_P */ \
+		sph_u64 t[8]; /* round output, filled by RSTT below */ \
+		a[0] ^= PC64(0x00, r); /* step 1: XOR a PC64 constant into each word; first argument is 0x10 * word index */ \
+		a[1] ^= PC64(0x10, r); \
+		a[2] ^= PC64(0x20, r); \
+		a[3] ^= PC64(0x30, r); \
+		a[4] ^= PC64(0x40, r); \
+		a[5] ^= PC64(0x50, r); \
+		a[6] ^= PC64(0x60, r); \
+		a[7] ^= PC64(0x70, r); \
+		RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); /* step 2: t[d] is built from words d, d+1, ..., d+7 (mod 8) */ \
+		RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \
+		RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \
+		RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \
+		RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \
+		RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \
+		RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \
+		RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \
+		a[0] = t[0]; /* step 3: copy the round output back over a */ \
+		a[1] = t[1]; \
+		a[2] = t[2]; \
+		a[3] = t[3]; \
+		a[4] = t[4]; \
+		a[5] = t[5]; \
+		a[6] = t[6]; \
+		a[7] = t[7]; \
+	} while (0)
+
+#define ROUND_SMALL_Q(a, r)   do { /* one Q round applied in place to the 8-word array a; r is the round index supplied by PERM_SMALL_Q */ \
+		sph_u64 t[8]; /* round output, filled by RSTT below */ \
+		a[0] ^= QC64(0x00, r); /* step 1: XOR a QC64 constant into each word; first argument is 0x10 * word index */ \
+		a[1] ^= QC64(0x10, r); \
+		a[2] ^= QC64(0x20, r); \
+		a[3] ^= QC64(0x30, r); \
+		a[4] ^= QC64(0x40, r); \
+		a[5] ^= QC64(0x50, r); \
+		a[6] ^= QC64(0x60, r); \
+		a[7] ^= QC64(0x70, r); \
+		RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); /* step 2: t[d] is built from words d+1, d+3, d+5, d+7, d, d+2, d+4, d+6 (mod 8) */ \
+		RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \
+		RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \
+		RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \
+		RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \
+		RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \
+		RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \
+		RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \
+		a[0] = t[0]; /* step 3: copy the round output back over a */ \
+		a[1] = t[1]; \
+		a[2] = t[2]; \
+		a[3] = t[3]; \
+		a[4] = t[4]; \
+		a[5] = t[5]; \
+		a[6] = t[6]; \
+		a[7] = t[7]; \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define PERM_SMALL_P(a)   do { \
+		int r; \
+		for (r = 0; r < 10; r ++) \
+			ROUND_SMALL_P(a, r); \
+	} while (0)
+
+#define PERM_SMALL_Q(a)   do { /* SPH_SMALL_FOOTPRINT_GROESTL build: apply ROUND_SMALL_Q to a[] for r = 0..9 */ \
+		int r; /* round counter, local to this expansion */ \
+		for (r = 0; r < 10; r ++) /* one round per iteration */ \
+			ROUND_SMALL_Q(a, r); \
+	} while (0)
+
+#else
+
+/*
+ * Apparently, unrolling more than that confuses GCC, resulting in
+ * lower performance, even though L1 cache would be no problem.
+ */
+#define PERM_SMALL_P(a)   do { /* apply ROUND_SMALL_P to a[] ten times, round arguments 0..9 in order */ \
+		int r; /* even round number at the top of each iteration */ \
+		for (r = 0; r < 10; r += 2) { /* two rounds per iteration: partial unrolling only */ \
+			ROUND_SMALL_P(a, r + 0); \
+			ROUND_SMALL_P(a, r + 1); \
+		} \
+	} while (0)
+
+#define PERM_SMALL_Q(a)   do { /* apply ROUND_SMALL_Q to a[] ten times, round arguments 0..9 in order */ \
+		int r; /* even round number at the top of each iteration */ \
+		for (r = 0; r < 10; r += 2) { /* two rounds per iteration: partial unrolling only */ \
+			ROUND_SMALL_Q(a, r + 0); \
+			ROUND_SMALL_Q(a, r + 1); \
+		} \
+	} while (0)
+
+#endif
+
+#define COMPRESS_SMALL   do { /* fold one input block from buf into H[0..7]; buf and H are taken from the expansion site */ \
+		sph_u64 g[8], m[8]; /* m: decoded block words; g: m XOR H */ \
+		size_t u; \
+		for (u = 0; u < 8; u ++) { \
+			m[u] = dec64e_aligned(buf + (u << 3)); /* word u is decoded from buf + 8 * u (byte offset if buf is a byte pointer - TODO confirm) */ \
+			g[u] = m[u] ^ H[u]; \
+		} \
+		PERM_SMALL_P(g); /* g becomes P(H ^ m) */ \
+		PERM_SMALL_Q(m); /* m becomes Q(m) */ \
+		for (u = 0; u < 8; u ++) \
+			H[u] ^= g[u] ^ m[u]; /* H = H ^ P(H ^ m) ^ Q(m) */ \
+	} while (0)
+
+#define FINAL_SMALL   do { /* H = H ^ P(H) on the 8-word state; H is taken from the expansion site */ \
+		sph_u64 x[8]; /* working copy, so H itself is only modified by the final XOR */ \
+		size_t u; \
+		memcpy(x, H, sizeof x); /* copies sizeof x bytes (8 words) out of H */ \
+		PERM_SMALL_P(x); \
+		for (u = 0; u < 8; u ++) \
+			H[u] ^= x[u]; \
+	} while (0)
+
+#define DECL_STATE_BIG /* declares the local 16-word state H that READ_STATE_BIG, WRITE_STATE_BIG, COMPRESS_BIG and FINAL_BIG refer to */ \
+	sph_u64 H[16]; /* the trailing ';' is part of the macro */
+
+#define READ_STATE_BIG(sc)   do { /* load the local state H from the context pointed to by sc */ \
+		memcpy(H, (sc)->state.wide, sizeof H); /* copies sizeof H bytes from sc->state.wide */ \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store the local state H back into the context pointed to by sc */ \
+		memcpy((sc)->state.wide, H, sizeof H); /* copies sizeof H bytes into sc->state.wide */ \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* SPH_SMALL_FOOTPRINT_GROESTL build: t[d] = XOR of eight lookups, using tables T0 and T4 only */ \
+		t[d] = T0[B64_0(a[b0])] /* argument bk selects the word a[bk] fed to B64_k */ \
+			^ R64(T0[B64_1(a[b1])],  8) /* R64(T0[..], 8/16/24) stands where the other build reads T1/T2/T3 */ \
+			^ R64(T0[B64_2(a[b2])], 16) \
+			^ R64(T0[B64_3(a[b3])], 24) \
+			^ T4[B64_4(a[b4])] \
+			^ R64(T4[B64_5(a[b5])],  8) /* likewise R64(T4[..], 8/16/24) stands where the other build reads T5/T6/T7 */ \
+			^ R64(T4[B64_6(a[b6])], 16) \
+			^ R64(T4[B64_7(a[b7])], 24); \
+	} while (0) /* writes t[], which must be declared at the expansion site (the ROUND_BIG_* macros declare it) */
+
+#else
+
+#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* t[d] = XOR of eight lookups, one per table T0..T7 */ \
+		t[d] = T0[B64_0(a[b0])] /* argument bk selects the word a[bk]; B64_k(a[bk]) is the index into Tk */ \
+			^ T1[B64_1(a[b1])] \
+			^ T2[B64_2(a[b2])] \
+			^ T3[B64_3(a[b3])] \
+			^ T4[B64_4(a[b4])] \
+			^ T5[B64_5(a[b5])] \
+			^ T6[B64_6(a[b6])] \
+			^ T7[B64_7(a[b7])]; \
+	} while (0) /* writes t[], which must be declared at the expansion site (the ROUND_BIG_* macros declare it) */
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define ROUND_BIG_P(a, r)   do { /* SPH_SMALL_FOOTPRINT_GROESTL build: one P round, in place, on the 16-word state a[0..15] */ \
+		sph_u64 t[16]; /* new state is built here, so every RBTT below reads the same a[] */ \
+		size_t u; \
+		a[0x0] ^= PC64(0x00, r); /* step 1: a[i] ^= PC64(0x10 * i, r); r is the round argument */ \
+		a[0x1] ^= PC64(0x10, r); \
+		a[0x2] ^= PC64(0x20, r); \
+		a[0x3] ^= PC64(0x30, r); \
+		a[0x4] ^= PC64(0x40, r); \
+		a[0x5] ^= PC64(0x50, r); \
+		a[0x6] ^= PC64(0x60, r); \
+		a[0x7] ^= PC64(0x70, r); \
+		a[0x8] ^= PC64(0x80, r); \
+		a[0x9] ^= PC64(0x90, r); \
+		a[0xA] ^= PC64(0xA0, r); \
+		a[0xB] ^= PC64(0xB0, r); \
+		a[0xC] ^= PC64(0xC0, r); \
+		a[0xD] ^= PC64(0xD0, r); \
+		a[0xE] ^= PC64(0xE0, r); \
+		a[0xF] ^= PC64(0xF0, r); \
+		for (u = 0; u < 16; u += 4) { /* step 2: four outputs per iteration, d = u .. u + 3 */ \
+			RBTT(u + 0, a, u + 0, (u + 1) & 0xF, /* t[d] is built from a[(d + k) & 0xF], k = 0,1,2,3,4,5,6,11 */ \
+				(u + 2) & 0xF, (u + 3) & 0xF, (u + 4) & 0xF, \
+				(u + 5) & 0xF, (u + 6) & 0xF, (u + 11) & 0xF); \
+			RBTT(u + 1, a, u + 1, (u + 2) & 0xF, \
+				(u + 3) & 0xF, (u + 4) & 0xF, (u + 5) & 0xF, \
+				(u + 6) & 0xF, (u + 7) & 0xF, (u + 12) & 0xF); \
+			RBTT(u + 2, a, u + 2, (u + 3) & 0xF, \
+				(u + 4) & 0xF, (u + 5) & 0xF, (u + 6) & 0xF, \
+				(u + 7) & 0xF, (u + 8) & 0xF, (u + 13) & 0xF); \
+			RBTT(u + 3, a, u + 3, (u + 4) & 0xF, \
+				(u + 5) & 0xF, (u + 6) & 0xF, (u + 7) & 0xF, \
+				(u + 8) & 0xF, (u + 9) & 0xF, (u + 14) & 0xF); \
+		} \
+		memcpy(a, t, sizeof t); /* step 3: copy all 16 words of t back over a[] */ \
+	} while (0)
+
+#define ROUND_BIG_Q(a, r)   do { /* SPH_SMALL_FOOTPRINT_GROESTL build: one Q round, in place, on the 16-word state a[0..15] */ \
+		sph_u64 t[16]; /* new state is built here, so every RBTT below reads the same a[] */ \
+		size_t u; \
+		a[0x0] ^= QC64(0x00, r); /* step 1: a[i] ^= QC64(0x10 * i, r); r is the round argument */ \
+		a[0x1] ^= QC64(0x10, r); \
+		a[0x2] ^= QC64(0x20, r); \
+		a[0x3] ^= QC64(0x30, r); \
+		a[0x4] ^= QC64(0x40, r); \
+		a[0x5] ^= QC64(0x50, r); \
+		a[0x6] ^= QC64(0x60, r); \
+		a[0x7] ^= QC64(0x70, r); \
+		a[0x8] ^= QC64(0x80, r); \
+		a[0x9] ^= QC64(0x90, r); \
+		a[0xA] ^= QC64(0xA0, r); \
+		a[0xB] ^= QC64(0xB0, r); \
+		a[0xC] ^= QC64(0xC0, r); \
+		a[0xD] ^= QC64(0xD0, r); \
+		a[0xE] ^= QC64(0xE0, r); \
+		a[0xF] ^= QC64(0xF0, r); \
+		for (u = 0; u < 16; u += 4) { /* step 2: four outputs per iteration, d = u .. u + 3 */ \
+			RBTT(u + 0, a, (u + 1) & 0xF, (u + 3) & 0xF, /* t[d] is built from a[(d + k) & 0xF], k = 1,3,5,11,0,2,4,6 */ \
+				(u + 5) & 0xF, (u + 11) & 0xF, (u + 0) & 0xF, \
+				(u + 2) & 0xF, (u + 4) & 0xF, (u + 6) & 0xF); \
+			RBTT(u + 1, a, (u + 2) & 0xF, (u + 4) & 0xF, \
+				(u + 6) & 0xF, (u + 12) & 0xF, (u + 1) & 0xF, \
+				(u + 3) & 0xF, (u + 5) & 0xF, (u + 7) & 0xF); \
+			RBTT(u + 2, a, (u + 3) & 0xF, (u + 5) & 0xF, \
+				(u + 7) & 0xF, (u + 13) & 0xF, (u + 2) & 0xF, \
+				(u + 4) & 0xF, (u + 6) & 0xF, (u + 8) & 0xF); \
+			RBTT(u + 3, a, (u + 4) & 0xF, (u + 6) & 0xF, \
+				(u + 8) & 0xF, (u + 14) & 0xF, (u + 3) & 0xF, \
+				(u + 5) & 0xF, (u + 7) & 0xF, (u + 9) & 0xF); \
+		} \
+		memcpy(a, t, sizeof t); /* step 3: copy all 16 words of t back over a[] */ \
+	} while (0)
+
+#else
+
+#define ROUND_BIG_P(a, r)   do { /* one P round, in place, on the 16-word state a[0..15]; fully unrolled */ \
+		sph_u64 t[16]; /* new state is built here, so every RBTT below reads the same a[] */ \
+		a[0x0] ^= PC64(0x00, r); /* step 1: a[i] ^= PC64(0x10 * i, r); r is the round argument */ \
+		a[0x1] ^= PC64(0x10, r); \
+		a[0x2] ^= PC64(0x20, r); \
+		a[0x3] ^= PC64(0x30, r); \
+		a[0x4] ^= PC64(0x40, r); \
+		a[0x5] ^= PC64(0x50, r); \
+		a[0x6] ^= PC64(0x60, r); \
+		a[0x7] ^= PC64(0x70, r); \
+		a[0x8] ^= PC64(0x80, r); \
+		a[0x9] ^= PC64(0x90, r); \
+		a[0xA] ^= PC64(0xA0, r); \
+		a[0xB] ^= PC64(0xB0, r); \
+		a[0xC] ^= PC64(0xC0, r); \
+		a[0xD] ^= PC64(0xD0, r); \
+		a[0xE] ^= PC64(0xE0, r); \
+		a[0xF] ^= PC64(0xF0, r); \
+		RBTT(0x0, a, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); /* step 2: t[d] is built from a[(d + k) & 0xF], k = 0,1,2,3,4,5,6,11 */ \
+		RBTT(0x1, a, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \
+		RBTT(0x2, a, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \
+		RBTT(0x3, a, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \
+		RBTT(0x4, a, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \
+		RBTT(0x5, a, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \
+		RBTT(0x6, a, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \
+		RBTT(0x7, a, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \
+		RBTT(0x8, a, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \
+		RBTT(0x9, a, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \
+		RBTT(0xA, a, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \
+		RBTT(0xB, a, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \
+		RBTT(0xC, a, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \
+		RBTT(0xD, a, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \
+		RBTT(0xE, a, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \
+		RBTT(0xF, a, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \
+		a[0x0] = t[0x0]; /* step 3: write the new state back over a[] */ \
+		a[0x1] = t[0x1]; \
+		a[0x2] = t[0x2]; \
+		a[0x3] = t[0x3]; \
+		a[0x4] = t[0x4]; \
+		a[0x5] = t[0x5]; \
+		a[0x6] = t[0x6]; \
+		a[0x7] = t[0x7]; \
+		a[0x8] = t[0x8]; \
+		a[0x9] = t[0x9]; \
+		a[0xA] = t[0xA]; \
+		a[0xB] = t[0xB]; \
+		a[0xC] = t[0xC]; \
+		a[0xD] = t[0xD]; \
+		a[0xE] = t[0xE]; \
+		a[0xF] = t[0xF]; \
+	} while (0) /* a and r are expanded many times: pass side-effect-free arguments */
+
+#define ROUND_BIG_Q(a, r)   do { /* one Q round, in place, on the 16-word state a[0..15]; fully unrolled */ \
+		sph_u64 t[16]; /* new state is built here, so every RBTT below reads the same a[] */ \
+		a[0x0] ^= QC64(0x00, r); /* step 1: a[i] ^= QC64(0x10 * i, r); r is the round argument */ \
+		a[0x1] ^= QC64(0x10, r); \
+		a[0x2] ^= QC64(0x20, r); \
+		a[0x3] ^= QC64(0x30, r); \
+		a[0x4] ^= QC64(0x40, r); \
+		a[0x5] ^= QC64(0x50, r); \
+		a[0x6] ^= QC64(0x60, r); \
+		a[0x7] ^= QC64(0x70, r); \
+		a[0x8] ^= QC64(0x80, r); \
+		a[0x9] ^= QC64(0x90, r); \
+		a[0xA] ^= QC64(0xA0, r); \
+		a[0xB] ^= QC64(0xB0, r); \
+		a[0xC] ^= QC64(0xC0, r); \
+		a[0xD] ^= QC64(0xD0, r); \
+		a[0xE] ^= QC64(0xE0, r); \
+		a[0xF] ^= QC64(0xF0, r); \
+		RBTT(0x0, a, 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); /* step 2: t[d] is built from a[(d + k) & 0xF], k = 1,3,5,11,0,2,4,6 */ \
+		RBTT(0x1, a, 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \
+		RBTT(0x2, a, 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \
+		RBTT(0x3, a, 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \
+		RBTT(0x4, a, 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \
+		RBTT(0x5, a, 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \
+		RBTT(0x6, a, 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \
+		RBTT(0x7, a, 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \
+		RBTT(0x8, a, 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \
+		RBTT(0x9, a, 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \
+		RBTT(0xA, a, 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \
+		RBTT(0xB, a, 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \
+		RBTT(0xC, a, 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \
+		RBTT(0xD, a, 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \
+		RBTT(0xE, a, 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \
+		RBTT(0xF, a, 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \
+		a[0x0] = t[0x0]; /* step 3: write the new state back over a[] */ \
+		a[0x1] = t[0x1]; \
+		a[0x2] = t[0x2]; \
+		a[0x3] = t[0x3]; \
+		a[0x4] = t[0x4]; \
+		a[0x5] = t[0x5]; \
+		a[0x6] = t[0x6]; \
+		a[0x7] = t[0x7]; \
+		a[0x8] = t[0x8]; \
+		a[0x9] = t[0x9]; \
+		a[0xA] = t[0xA]; \
+		a[0xB] = t[0xB]; \
+		a[0xC] = t[0xC]; \
+		a[0xD] = t[0xD]; \
+		a[0xE] = t[0xE]; \
+		a[0xF] = t[0xF]; \
+	} while (0) /* a and r are expanded many times: pass side-effect-free arguments */
+
+#endif
+
+#define PERM_BIG_P(a)   do { /* apply ROUND_BIG_P to a[] fourteen times, round arguments 0..13 in order */ \
+		int r; /* even round number at the top of each iteration */ \
+		for (r = 0; r < 14; r += 2) { /* two rounds per iteration */ \
+			ROUND_BIG_P(a, r + 0); \
+			ROUND_BIG_P(a, r + 1); \
+		} \
+	} while (0)
+
+#define PERM_BIG_Q(a)   do { /* apply ROUND_BIG_Q to a[] fourteen times, round arguments 0..13 in order */ \
+		int r; /* even round number at the top of each iteration */ \
+		for (r = 0; r < 14; r += 2) { /* two rounds per iteration */ \
+			ROUND_BIG_Q(a, r + 0); \
+			ROUND_BIG_Q(a, r + 1); \
+		} \
+	} while (0)
+
+/* obsolete: disabled SPH_SMALL_FOOTPRINT_GROESTL variant of COMPRESS_BIG, kept for reference only
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define COMPRESS_BIG   do { \
+		sph_u64 g[16], m[16], *ya; \
+		const sph_u64 *yc; \
+		size_t u; \
+		int i; \
+		for (u = 0; u < 16; u ++) { \
+			m[u] = dec64e_aligned(buf + (u << 3)); \
+			g[u] = m[u] ^ H[u]; \
+		} \
+		ya = g; \
+		yc = CP; \
+		for (i = 0; i < 2; i ++) { \
+			PERM_BIG(ya, yc); \
+			ya = m; \
+			yc = CQ; \
+		} \
+		for (u = 0; u < 16; u ++) { \
+			H[u] ^= g[u] ^ m[u]; \
+		} \
+	} while (0)
+
+#else
+*/
+
+#define COMPRESS_BIG   do { /* fold one input block from buf into H[0..15]; buf and H are taken from the expansion site */ \
+		sph_u64 g[16], m[16]; /* m: decoded block words; g: m XOR H */ \
+		size_t u; \
+		for (u = 0; u < 16; u ++) { \
+			m[u] = dec64e_aligned(buf + (u << 3)); /* word u is decoded from buf + 8 * u (byte offset if buf is a byte pointer - TODO confirm) */ \
+			g[u] = m[u] ^ H[u]; \
+		} \
+		PERM_BIG_P(g); /* g becomes P(H ^ m) */ \
+		PERM_BIG_Q(m); /* m becomes Q(m) */ \
+		for (u = 0; u < 16; u ++) { \
+			H[u] ^= g[u] ^ m[u]; /* H = H ^ P(H ^ m) ^ Q(m) */ \
+		} \
+	} while (0)
+
+/* obsolete
+#endif
+*/
+
+#define FINAL_BIG   do { /* H = H ^ P(H) on the 16-word state; H is taken from the expansion site */ \
+		sph_u64 x[16]; /* working copy, so H itself is only modified by the final XOR */ \
+		size_t u; \
+		memcpy(x, H, sizeof x); /* copies sizeof x bytes (16 words) out of H */ \
+		PERM_BIG_P(x); \
+		for (u = 0; u < 16; u ++) \
+			H[u] ^= x[u]; \
+	} while (0)
+
+#else
+
+static const sph_u32 T0up[] = { /* 256 constants (64 rows of 4), each wrapped in C32e(); presumably one 32-bit half of lookup table T0, paired entry-for-entry with T0dn - name-based, confirm */
+	C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d),
+	C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54),
+	C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d),
+	C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a),
+	C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287),
+	C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b),
+	C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea),
+	C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b),
+	C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a),
+	C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f),
+	C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808),
+	C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f),
+	C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e),
+	C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5),
+	C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d),
+	C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f),
+	C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e),
+	C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb),
+	C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce),
+	C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297),
+	C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c),
+	C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced),
+	C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b),
+	C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a),
+	C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16),
+	C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794),
+	C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881),
+	C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3),
+	C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a),
+	C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04),
+	C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563),
+	C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d),
+	C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f),
+	C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39),
+	C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947),
+	C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495),
+	C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f),
+	C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83),
+	C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c),
+	C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76),
+	C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e),
+	C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4),
+	C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6),
+	C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b),
+	C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7),
+	C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0),
+	C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25),
+	C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818),
+	C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672),
+	C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351),
+	C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321),
+	C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485),
+	C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa),
+	C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612),
+	C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0),
+	C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9),
+	C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533),
+	C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7),
+	C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020),
+	C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a),
+	C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917),
+	C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8),
+	C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311),
+	C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a)
+}; /* end of T0up: generated constants, do not edit by hand */
+
+static const sph_u32 T0dn[] = { /* 256 constants (64 rows of 4), each wrapped in C32e(); presumably the other 32-bit half of lookup table T0, paired entry-for-entry with T0up - name-based, confirm */
+	C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6),
+	C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491),
+	C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56),
+	C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec),
+	C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa),
+	C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb),
+	C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45),
+	C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b),
+	C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c),
+	C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83),
+	C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9),
+	C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a),
+	C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d),
+	C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f),
+	C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf),
+	C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea),
+	C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34),
+	C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b),
+	C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d),
+	C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713),
+	C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1),
+	C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6),
+	C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72),
+	C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85),
+	C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed),
+	C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411),
+	C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe),
+	C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b),
+	C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05),
+	C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1),
+	C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342),
+	C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf),
+	C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3),
+	C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e),
+	C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a),
+	C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6),
+	C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3),
+	C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b),
+	C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28),
+	C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad),
+	C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14),
+	C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8),
+	C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4),
+	C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2),
+	C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da),
+	C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049),
+	C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf),
+	C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810),
+	C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c),
+	C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197),
+	C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e),
+	C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f),
+	C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc),
+	C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c),
+	C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069),
+	C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927),
+	C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322),
+	C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733),
+	C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9),
+	C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5),
+	C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a),
+	C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0),
+	C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e),
+	C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c)
+}; /* end of T0dn: generated constants, do not edit by hand */
+
+static const sph_u32 T1up[] = { /* 256 constants (64 rows of 4), each wrapped in C32e(); presumably one 32-bit half of lookup table T1, paired with T1dn. Entry 0 read as up:dn is T0up[0]:T0dn[0] rotated right by 8 bits - confirm for the rest */
+	C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c),
+	C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc),
+	C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187),
+	C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5),
+	C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892),
+	C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d),
+	C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025),
+	C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed),
+	C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be),
+	C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1),
+	C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118),
+	C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41),
+	C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2),
+	C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4),
+	C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847),
+	C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba),
+	C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672),
+	C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16),
+	C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449),
+	C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2),
+	C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574),
+	C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c),
+	C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd),
+	C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde),
+	C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a),
+	C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7),
+	C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698),
+	C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e),
+	C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085),
+	C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c),
+	C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5),
+	C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7),
+	C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271),
+	C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b),
+	C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9),
+	C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4),
+	C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281),
+	C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e),
+	C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44),
+	C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a),
+	C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622),
+	C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37),
+	C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1),
+	C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486),
+	C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2),
+	C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b),
+	C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f),
+	C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828),
+	C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96),
+	C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3),
+	C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63),
+	C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94),
+	C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5),
+	C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36),
+	C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b),
+	C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0),
+	C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755),
+	C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2),
+	C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960),
+	C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e),
+	C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339),
+	C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3),
+	C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33),
+	C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e)
+}; /* end of T1up: generated constants, do not edit by hand */
+
+static const sph_u32 T1dn[] = { /* 256 constants (64 rows of 4), each wrapped in C32e(); presumably the other 32-bit half of lookup table T1, paired entry-for-entry with T1up - name-based, confirm */
+	C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d),
+	C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954),
+	C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d),
+	C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a),
+	C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87),
+	C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b),
+	C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea),
+	C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b),
+	C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a),
+	C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f),
+	C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908),
+	C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f),
+	C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e),
+	C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5),
+	C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d),
+	C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f),
+	C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e),
+	C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb),
+	C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face),
+	C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697),
+	C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c),
+	C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed),
+	C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b),
+	C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a),
+	C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116),
+	C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294),
+	C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781),
+	C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3),
+	C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a),
+	C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904),
+	C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463),
+	C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d),
+	C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f),
+	C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39),
+	C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447),
+	C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795),
+	C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f),
+	C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683),
+	C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c),
+	C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176),
+	C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e),
+	C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4),
+	C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6),
+	C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b),
+	C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7),
+	C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0),
+	C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525),
+	C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018),
+	C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872),
+	C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551),
+	C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21),
+	C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85),
+	C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa),
+	C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812),
+	C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0),
+	C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9),
+	C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433),
+	C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7),
+	C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920),
+	C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a),
+	C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417),
+	C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8),
+	C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11),
+	C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a)
+}; /* end of T1dn: generated constants, do not edit by hand */
+
+static const sph_u32 T2up[] = { /* 256 constants (64 rows of 4), each wrapped in C32e(); presumably one 32-bit half of lookup table T2, paired with T2dn. Entry 0 read as up:dn is T0up[0]:T0dn[0] rotated right by 16 bits - confirm for the rest */
+	C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a),
+	C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d),
+	C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1),
+	C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59),
+	C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68),
+	C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6),
+	C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560),
+	C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76),
+	C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2),
+	C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352),
+	C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1),
+	C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b),
+	C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f),
+	C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb),
+	C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98),
+	C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50),
+	C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446),
+	C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d),
+	C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34),
+	C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1),
+	C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5),
+	C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a),
+	C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af),
+	C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b),
+	C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7),
+	C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6),
+	C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66),
+	C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75),
+	C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580),
+	C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd),
+	C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7),
+	C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08),
+	C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2),
+	C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65),
+	C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3),
+	C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642),
+	C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322),
+	C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95),
+	C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c),
+	C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37),
+	C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436),
+	C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f),
+	C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435),
+	C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274),
+	C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18),
+	C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972),
+	C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0),
+	C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038),
+	C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca),
+	C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764),
+	C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d),
+	C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b),
+	C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29),
+	C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a),
+	C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902),
+	C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7),
+	C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277),
+	C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1),
+	C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9),
+	C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b),
+	C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23),
+	C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003),
+	C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d),
+	C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62)
+}; /* end of T2up: generated constants, do not edit by hand */
+
+static const sph_u32 T2dn[] = {	/* 256 entries (64 rows of 4); RSTT and the full-size RBTT index it with B32_2(word) */
+	C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7),
+	C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39),
+	C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac),
+	C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3),
+	C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef),
+	C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded),
+	C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a),
+	C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d),
+	C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98),
+	C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d),
+	C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9),
+	C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154),
+	C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221),
+	C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e),
+	C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5),
+	C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf),
+	C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268),
+	C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6),
+	C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa),
+	C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226),
+	C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499),
+	C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77),
+	C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4),
+	C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11),
+	C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1),
+	C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722),
+	C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7),
+	C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96),
+	C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a),
+	C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9),
+	C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584),
+	C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765),
+	C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d),
+	C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c),
+	C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4),
+	C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7),
+	C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d),
+	C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16),
+	C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450),
+	C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41),
+	C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228),
+	C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b),
+	C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193),
+	C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff),
+	C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af),
+	C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92),
+	C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85),
+	C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820),
+	C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8),
+	C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335),
+	C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c),
+	C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e),
+	C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583),
+	C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638),
+	C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2),
+	C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e),
+	C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544),
+	C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266),
+	C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089),
+	C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51),
+	C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934),
+	C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb),
+	C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c),
+	C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58)
+};
+
+static const sph_u32 T3up[] = {	/* 256 entries (64 rows of 4); RSTT and the full-size RBTT index it with B32_3(word) */
+	C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6),
+	C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191),
+	C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656),
+	C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec),
+	C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa),
+	C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb),
+	C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545),
+	C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b),
+	C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c),
+	C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383),
+	C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9),
+	C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a),
+	C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d),
+	C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f),
+	C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf),
+	C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea),
+	C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434),
+	C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b),
+	C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d),
+	C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313),
+	C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1),
+	C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6),
+	C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272),
+	C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585),
+	C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded),
+	C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111),
+	C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe),
+	C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b),
+	C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505),
+	C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1),
+	C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242),
+	C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf),
+	C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3),
+	C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e),
+	C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a),
+	C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6),
+	C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3),
+	C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b),
+	C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828),
+	C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad),
+	C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414),
+	C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8),
+	C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4),
+	C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2),
+	C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada),
+	C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949),
+	C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf),
+	C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010),
+	C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c),
+	C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797),
+	C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e),
+	C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f),
+	C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc),
+	C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c),
+	C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969),
+	C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727),
+	C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222),
+	C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333),
+	C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9),
+	C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5),
+	C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a),
+	C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0),
+	C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e),
+	C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c)
+};
+
+static const sph_u32 T3dn[] = {	/* 256 entries (64 rows of 4); RSTT and the full-size RBTT index it with B32_3(word) */
+	C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c),
+	C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc),
+	C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87),
+	C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5),
+	C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792),
+	C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d),
+	C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25),
+	C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed),
+	C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe),
+	C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1),
+	C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818),
+	C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41),
+	C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2),
+	C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4),
+	C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47),
+	C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba),
+	C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72),
+	C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16),
+	C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49),
+	C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2),
+	C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74),
+	C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c),
+	C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd),
+	C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade),
+	C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a),
+	C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7),
+	C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198),
+	C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e),
+	C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85),
+	C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c),
+	C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5),
+	C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7),
+	C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71),
+	C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b),
+	C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9),
+	C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4),
+	C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81),
+	C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e),
+	C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44),
+	C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a),
+	C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22),
+	C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437),
+	C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1),
+	C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86),
+	C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2),
+	C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b),
+	C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f),
+	C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828),
+	C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296),
+	C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3),
+	C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163),
+	C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594),
+	C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5),
+	C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236),
+	C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b),
+	C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0),
+	C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355),
+	C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2),
+	C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060),
+	C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e),
+	C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739),
+	C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3),
+	C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133),
+	C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e)
+};
+
+#define DECL_STATE_SMALL /* declares H, the 16-word local state array; carries its own ';' so uses omit one */ \
+	sph_u32 H[16];
+
+#define READ_STATE_SMALL(sc)   do { /* H <- (sc)->state.narrow, sizeof H bytes */ \
+		memcpy(H, (sc)->state.narrow, sizeof H); \
+	} while (0)
+
+#define WRITE_STATE_SMALL(sc)   do { /* (sc)->state.narrow <- H, sizeof H bytes */ \
+		memcpy((sc)->state.narrow, H, sizeof H); \
+	} while (0)
+
+#define XCAT(x, y)    XCAT_(x, y) /* token-paste x and y after both have been macro-expanded */
+#define XCAT_(x, y)   x ## y /* raw paste; the extra level above is what forces the expansion */
+
+#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* set t[d0], t[d1] from words a[b0..b7]; an array t must be in scope at the use site */ \
+		t[d0] = T0up[B32_0(a[b0])] /* t[d0]: "up" tables for a[b0..b3], "dn" tables for a[b4..b7] */ \
+			^ T1up[B32_1(a[b1])] \
+			^ T2up[B32_2(a[b2])] \
+			^ T3up[B32_3(a[b3])] \
+			^ T0dn[B32_0(a[b4])] \
+			^ T1dn[B32_1(a[b5])] \
+			^ T2dn[B32_2(a[b6])] \
+			^ T3dn[B32_3(a[b7])]; \
+		t[d1] = T0dn[B32_0(a[b0])] /* t[d1]: the same eight lookups with up and dn tables exchanged */ \
+			^ T1dn[B32_1(a[b1])] \
+			^ T2dn[B32_2(a[b2])] \
+			^ T3dn[B32_3(a[b3])] \
+			^ T0up[B32_0(a[b4])] \
+			^ T1up[B32_1(a[b5])] \
+			^ T2up[B32_2(a[b6])] \
+			^ T3up[B32_3(a[b7])]; \
+	} while (0)
+
+#define ROUND_SMALL_P(a, r)   do { /* one P round, in place, on the 16-word array a; r is the round index */ \
+		sph_u32 t[16]; /* RSTT writes here, so every lookup sees a[] as it stood after the constant XOR */ \
+		a[0x0] ^= PC32up(0x00, r); /* step 1: XOR PC32up/PC32dn(j, r) into each word pair, j = 0x00..0x70 */ \
+		a[0x1] ^= PC32dn(0x00, r); \
+		a[0x2] ^= PC32up(0x10, r); \
+		a[0x3] ^= PC32dn(0x10, r); \
+		a[0x4] ^= PC32up(0x20, r); \
+		a[0x5] ^= PC32dn(0x20, r); \
+		a[0x6] ^= PC32up(0x30, r); \
+		a[0x7] ^= PC32dn(0x30, r); \
+		a[0x8] ^= PC32up(0x40, r); \
+		a[0x9] ^= PC32dn(0x40, r); \
+		a[0xA] ^= PC32up(0x50, r); \
+		a[0xB] ^= PC32dn(0x50, r); \
+		a[0xC] ^= PC32up(0x60, r); \
+		a[0xD] ^= PC32dn(0x60, r); \
+		a[0xE] ^= PC32up(0x70, r); \
+		a[0xF] ^= PC32dn(0x70, r); \
+		RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); /* step 2: source words are d0 + {0,2,4,6,9,B,D,F} mod 16 */ \
+		RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); \
+		RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); \
+		RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); \
+		RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); \
+		RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); \
+		RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); \
+		RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); \
+		memcpy(a, t, sizeof t); /* step 3: copy the new state back into a */ \
+	} while (0)
+
+#define ROUND_SMALL_Q(a, r)   do { /* one Q round, in place, on the 16-word array a; r is the round index */ \
+		sph_u32 t[16]; /* RSTT writes here, so every lookup sees a[] as it stood after the constant XOR */ \
+		a[0x0] ^= QC32up(0x00, r); /* step 1: XOR QC32up/QC32dn(j, r) into each word pair, j = 0x00..0x70 */ \
+		a[0x1] ^= QC32dn(0x00, r); \
+		a[0x2] ^= QC32up(0x10, r); \
+		a[0x3] ^= QC32dn(0x10, r); \
+		a[0x4] ^= QC32up(0x20, r); \
+		a[0x5] ^= QC32dn(0x20, r); \
+		a[0x6] ^= QC32up(0x30, r); \
+		a[0x7] ^= QC32dn(0x30, r); \
+		a[0x8] ^= QC32up(0x40, r); \
+		a[0x9] ^= QC32dn(0x40, r); \
+		a[0xA] ^= QC32up(0x50, r); \
+		a[0xB] ^= QC32dn(0x50, r); \
+		a[0xC] ^= QC32up(0x60, r); \
+		a[0xD] ^= QC32dn(0x60, r); \
+		a[0xE] ^= QC32up(0x70, r); \
+		a[0xF] ^= QC32dn(0x70, r); \
+		RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); /* step 2: source words are d0 + {2,6,A,E,1,5,9,D} mod 16 */ \
+		RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); \
+		RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); \
+		RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); \
+		RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5); \
+		RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); \
+		RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); \
+		RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); \
+		memcpy(a, t, sizeof t); /* step 3: copy the new state back into a */ \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL /* compact build: one round per loop iteration */
+
+#define PERM_SMALL_P(a)   do { /* apply ROUND_SMALL_P to a for r = 0..9 */ \
+		int r; \
+		for (r = 0; r < 10; r ++) \
+			ROUND_SMALL_P(a, r); \
+	} while (0)
+
+#define PERM_SMALL_Q(a)   do { /* apply ROUND_SMALL_Q to a for r = 0..9 */ \
+		int r; \
+		for (r = 0; r < 10; r ++) \
+			ROUND_SMALL_Q(a, r); \
+	} while (0)
+
+#else /* default build: the same ten rounds, two per loop iteration */
+
+#define PERM_SMALL_P(a)   do { /* apply ROUND_SMALL_P to a for r = 0..9 */ \
+		int r; \
+		for (r = 0; r < 10; r += 2) { \
+			ROUND_SMALL_P(a, r + 0); \
+			ROUND_SMALL_P(a, r + 1); \
+		} \
+	} while (0)
+
+#define PERM_SMALL_Q(a)   do { /* apply ROUND_SMALL_Q to a for r = 0..9 */ \
+		int r; \
+		for (r = 0; r < 10; r += 2) { \
+			ROUND_SMALL_Q(a, r + 0); \
+			ROUND_SMALL_Q(a, r + 1); \
+		} \
+	} while (0)
+
+#endif
+
+#define COMPRESS_SMALL   do { /* fold the block at buf into H; buf and H are taken from the enclosing scope */ \
+		sph_u32 g[16], m[16]; \
+		size_t u; \
+		for (u = 0; u < 16; u ++) { \
+			m[u] = dec32e_aligned(buf + (u << 2)); /* word u of the block, read at byte offset 4*u */ \
+			g[u] = m[u] ^ H[u]; \
+		} \
+		PERM_SMALL_P(g); /* g = P(H ^ m) */ \
+		PERM_SMALL_Q(m); /* m = Q(m) */ \
+		for (u = 0; u < 16; u ++) \
+			H[u] ^= g[u] ^ m[u]; /* H = H ^ P(H ^ m) ^ Q(m) */ \
+	} while (0)
+
+#define FINAL_SMALL   do { /* output step on the H in scope: H = H ^ P(H) */ \
+		sph_u32 x[16]; \
+		size_t u; \
+		memcpy(x, H, sizeof x); /* permute a copy so H itself stays available for the XOR */ \
+		PERM_SMALL_P(x); \
+		for (u = 0; u < 16; u ++) \
+			H[u] ^= x[u]; \
+	} while (0)
+
+#define DECL_STATE_BIG /* declares H, the 32-word local state array; carries its own ';' */ \
+	sph_u32 H[32];
+
+#define READ_STATE_BIG(sc)   do { /* H <- (sc)->state.narrow, sizeof H bytes */ \
+		memcpy(H, (sc)->state.narrow, sizeof H); \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* (sc)->state.narrow <- H, sizeof H bytes */ \
+		memcpy((sc)->state.narrow, H, sizeof H); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL /* compact build: RBTT touches only the T0 and T1 tables */
+
+#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* set t[d0], t[d1] from words a[b0..b7]; t must be in scope */ \
+		sph_u32 fu2 = T0up[B32_2(a[b2])]; /* T0/T1 halves that R32u/R32d (defined elsewhere) combine below, */ \
+		sph_u32 fd2 = T0dn[B32_2(a[b2])]; /* in the positions where the full-size variant reads T2/T3 */ \
+		sph_u32 fu3 = T1up[B32_3(a[b3])]; \
+		sph_u32 fd3 = T1dn[B32_3(a[b3])]; \
+		sph_u32 fu6 = T0up[B32_2(a[b6])]; \
+		sph_u32 fd6 = T0dn[B32_2(a[b6])]; \
+		sph_u32 fu7 = T1up[B32_3(a[b7])]; \
+		sph_u32 fd7 = T1dn[B32_3(a[b7])]; \
+		t[d0] = T0up[B32_0(a[b0])] \
+			^ T1up[B32_1(a[b1])] \
+			^ R32u(fu2, fd2) \
+			^ R32u(fu3, fd3) \
+			^ T0dn[B32_0(a[b4])] \
+			^ T1dn[B32_1(a[b5])] \
+			^ R32d(fu6, fd6) \
+			^ R32d(fu7, fd7); \
+		t[d1] = T0dn[B32_0(a[b0])] /* same terms with up/dn (and R32u/R32d) exchanged */ \
+			^ T1dn[B32_1(a[b1])] \
+			^ R32d(fu2, fd2) \
+			^ R32d(fu3, fd3) \
+			^ T0up[B32_0(a[b4])] \
+			^ T1up[B32_1(a[b5])] \
+			^ R32u(fu6, fd6) \
+			^ R32u(fu7, fd7); \
+	} while (0)
+
+#else /* default build: direct lookups in all eight tables, same shape as RSTT */
+
+#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* set t[d0], t[d1] from words a[b0..b7]; t must be in scope */ \
+		t[d0] = T0up[B32_0(a[b0])] /* t[d0]: "up" tables for a[b0..b3], "dn" tables for a[b4..b7] */ \
+			^ T1up[B32_1(a[b1])] \
+			^ T2up[B32_2(a[b2])] \
+			^ T3up[B32_3(a[b3])] \
+			^ T0dn[B32_0(a[b4])] \
+			^ T1dn[B32_1(a[b5])] \
+			^ T2dn[B32_2(a[b6])] \
+			^ T3dn[B32_3(a[b7])]; \
+		t[d1] = T0dn[B32_0(a[b0])] /* t[d1]: the same eight lookups with up and dn tables exchanged */ \
+			^ T1dn[B32_1(a[b1])] \
+			^ T2dn[B32_2(a[b2])] \
+			^ T3dn[B32_3(a[b3])] \
+			^ T0up[B32_0(a[b4])] \
+			^ T1up[B32_1(a[b5])] \
+			^ T2up[B32_2(a[b6])] \
+			^ T3up[B32_3(a[b7])]; \
+	} while (0)
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define ROUND_BIG_P(a, r)   do { /* one P round, in place, on the 32-word array a; loop form, r is the round index */ \
+		sph_u32 t[32]; /* RBTT writes here, so every lookup sees a[] as it stood after the constant XOR */ \
+		size_t u; \
+		a[0x00] ^= PC32up(0x00, r); /* step 1: XOR PC32up/PC32dn(j, r) into each word pair, j = 0x00..0xF0 */ \
+		a[0x01] ^= PC32dn(0x00, r); \
+		a[0x02] ^= PC32up(0x10, r); \
+		a[0x03] ^= PC32dn(0x10, r); \
+		a[0x04] ^= PC32up(0x20, r); \
+		a[0x05] ^= PC32dn(0x20, r); \
+		a[0x06] ^= PC32up(0x30, r); \
+		a[0x07] ^= PC32dn(0x30, r); \
+		a[0x08] ^= PC32up(0x40, r); \
+		a[0x09] ^= PC32dn(0x40, r); \
+		a[0x0A] ^= PC32up(0x50, r); \
+		a[0x0B] ^= PC32dn(0x50, r); \
+		a[0x0C] ^= PC32up(0x60, r); \
+		a[0x0D] ^= PC32dn(0x60, r); \
+		a[0x0E] ^= PC32up(0x70, r); \
+		a[0x0F] ^= PC32dn(0x70, r); \
+		a[0x10] ^= PC32up(0x80, r); \
+		a[0x11] ^= PC32dn(0x80, r); \
+		a[0x12] ^= PC32up(0x90, r); \
+		a[0x13] ^= PC32dn(0x90, r); \
+		a[0x14] ^= PC32up(0xA0, r); \
+		a[0x15] ^= PC32dn(0xA0, r); \
+		a[0x16] ^= PC32up(0xB0, r); \
+		a[0x17] ^= PC32dn(0xB0, r); \
+		a[0x18] ^= PC32up(0xC0, r); \
+		a[0x19] ^= PC32dn(0xC0, r); \
+		a[0x1A] ^= PC32up(0xD0, r); \
+		a[0x1B] ^= PC32dn(0xD0, r); \
+		a[0x1C] ^= PC32up(0xE0, r); \
+		a[0x1D] ^= PC32dn(0xE0, r); \
+		a[0x1E] ^= PC32up(0xF0, r); \
+		a[0x1F] ^= PC32dn(0xF0, r); \
+		for (u = 0; u < 32; u += 8) { /* step 2: 4 iterations x 4 RBTT calls; indices wrap mod 32 via & 0x1F */ \
+			RBTT(u + 0x00, (u + 0x01) & 0x1F, a, /* source words are d0 + {0,2,4,6,9,B,D,17} */ \
+				u + 0x00, (u + 0x02) & 0x1F, \
+				(u + 0x04) & 0x1F, (u + 0x06) & 0x1F, \
+				(u + 0x09) & 0x1F, (u + 0x0B) & 0x1F, \
+				(u + 0x0D) & 0x1F, (u + 0x17) & 0x1F); \
+			RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \
+				u + 0x02, (u + 0x04) & 0x1F, \
+				(u + 0x06) & 0x1F, (u + 0x08) & 0x1F, \
+				(u + 0x0B) & 0x1F, (u + 0x0D) & 0x1F, \
+				(u + 0x0F) & 0x1F, (u + 0x19) & 0x1F); \
+			RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \
+				u + 0x04, (u + 0x06) & 0x1F, \
+				(u + 0x08) & 0x1F, (u + 0x0A) & 0x1F, \
+				(u + 0x0D) & 0x1F, (u + 0x0F) & 0x1F, \
+				(u + 0x11) & 0x1F, (u + 0x1B) & 0x1F); \
+			RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \
+				u + 0x06, (u + 0x08) & 0x1F, \
+				(u + 0x0A) & 0x1F, (u + 0x0C) & 0x1F, \
+				(u + 0x0F) & 0x1F, (u + 0x11) & 0x1F, \
+				(u + 0x13) & 0x1F, (u + 0x1D) & 0x1F); \
+		} \
+		memcpy(a, t, sizeof t); /* step 3: copy the new state back into a */ \
+	} while (0)
+
+#define ROUND_BIG_Q(a, r)   do { /* one Q round, in place, on the 32-word array a; loop form, r is the round index */ \
+		sph_u32 t[32]; /* RBTT writes here, so every lookup sees a[] as it stood after the constant XOR */ \
+		size_t u; \
+		a[0x00] ^= QC32up(0x00, r); /* step 1: XOR QC32up/QC32dn(j, r) into each word pair, j = 0x00..0xF0 */ \
+		a[0x01] ^= QC32dn(0x00, r); \
+		a[0x02] ^= QC32up(0x10, r); \
+		a[0x03] ^= QC32dn(0x10, r); \
+		a[0x04] ^= QC32up(0x20, r); \
+		a[0x05] ^= QC32dn(0x20, r); \
+		a[0x06] ^= QC32up(0x30, r); \
+		a[0x07] ^= QC32dn(0x30, r); \
+		a[0x08] ^= QC32up(0x40, r); \
+		a[0x09] ^= QC32dn(0x40, r); \
+		a[0x0A] ^= QC32up(0x50, r); \
+		a[0x0B] ^= QC32dn(0x50, r); \
+		a[0x0C] ^= QC32up(0x60, r); \
+		a[0x0D] ^= QC32dn(0x60, r); \
+		a[0x0E] ^= QC32up(0x70, r); \
+		a[0x0F] ^= QC32dn(0x70, r); \
+		a[0x10] ^= QC32up(0x80, r); \
+		a[0x11] ^= QC32dn(0x80, r); \
+		a[0x12] ^= QC32up(0x90, r); \
+		a[0x13] ^= QC32dn(0x90, r); \
+		a[0x14] ^= QC32up(0xA0, r); \
+		a[0x15] ^= QC32dn(0xA0, r); \
+		a[0x16] ^= QC32up(0xB0, r); \
+		a[0x17] ^= QC32dn(0xB0, r); \
+		a[0x18] ^= QC32up(0xC0, r); \
+		a[0x19] ^= QC32dn(0xC0, r); \
+		a[0x1A] ^= QC32up(0xD0, r); \
+		a[0x1B] ^= QC32dn(0xD0, r); \
+		a[0x1C] ^= QC32up(0xE0, r); \
+		a[0x1D] ^= QC32dn(0xE0, r); \
+		a[0x1E] ^= QC32up(0xF0, r); \
+		a[0x1F] ^= QC32dn(0xF0, r); \
+		for (u = 0; u < 32; u += 8) { /* step 2: 4 iterations x 4 RBTT calls; indices wrap mod 32 via & 0x1F */ \
+			RBTT(u + 0x00, (u + 0x01) & 0x1F, a, /* source words are d0 + {2,6,A,16,1,5,9,D} */ \
+				(u + 0x02) & 0x1F, (u + 0x06) & 0x1F, \
+				(u + 0x0A) & 0x1F, (u + 0x16) & 0x1F, \
+				(u + 0x01) & 0x1F, (u + 0x05) & 0x1F, \
+				(u + 0x09) & 0x1F, (u + 0x0D) & 0x1F); \
+			RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \
+				(u + 0x04) & 0x1F, (u + 0x08) & 0x1F, \
+				(u + 0x0C) & 0x1F, (u + 0x18) & 0x1F, \
+				(u + 0x03) & 0x1F, (u + 0x07) & 0x1F, \
+				(u + 0x0B) & 0x1F, (u + 0x0F) & 0x1F); \
+			RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \
+				(u + 0x06) & 0x1F, (u + 0x0A) & 0x1F, \
+				(u + 0x0E) & 0x1F, (u + 0x1A) & 0x1F, \
+				(u + 0x05) & 0x1F, (u + 0x09) & 0x1F, \
+				(u + 0x0D) & 0x1F, (u + 0x11) & 0x1F); \
+			RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \
+				(u + 0x08) & 0x1F, (u + 0x0C) & 0x1F, \
+				(u + 0x10) & 0x1F, (u + 0x1C) & 0x1F, \
+				(u + 0x07) & 0x1F, (u + 0x0B) & 0x1F, \
+				(u + 0x0F) & 0x1F, (u + 0x13) & 0x1F); \
+		} \
+		memcpy(a, t, sizeof t); /* step 3: copy the new state back into a */ \
+	} while (0)
+
+#else
+
+#define ROUND_BIG_P(a, r)   do { /* one P round, in place, on the 32-word array a; unrolled form, r is the round index */ \
+		sph_u32 t[32]; /* RBTT writes here, so every lookup sees a[] as it stood after the constant XOR */ \
+		a[0x00] ^= PC32up(0x00, r); /* step 1: XOR PC32up/PC32dn(j, r) into each word pair, j = 0x00..0xF0 */ \
+		a[0x01] ^= PC32dn(0x00, r); \
+		a[0x02] ^= PC32up(0x10, r); \
+		a[0x03] ^= PC32dn(0x10, r); \
+		a[0x04] ^= PC32up(0x20, r); \
+		a[0x05] ^= PC32dn(0x20, r); \
+		a[0x06] ^= PC32up(0x30, r); \
+		a[0x07] ^= PC32dn(0x30, r); \
+		a[0x08] ^= PC32up(0x40, r); \
+		a[0x09] ^= PC32dn(0x40, r); \
+		a[0x0A] ^= PC32up(0x50, r); \
+		a[0x0B] ^= PC32dn(0x50, r); \
+		a[0x0C] ^= PC32up(0x60, r); \
+		a[0x0D] ^= PC32dn(0x60, r); \
+		a[0x0E] ^= PC32up(0x70, r); \
+		a[0x0F] ^= PC32dn(0x70, r); \
+		a[0x10] ^= PC32up(0x80, r); \
+		a[0x11] ^= PC32dn(0x80, r); \
+		a[0x12] ^= PC32up(0x90, r); \
+		a[0x13] ^= PC32dn(0x90, r); \
+		a[0x14] ^= PC32up(0xA0, r); \
+		a[0x15] ^= PC32dn(0xA0, r); \
+		a[0x16] ^= PC32up(0xB0, r); \
+		a[0x17] ^= PC32dn(0xB0, r); \
+		a[0x18] ^= PC32up(0xC0, r); \
+		a[0x19] ^= PC32dn(0xC0, r); \
+		a[0x1A] ^= PC32up(0xD0, r); \
+		a[0x1B] ^= PC32dn(0xD0, r); \
+		a[0x1C] ^= PC32up(0xE0, r); \
+		a[0x1D] ^= PC32dn(0xE0, r); \
+		a[0x1E] ^= PC32up(0xF0, r); \
+		a[0x1F] ^= PC32dn(0xF0, r); \
+		RBTT(0x00, 0x01, a, /* step 2: source words are d0 + {0,2,4,6,9,B,D,17} mod 32 */ \
+			0x00, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x17); \
+		RBTT(0x02, 0x03, a, \
+			0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x0F, 0x19); \
+		RBTT(0x04, 0x05, a, \
+			0x04, 0x06, 0x08, 0x0A, 0x0D, 0x0F, 0x11, 0x1B); \
+		RBTT(0x06, 0x07, a, \
+			0x06, 0x08, 0x0A, 0x0C, 0x0F, 0x11, 0x13, 0x1D); \
+		RBTT(0x08, 0x09, a, \
+			0x08, 0x0A, 0x0C, 0x0E, 0x11, 0x13, 0x15, 0x1F); \
+		RBTT(0x0A, 0x0B, a, \
+			0x0A, 0x0C, 0x0E, 0x10, 0x13, 0x15, 0x17, 0x01); \
+		RBTT(0x0C, 0x0D, a, \
+			0x0C, 0x0E, 0x10, 0x12, 0x15, 0x17, 0x19, 0x03); \
+		RBTT(0x0E, 0x0F, a, \
+			0x0E, 0x10, 0x12, 0x14, 0x17, 0x19, 0x1B, 0x05); \
+		RBTT(0x10, 0x11, a, \
+			0x10, 0x12, 0x14, 0x16, 0x19, 0x1B, 0x1D, 0x07); \
+		RBTT(0x12, 0x13, a, \
+			0x12, 0x14, 0x16, 0x18, 0x1B, 0x1D, 0x1F, 0x09); \
+		RBTT(0x14, 0x15, a, \
+			0x14, 0x16, 0x18, 0x1A, 0x1D, 0x1F, 0x01, 0x0B); \
+		RBTT(0x16, 0x17, a, \
+			0x16, 0x18, 0x1A, 0x1C, 0x1F, 0x01, 0x03, 0x0D); \
+		RBTT(0x18, 0x19, a, \
+			0x18, 0x1A, 0x1C, 0x1E, 0x01, 0x03, 0x05, 0x0F); \
+		RBTT(0x1A, 0x1B, a, \
+			0x1A, 0x1C, 0x1E, 0x00, 0x03, 0x05, 0x07, 0x11); \
+		RBTT(0x1C, 0x1D, a, \
+			0x1C, 0x1E, 0x00, 0x02, 0x05, 0x07, 0x09, 0x13); \
+		RBTT(0x1E, 0x1F, a, \
+			0x1E, 0x00, 0x02, 0x04, 0x07, 0x09, 0x0B, 0x15); \
+		memcpy(a, t, sizeof t); /* step 3: copy the new state back into a */ \
+	} while (0)
+
+#define ROUND_BIG_Q(a, r)   do { /* one Q round, in place, on the 32-word array a; unrolled form, r is the round index */ \
+		sph_u32 t[32]; /* RBTT writes here, so every lookup sees a[] as it stood after the constant XOR */ \
+		a[0x00] ^= QC32up(0x00, r); /* step 1: XOR QC32up/QC32dn(j, r) into each word pair, j = 0x00..0xF0 */ \
+		a[0x01] ^= QC32dn(0x00, r); \
+		a[0x02] ^= QC32up(0x10, r); \
+		a[0x03] ^= QC32dn(0x10, r); \
+		a[0x04] ^= QC32up(0x20, r); \
+		a[0x05] ^= QC32dn(0x20, r); \
+		a[0x06] ^= QC32up(0x30, r); \
+		a[0x07] ^= QC32dn(0x30, r); \
+		a[0x08] ^= QC32up(0x40, r); \
+		a[0x09] ^= QC32dn(0x40, r); \
+		a[0x0A] ^= QC32up(0x50, r); \
+		a[0x0B] ^= QC32dn(0x50, r); \
+		a[0x0C] ^= QC32up(0x60, r); \
+		a[0x0D] ^= QC32dn(0x60, r); \
+		a[0x0E] ^= QC32up(0x70, r); \
+		a[0x0F] ^= QC32dn(0x70, r); \
+		a[0x10] ^= QC32up(0x80, r); \
+		a[0x11] ^= QC32dn(0x80, r); \
+		a[0x12] ^= QC32up(0x90, r); \
+		a[0x13] ^= QC32dn(0x90, r); \
+		a[0x14] ^= QC32up(0xA0, r); \
+		a[0x15] ^= QC32dn(0xA0, r); \
+		a[0x16] ^= QC32up(0xB0, r); \
+		a[0x17] ^= QC32dn(0xB0, r); \
+		a[0x18] ^= QC32up(0xC0, r); \
+		a[0x19] ^= QC32dn(0xC0, r); \
+		a[0x1A] ^= QC32up(0xD0, r); \
+		a[0x1B] ^= QC32dn(0xD0, r); \
+		a[0x1C] ^= QC32up(0xE0, r); \
+		a[0x1D] ^= QC32dn(0xE0, r); \
+		a[0x1E] ^= QC32up(0xF0, r); \
+		a[0x1F] ^= QC32dn(0xF0, r); \
+		RBTT(0x00, 0x01, a, /* step 2: source words are d0 + {2,6,A,16,1,5,9,D} mod 32 */ \
+			0x02, 0x06, 0x0A, 0x16, 0x01, 0x05, 0x09, 0x0D); \
+		RBTT(0x02, 0x03, a, \
+			0x04, 0x08, 0x0C, 0x18, 0x03, 0x07, 0x0B, 0x0F); \
+		RBTT(0x04, 0x05, a, \
+			0x06, 0x0A, 0x0E, 0x1A, 0x05, 0x09, 0x0D, 0x11); \
+		RBTT(0x06, 0x07, a, \
+			0x08, 0x0C, 0x10, 0x1C, 0x07, 0x0B, 0x0F, 0x13); \
+		RBTT(0x08, 0x09, a, \
+			0x0A, 0x0E, 0x12, 0x1E, 0x09, 0x0D, 0x11, 0x15); \
+		RBTT(0x0A, 0x0B, a, \
+			0x0C, 0x10, 0x14, 0x00, 0x0B, 0x0F, 0x13, 0x17); \
+		RBTT(0x0C, 0x0D, a, \
+			0x0E, 0x12, 0x16, 0x02, 0x0D, 0x11, 0x15, 0x19); \
+		RBTT(0x0E, 0x0F, a, \
+			0x10, 0x14, 0x18, 0x04, 0x0F, 0x13, 0x17, 0x1B); \
+		RBTT(0x10, 0x11, a, \
+			0x12, 0x16, 0x1A, 0x06, 0x11, 0x15, 0x19, 0x1D); \
+		RBTT(0x12, 0x13, a, \
+			0x14, 0x18, 0x1C, 0x08, 0x13, 0x17, 0x1B, 0x1F); \
+		RBTT(0x14, 0x15, a, \
+			0x16, 0x1A, 0x1E, 0x0A, 0x15, 0x19, 0x1D, 0x01); \
+		RBTT(0x16, 0x17, a, \
+			0x18, 0x1C, 0x00, 0x0C, 0x17, 0x1B, 0x1F, 0x03); \
+		RBTT(0x18, 0x19, a, \
+			0x1A, 0x1E, 0x02, 0x0E, 0x19, 0x1D, 0x01, 0x05); \
+		RBTT(0x1A, 0x1B, a, \
+			0x1C, 0x00, 0x04, 0x10, 0x1B, 0x1F, 0x03, 0x07); \
+		RBTT(0x1C, 0x1D, a, \
+			0x1E, 0x02, 0x06, 0x12, 0x1D, 0x01, 0x05, 0x09); \
+		RBTT(0x1E, 0x1F, a, \
+			0x00, 0x04, 0x08, 0x14, 0x1F, 0x03, 0x07, 0x0B); \
+		memcpy(a, t, sizeof t); /* step 3: copy the new state back into a */ \
+	} while (0)
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_GROESTL /* compact build: one round per loop iteration */
+
+#define PERM_BIG_P(a)   do { /* apply ROUND_BIG_P to a for r = 0..13 */ \
+		int r; \
+		for (r = 0; r < 14; r ++) \
+			ROUND_BIG_P(a, r); \
+	} while (0)
+
+#define PERM_BIG_Q(a)   do { /* apply ROUND_BIG_Q to a for r = 0..13 */ \
+		int r; \
+		for (r = 0; r < 14; r ++) \
+			ROUND_BIG_Q(a, r); \
+	} while (0)
+
+#else /* default build: the same fourteen rounds, two per loop iteration */
+
+#define PERM_BIG_P(a)   do { /* apply ROUND_BIG_P to a for r = 0..13 */ \
+		int r; \
+		for (r = 0; r < 14; r += 2) { \
+			ROUND_BIG_P(a, r + 0); \
+			ROUND_BIG_P(a, r + 1); \
+		} \
+	} while (0)
+
+#define PERM_BIG_Q(a)   do { /* apply ROUND_BIG_Q to a for r = 0..13 */ \
+		int r; \
+		for (r = 0; r < 14; r += 2) { \
+			ROUND_BIG_Q(a, r + 0); \
+			ROUND_BIG_Q(a, r + 1); \
+		} \
+	} while (0)
+
+#endif
+
+#define COMPRESS_BIG   do { /* fold the block at buf into H; buf and H are taken from the enclosing scope */ \
+		sph_u32 g[32], m[32]; \
+		size_t u; \
+		for (u = 0; u < 32; u ++) { \
+			m[u] = dec32e_aligned(buf + (u << 2)); /* word u of the block, read at byte offset 4*u */ \
+			g[u] = m[u] ^ H[u]; \
+		} \
+		PERM_BIG_P(g); /* g = P(H ^ m) */ \
+		PERM_BIG_Q(m); /* m = Q(m) */ \
+		for (u = 0; u < 32; u ++) \
+			H[u] ^= g[u] ^ m[u]; /* H = H ^ P(H ^ m) ^ Q(m) */ \
+	} while (0)
+
+#define FINAL_BIG   do { /* output step on the H in scope: H = H ^ P(H) */ \
+		sph_u32 x[32]; \
+		size_t u; \
+		memcpy(x, H, sizeof x); /* permute a copy so H itself stays available for the XOR */ \
+		PERM_BIG_P(x); \
+		for (u = 0; u < 32; u ++) \
+			H[u] ^= x[u]; \
+	} while (0)
+
+#endif
+
+static void
+groestl_small_init(sph_groestl_small_context *sc, unsigned out_size)
+{
+	size_t idx = 0;
+	/* Reset sc: state all zero except the last word, which receives out_size (its two low bytes swapped to the top of the word under USE_LE). */
+#if SPH_GROESTL_64
+	while (idx < 7)
+		sc->state.wide[idx ++] = 0;
+#if USE_LE
+	sc->state.wide[idx] = ((sph_u64)(out_size & 0xFF00) << 40)
+		| ((sph_u64)(out_size & 0xFF) << 56);
+#else
+	sc->state.wide[idx] = (sph_u64)out_size;
+#endif
+#else
+	while (idx < 15)
+		sc->state.narrow[idx ++] = 0;
+#if USE_LE
+	sc->state.narrow[idx] = ((sph_u32)(out_size & 0xFF00) << 8)
+		| ((sph_u32)(out_size & 0xFF) << 24);
+#else
+	sc->state.narrow[idx] = (sph_u32)out_size;
+#endif
+#endif
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_low = 0;
+	sc->count_high = 0;
+#endif
+	sc->ptr = 0;	/* no bytes pending in the block buffer */
+}
+
+/*
+ * Feed 'len' bytes from 'data' into a small-state context. Bytes are
+ * accumulated in sc->buf; each time the buffer fills up, the block is
+ * compressed and the block counter is incremented. Input that does
+ * not fill the buffer is only stored.
+ */
+static void
+groestl_small_core(sph_groestl_small_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src;
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE_SMALL
+
+	src = data;
+	buf = sc->buf;
+	ptr = sc->ptr;
+
+	/* Fast path: the buffer will not be completed by this call. */
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, src, len);
+		sc->ptr = ptr + len;
+		return;
+	}
+
+	READ_STATE_SMALL(sc);
+	while (len != 0) {
+		size_t take;
+
+		take = (sizeof sc->buf) - ptr;
+		if (len < take)
+			take = len;
+		memcpy(buf + ptr, src, take);
+		src += take;
+		len -= take;
+		ptr += take;
+		if (ptr != sizeof sc->buf)
+			continue;
+
+		/* A full block is available: compress it and count it. */
+		COMPRESS_SMALL;
+#if SPH_64
+		sc->count ++;
+#else
+		sc->count_low = SPH_T32(sc->count_low + 1);
+		if (sc->count_low == 0)
+			sc->count_high = SPH_T32(sc->count_high + 1);
+#endif
+		ptr = 0;
+	}
+	WRITE_STATE_SMALL(sc);
+	sc->ptr = ptr;
+}
+
+/*
+ * Finish a small-state computation: append the trailing bits and the
+ * padding, apply the output transform, write the digest and reset the
+ * context for a new computation with the same output length.
+ *
+ * sc        context to finalize (re-initialized on return)
+ * ub, n     extra input bits: the n most significant bits of ub are
+ *           appended to the message (n is expected to be 0 to 7)
+ * dst       receives out_len bytes of digest
+ * out_len   digest length in bytes (at most 32)
+ */
+static void
+groestl_small_close(sph_groestl_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	unsigned char pad[72];
+	size_t u, ptr, pad_len;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+	unsigned z;
+	DECL_STATE_SMALL
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+
+	/*
+	 * First padding byte: keep the n top bits of ub, set the next
+	 * bit to 1 and clear everything below it.
+	 */
+	z = 0x80 >> n;
+	pad[0] = ((ub & -z) | z) & 0xFF;
+
+	/*
+	 * The padding ends with an 8-byte block counter. If fewer than
+	 * 9 bytes remain in the current block, the padding spills into
+	 * one extra block. The encoded counter includes the padding
+	 * block(s).
+	 */
+	if (ptr < 56) {
+		pad_len = 64 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 1);
+#else
+		count_low = SPH_T32(sc->count_low + 1);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low == 0)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	} else {
+		pad_len = 128 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 2);
+#else
+		count_low = SPH_T32(sc->count_low + 2);
+		count_high = SPH_T32(sc->count_high);
+		/* Adding 2 wrapped around iff the result is 0 or 1. */
+		if (count_low <= 1)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	}
+	memset(pad + 1, 0, pad_len - 9);
+#if SPH_64
+	sph_enc64be(pad + pad_len - 8, count);
+#else
+	/*
+	 * The counter is held as two 32-bit halves; each half fills
+	 * exactly four bytes. A 64-bit encoder must not be used here:
+	 * at offset pad_len - 4 it would write past the counter field
+	 * (and past the end of pad[] when pad_len is 72).
+	 */
+	sph_enc32be(pad + pad_len - 8, count_high);
+	sph_enc32be(pad + pad_len - 4, count_low);
+#endif
+	groestl_small_core(sc, pad, pad_len);
+	READ_STATE_SMALL(sc);
+	FINAL_SMALL;
+
+	/* Serialize the second half of the state (32 bytes) into pad[]. */
+#if SPH_GROESTL_64
+	for (u = 0; u < 4; u ++)
+		enc64e(pad + (u << 3), H[u + 4]);
+#else
+	for (u = 0; u < 8; u ++)
+		enc32e(pad + (u << 2), H[u + 8]);
+#endif
+	/* The digest is the last out_len of those 32 bytes. */
+	memcpy(dst, pad + 32 - out_len, out_len);
+	groestl_small_init(sc, (unsigned)out_len << 3);
+}
+
+/*
+ * Put a big-state context into its initial condition for an output
+ * size of 'out_size' bits: empty buffer, zero block counter, and a
+ * state that is all zero except for its last word, which carries
+ * out_size.
+ */
+static void
+groestl_big_init(sph_groestl_big_context *sc, unsigned out_size)
+{
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_low = 0;
+	sc->count_high = 0;
+#endif
+	sc->ptr = 0;
+
+#if SPH_GROESTL_64
+	/* Words 0 to 14 are cleared; word 15 is assigned just below. */
+	memset(sc->state.wide, 0, 15 * sizeof sc->state.wide[0]);
+#if USE_LE
+	/*
+	 * The two low bytes of out_size go, in swapped order, into the
+	 * two most significant bytes of the word.
+	 */
+	sc->state.wide[15] = ((sph_u64)(out_size & 0xFF00) << 40)
+		| ((sph_u64)(out_size & 0xFF) << 56);
+#else
+	sc->state.wide[15] = (sph_u64)out_size;
+#endif
+#else
+	/* Words 0 to 30 are cleared; word 31 is assigned just below. */
+	memset(sc->state.narrow, 0, 31 * sizeof sc->state.narrow[0]);
+#if USE_LE
+	/* Same byte placement as above, within a 32-bit word. */
+	sc->state.narrow[31] = ((sph_u32)(out_size & 0xFF00) << 8)
+		| ((sph_u32)(out_size & 0xFF) << 24);
+#else
+	sc->state.narrow[31] = (sph_u32)out_size;
+#endif
+#endif
+}
+
+/*
+ * Feed 'len' bytes from 'data' into a big-state context. Bytes are
+ * accumulated in sc->buf; each time the buffer fills up, the block is
+ * compressed and the block counter is incremented. Input that does
+ * not fill the buffer is only stored.
+ */
+static void
+groestl_big_core(sph_groestl_big_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src;
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE_BIG
+
+	src = data;
+	buf = sc->buf;
+	ptr = sc->ptr;
+
+	/* Fast path: the buffer will not be completed by this call. */
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, src, len);
+		sc->ptr = ptr + len;
+		return;
+	}
+
+	READ_STATE_BIG(sc);
+	while (len != 0) {
+		size_t take;
+
+		take = (sizeof sc->buf) - ptr;
+		if (len < take)
+			take = len;
+		memcpy(buf + ptr, src, take);
+		src += take;
+		len -= take;
+		ptr += take;
+		if (ptr != sizeof sc->buf)
+			continue;
+
+		/* A full block is available: compress it and count it. */
+		COMPRESS_BIG;
+#if SPH_64
+		sc->count ++;
+#else
+		sc->count_low = SPH_T32(sc->count_low + 1);
+		if (sc->count_low == 0)
+			sc->count_high = SPH_T32(sc->count_high + 1);
+#endif
+		ptr = 0;
+	}
+	WRITE_STATE_BIG(sc);
+	sc->ptr = ptr;
+}
+
+/*
+ * Finish a big-state computation: append the trailing bits and the
+ * padding, apply the output transform, write the digest and reset the
+ * context for a new computation with the same output length.
+ *
+ * sc        context to finalize (re-initialized on return)
+ * ub, n     extra input bits: the n most significant bits of ub are
+ *           appended to the message (n is expected to be 0 to 7)
+ * dst       receives out_len bytes of digest
+ * out_len   digest length in bytes (at most 64)
+ */
+static void
+groestl_big_close(sph_groestl_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	unsigned char pad[136];
+	size_t ptr, pad_len, u;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+	unsigned z;
+	DECL_STATE_BIG
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+
+	/*
+	 * First padding byte: keep the n top bits of ub, set the next
+	 * bit to 1 and clear everything below it.
+	 */
+	z = 0x80 >> n;
+	pad[0] = ((ub & -z) | z) & 0xFF;
+
+	/*
+	 * The padding ends with an 8-byte block counter. If fewer than
+	 * 9 bytes remain in the current block, the padding spills into
+	 * one extra block. The encoded counter includes the padding
+	 * block(s).
+	 */
+	if (ptr < 120) {
+		pad_len = 128 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 1);
+#else
+		count_low = SPH_T32(sc->count_low + 1);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low == 0)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	} else {
+		pad_len = 256 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 2);
+#else
+		count_low = SPH_T32(sc->count_low + 2);
+		count_high = SPH_T32(sc->count_high);
+		/* Adding 2 wrapped around iff the result is 0 or 1. */
+		if (count_low <= 1)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	}
+	memset(pad + 1, 0, pad_len - 9);
+#if SPH_64
+	sph_enc64be(pad + pad_len - 8, count);
+#else
+	/*
+	 * The counter is held as two 32-bit halves; each half fills
+	 * exactly four bytes. A 64-bit encoder must not be used here:
+	 * at offset pad_len - 4 it would write past the counter field
+	 * (and past the end of pad[] when pad_len is 136).
+	 */
+	sph_enc32be(pad + pad_len - 8, count_high);
+	sph_enc32be(pad + pad_len - 4, count_low);
+#endif
+	groestl_big_core(sc, pad, pad_len);
+	READ_STATE_BIG(sc);
+	FINAL_BIG;
+
+	/* Serialize the second half of the state (64 bytes) into pad[]. */
+#if SPH_GROESTL_64
+	for (u = 0; u < 8; u ++)
+		enc64e(pad + (u << 3), H[u + 8]);
+#else
+	for (u = 0; u < 16; u ++)
+		enc32e(pad + (u << 2), H[u + 16]);
+#endif
+	/* The digest is the last out_len of those 64 bytes. */
+	memcpy(dst, pad + 64 - out_len, out_len);
+	groestl_big_init(sc, (unsigned)out_len << 3);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Initializes the context 'cc' for a 224-bit output by delegating to
+ * groestl_small_init().
+ */
+void
+sph_groestl224_init(void *cc)
+{
+	groestl_small_init(cc, 224);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Feeds 'len' bytes from 'data' into the context 'cc' by delegating to
+ * groestl_small_core().
+ */
+void
+sph_groestl224(void *cc, const void *data, size_t len)
+{
+	groestl_small_core(cc, data, len);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Finishes the computation with no extra bits (ub = 0, n = 0) and
+ * requests a 28-byte output into 'dst' from groestl_small_close().
+ */
+void
+sph_groestl224_close(void *cc, void *dst)
+{
+	groestl_small_close(cc, 0, 0, dst, 28);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Finishes the computation, forwarding the extra bits (ub, n), and
+ * requests a 28-byte output into 'dst' from groestl_small_close().
+ */
+void
+sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_small_close(cc, ub, n, dst, 28);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Initializes the context 'cc' for a 256-bit output by delegating to
+ * groestl_small_init().
+ */
+void
+sph_groestl256_init(void *cc)
+{
+	groestl_small_init(cc, 256);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Feeds 'len' bytes from 'data' into the context 'cc' by delegating to
+ * groestl_small_core().
+ */
+void
+sph_groestl256(void *cc, const void *data, size_t len)
+{
+	groestl_small_core(cc, data, len);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Finishes the computation with no extra bits (ub = 0, n = 0) and
+ * requests a 32-byte output into 'dst' from groestl_small_close().
+ */
+void
+sph_groestl256_close(void *cc, void *dst)
+{
+	groestl_small_close(cc, 0, 0, dst, 32);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Finishes the computation, forwarding the extra bits (ub, n), and
+ * requests a 32-byte output into 'dst' from groestl_small_close().
+ */
+void
+sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_small_close(cc, ub, n, dst, 32);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Initializes the context 'cc' for a 384-bit output by delegating to
+ * groestl_big_init().
+ */
+void
+sph_groestl384_init(void *cc)
+{
+	groestl_big_init(cc, 384);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Feeds 'len' bytes from 'data' into the context 'cc' by delegating to
+ * groestl_big_core().
+ */
+void
+sph_groestl384(void *cc, const void *data, size_t len)
+{
+	groestl_big_core(cc, data, len);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Finishes the computation with no extra bits (ub = 0, n = 0) and
+ * requests a 48-byte output into 'dst' from groestl_big_close().
+ */
+void
+sph_groestl384_close(void *cc, void *dst)
+{
+	groestl_big_close(cc, 0, 0, dst, 48);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Finishes the computation, forwarding the extra bits (ub, n), and
+ * requests a 48-byte output into 'dst' from groestl_big_close().
+ */
+void
+sph_groestl384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_big_close(cc, ub, n, dst, 48);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Initializes the context 'cc' for a 512-bit output by delegating to
+ * groestl_big_init().
+ */
+void
+sph_groestl512_init(void *cc)
+{
+	groestl_big_init(cc, 512);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Feeds 'len' bytes from 'data' into the context 'cc' by delegating to
+ * groestl_big_core().
+ */
+void
+sph_groestl512(void *cc, const void *data, size_t len)
+{
+	groestl_big_core(cc, data, len);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Finishes the computation with no extra bits (ub = 0, n = 0) and
+ * requests a 64-byte output into 'dst' from groestl_big_close().
+ */
+void
+sph_groestl512_close(void *cc, void *dst)
+{
+	groestl_big_close(cc, 0, 0, dst, 64);
+}
+
+/*
+ * see sph_groestl.h
+ *
+ * Finishes the computation, forwarding the extra bits (ub, n), and
+ * requests a 64-byte output into 'dst' from groestl_big_close().
+ */
+void
+sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_big_close(cc, ub, n, dst, 64);
+}
diff --git a/sph/jh.c b/sph/jh.c
new file mode 100644
index 00000000..4e266172
--- /dev/null
+++ b/sph/jh.c
@@ -0,0 +1,1107 @@
+/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
+/*
+ * JH implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_jh.h"
+
+/*
+ * The global SPH_SMALL_FOOTPRINT setting selects the JH small-footprint
+ * code unless SPH_SMALL_FOOTPRINT_JH was already defined explicitly.
+ */
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
+#define SPH_SMALL_FOOTPRINT_JH   1
+#endif
+
+/*
+ * SPH_JH_64 selects the implementation based on 64-bit words. It
+ * defaults to 1 when SPH_64_TRUE is set and no explicit value was
+ * given.
+ */
+#if !defined SPH_JH_64 && SPH_64_TRUE
+#define SPH_JH_64   1
+#endif
+
+/* Without a 64-bit type (SPH_64 false), SPH_JH_64 is always removed. */
+#if !SPH_64
+#undef SPH_JH_64
+#endif
+
+/*
+ * MSVC only: disable warning C4146 (unary minus applied to an unsigned
+ * type).
+ */
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal bitslice representation may use either big-endian or
+ * little-endian (true bitslice operations do not care about the bit
+ * ordering, and the bit-swapping linear operations in JH happen to
+ * be invariant through endianness-swapping). The constants must be
+ * defined according to the chosen endianness; we use some
+ * byte-swapping macros for that.
+ */
+
+#if SPH_LITTLE_ENDIAN
+
+/*
+ * Little-endian internal representation: C32e() / C64e() reverse the
+ * byte order of a constant written in big-endian notation, and the
+ * "e" decode/encode names map to the little-endian codecs.
+ */
+#define C32e(x)     ((SPH_C32(x) >> 24) \
+                    | ((SPH_C32(x) >>  8) & SPH_C32(0x0000FF00)) \
+                    | ((SPH_C32(x) <<  8) & SPH_C32(0x00FF0000)) \
+                    | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+#define dec32e_aligned   sph_dec32le_aligned
+#define enc32e           sph_enc32le
+
+/* The 64-bit counterparts exist only when a 64-bit type is available. */
+#if SPH_64
+#define C64e(x)     ((SPH_C64(x) >> 56) \
+                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
+                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
+                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
+                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
+                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
+                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
+                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
+#define dec64e_aligned   sph_dec64le_aligned
+#define enc64e           sph_enc64le
+#endif
+
+#else
+
+/*
+ * Otherwise the constants are used as written, and the "e"
+ * decode/encode names map to the big-endian codecs.
+ */
+#define C32e(x)     SPH_C32(x)
+#define dec32e_aligned   sph_dec32be_aligned
+#define enc32e           sph_enc32be
+#if SPH_64
+#define C64e(x)     SPH_C64(x)
+#define dec64e_aligned   sph_dec64be_aligned
+#define enc64e           sph_enc64be
+#endif
+
+#endif
+
+/*
+ * Nonlinear step on four words x0..x3 (updated in place) using bitwise
+ * AND/OR/NOT/XOR only, parameterized by the constant word c. The
+ * statement order matters: later lines read values produced by earlier
+ * ones. A variable 'tmp' of the word type must exist in the expanding
+ * scope (DECL_STATE declares it).
+ * NOTE(review): presumably the bitsliced JH S-box layer -- confirm
+ * against the JH specification.
+ */
+#define Sb(x0, x1, x2, x3, c)   do { \
+		x3 = ~x3; \
+		x0 ^= (c) & ~x2; \
+		tmp = (c) ^ (x0 & x1); \
+		x0 ^= x2 & x3; \
+		x3 ^= ~x1 & x2; \
+		x1 ^= x0 & x2; \
+		x2 ^= x0 & ~x3; \
+		x0 ^= x1 | x3; \
+		x3 ^= x1 & x2; \
+		x1 ^= tmp & x0; \
+		x2 ^= tmp; \
+	} while (0)
+
+/*
+ * Linear (XOR-only) step on eight words, updated in place: x4..x7 are
+ * first mixed with x0..x3, then x0..x3 are mixed with the new x4..x7.
+ * The statement order matters for that reason.
+ */
+#define Lb(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		x4 ^= x1; \
+		x5 ^= x2; \
+		x6 ^= x3 ^ x0; \
+		x7 ^= x0; \
+		x0 ^= x5; \
+		x1 ^= x6; \
+		x2 ^= x7 ^ x4; \
+		x3 ^= x4; \
+	} while (0)
+
+#if SPH_JH_64
+
+/*
+ * Constant table for the 64-bit implementation: 168 words, each passed
+ * through C64e() so that it matches the internal endianness. The
+ * Ceven_ / Codd_ accessor macros read it in groups of four consecutive
+ * words.
+ */
+static const sph_u64 C[] = {
+	C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
+	C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
+	C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a),
+	C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231),
+	C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410),
+	C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc),
+	C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0),
+	C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3),
+	C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce),
+	C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23),
+	C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8),
+	C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197),
+	C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95),
+	C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214),
+	C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80),
+	C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4),
+	C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989),
+	C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36),
+	C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7),
+	C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f),
+	C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727),
+	C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b),
+	C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e),
+	C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062),
+	C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984),
+	C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5),
+	C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2),
+	C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f),
+	C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465),
+	C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a),
+	C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1),
+	C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf),
+	C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48),
+	C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0),
+	C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134),
+	C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a),
+	C64e(0x88401d63a06cf615), C64e(0x47c1444b8752afff),
+	C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6),
+	C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae),
+	C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567),
+	C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a),
+	C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518),
+	C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446),
+	C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e),
+	C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee),
+	C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001),
+	C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779),
+	C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83),
+	C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a),
+	C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef),
+	C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d),
+	C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65),
+	C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a),
+	C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c),
+	C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d),
+	C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71),
+	C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc),
+	C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0),
+	C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c),
+	C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f),
+	C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751),
+	C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad),
+	C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56),
+	C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6),
+	C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a),
+	C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163),
+	C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826),
+	C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f),
+	C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30),
+	C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a),
+	C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3),
+	C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505),
+	C64e(0xb17681d913326cce), C64e(0x3c175284f805a262),
+	C64e(0xf42bcbb378471547), C64e(0xff46548223936a48),
+	C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e),
+	C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e),
+	C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd),
+	C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7),
+	C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be),
+	C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de),
+	C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9),
+	C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a),
+	C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
+	C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
+};
+
+/*
+ * Accessors into C[] for group index r: entries 4r and 4r+1 are the
+ * "even" hi/lo words, entries 4r+2 and 4r+3 the "odd" hi/lo words.
+ */
+#define Ceven_hi(r)   (C[((r) << 2) + 0])
+#define Ceven_lo(r)   (C[((r) << 2) + 1])
+#define Codd_hi(r)    (C[((r) << 2) + 2])
+#define Codd_lo(r)    (C[((r) << 2) + 3])
+
+/*
+ * Applies Sb to the "h" words and then to the "l" words of x0..x3.
+ * 'cb' is a name prefix (Ceven_ or Codd_) that is token-pasted with
+ * "hi" / "lo" to pick the matching constant accessor for index r.
+ */
+#define S(x0, x1, x2, x3, cb, r)   do { \
+		Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
+		Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
+	} while (0)
+
+/*
+ * Applies Lb to the "h" words and then to the "l" words of x0..x7
+ * (the suffix is token-pasted onto each argument name).
+ */
+#define L(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
+			x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
+		Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
+			x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
+	} while (0)
+
+/*
+ * In both words of x (x##h and x##l), exchanges the bits selected by
+ * mask c with the bits n positions above them: the masked bits move up
+ * by n, and the bits that were n positions higher move down by n.
+ */
+#define Wz(x, c, n)   do { \
+		sph_u64 t = (x ## h & (c)) << (n); \
+		x ## h = ((x ## h >> (n)) & (c)) | t; \
+		t = (x ## l & (c)) << (n); \
+		x ## l = ((x ## l >> (n)) & (c)) | t; \
+	} while (0)
+
+/*
+ * Swap operations of increasing width. W0..W5 swap adjacent groups of
+ * 1, 2, 4, 8, 16 and 32 bits inside each 64-bit word through Wz; W6
+ * swaps the two words x##h and x##l themselves.
+ */
+#define W0(x)   Wz(x, SPH_C64(0x5555555555555555),  1)
+#define W1(x)   Wz(x, SPH_C64(0x3333333333333333),  2)
+#define W2(x)   Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F),  4)
+#define W3(x)   Wz(x, SPH_C64(0x00FF00FF00FF00FF),  8)
+#define W4(x)   Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
+#define W5(x)   Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
+#define W6(x)   do { \
+		sph_u64 t = x ## h; \
+		x ## h = x ## l; \
+		x ## l = t; \
+	} while (0)
+
+/*
+ * Declares the local working state: eight values h0..h7, each split
+ * into an "h" and an "l" 64-bit word, plus the scratch word 'tmp'
+ * that Sb relies on.
+ */
+#define DECL_STATE \
+	sph_u64 h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
+	sph_u64 h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
+	sph_u64 tmp;
+
+/*
+ * Loads the local state from (state)->H.wide[0..15]; consecutive
+ * array entries hold the "h" then the "l" word of h0, h1, ... h7.
+ */
+#define READ_STATE(state)   do { \
+		h0h = (state)->H.wide[ 0]; \
+		h0l = (state)->H.wide[ 1]; \
+		h1h = (state)->H.wide[ 2]; \
+		h1l = (state)->H.wide[ 3]; \
+		h2h = (state)->H.wide[ 4]; \
+		h2l = (state)->H.wide[ 5]; \
+		h3h = (state)->H.wide[ 6]; \
+		h3l = (state)->H.wide[ 7]; \
+		h4h = (state)->H.wide[ 8]; \
+		h4l = (state)->H.wide[ 9]; \
+		h5h = (state)->H.wide[10]; \
+		h5l = (state)->H.wide[11]; \
+		h6h = (state)->H.wide[12]; \
+		h6l = (state)->H.wide[13]; \
+		h7h = (state)->H.wide[14]; \
+		h7l = (state)->H.wide[15]; \
+	} while (0)
+
+/*
+ * Stores the local state back into (state)->H.wide[0..15], using the
+ * same layout as READ_STATE.
+ */
+#define WRITE_STATE(state)   do { \
+		(state)->H.wide[ 0] = h0h; \
+		(state)->H.wide[ 1] = h0l; \
+		(state)->H.wide[ 2] = h1h; \
+		(state)->H.wide[ 3] = h1l; \
+		(state)->H.wide[ 4] = h2h; \
+		(state)->H.wide[ 5] = h2l; \
+		(state)->H.wide[ 6] = h3h; \
+		(state)->H.wide[ 7] = h3l; \
+		(state)->H.wide[ 8] = h4h; \
+		(state)->H.wide[ 9] = h4l; \
+		(state)->H.wide[10] = h5h; \
+		(state)->H.wide[11] = h5l; \
+		(state)->H.wide[12] = h6h; \
+		(state)->H.wide[13] = h6l; \
+		(state)->H.wide[14] = h7h; \
+		(state)->H.wide[15] = h7l; \
+	} while (0)
+
+/*
+ * Decodes the 64 bytes at 'buf' into eight message words (m0h..m3l)
+ * and XORs them into h0..h3. This expands to declarations followed by
+ * statements (no do/while wrapper), so the message words remain
+ * visible to a later INPUT_BUF2 in the same scope.
+ */
+#define INPUT_BUF1 \
+	sph_u64 m0h = dec64e_aligned(buf +  0); \
+	sph_u64 m0l = dec64e_aligned(buf +  8); \
+	sph_u64 m1h = dec64e_aligned(buf + 16); \
+	sph_u64 m1l = dec64e_aligned(buf + 24); \
+	sph_u64 m2h = dec64e_aligned(buf + 32); \
+	sph_u64 m2l = dec64e_aligned(buf + 40); \
+	sph_u64 m3h = dec64e_aligned(buf + 48); \
+	sph_u64 m3l = dec64e_aligned(buf + 56); \
+	h0h ^= m0h; \
+	h0l ^= m0l; \
+	h1h ^= m1h; \
+	h1l ^= m1l; \
+	h2h ^= m2h; \
+	h2l ^= m2l; \
+	h3h ^= m3h; \
+	h3l ^= m3l;
+
+/*
+ * XORs the message words declared by INPUT_BUF1 into h4..h7. Must be
+ * expanded in a scope where INPUT_BUF1 was expanded earlier.
+ */
+#define INPUT_BUF2 \
+	h4h ^= m0h; \
+	h4l ^= m0l; \
+	h5h ^= m1h; \
+	h5l ^= m1l; \
+	h6h ^= m2h; \
+	h6l ^= m2l; \
+	h7h ^= m3h; \
+	h7l ^= m3l;
+
+/*
+ * Sixteen 64-bit words, endian-adjusted through C64e().
+ * NOTE(review): presumably the initial state for the 224-bit output;
+ * its use is outside this part of the file -- confirm.
+ */
+static const sph_u64 IV224[] = {
+	C64e(0x2dfedd62f99a98ac), C64e(0xae7cacd619d634e7),
+	C64e(0xa4831005bc301216), C64e(0xb86038c6c9661494),
+	C64e(0x66d9899f2580706f), C64e(0xce9ea31b1d9b1adc),
+	C64e(0x11e8325f7b366e10), C64e(0xf994857f02fa06c1),
+	C64e(0x1b4f1b5cd8c840b3), C64e(0x97f6a17f6e738099),
+	C64e(0xdcdf93a5adeaa3d3), C64e(0xa431e8dec9539a68),
+	C64e(0x22b4a98aec86a1e4), C64e(0xd574ac959ce56cf0),
+	C64e(0x15960deab5ab2bbf), C64e(0x9611dcf0dd64ea6e)
+};
+
+/*
+ * Sixteen 64-bit words, endian-adjusted through C64e().
+ * NOTE(review): presumably the initial state for the 256-bit output;
+ * its use is outside this part of the file -- confirm.
+ */
+static const sph_u64 IV256[] = {
+	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
+	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
+	C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477),
+	C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8),
+	C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262),
+	C64e(0x277695f776248f94), C64e(0x87d5b6574780296c),
+	C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f),
+	C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769)
+};
+
+/*
+ * Sixteen 64-bit words, endian-adjusted through C64e().
+ * NOTE(review): presumably the initial state for the 384-bit output;
+ * its use is outside this part of the file -- confirm.
+ */
+static const sph_u64 IV384[] = {
+	C64e(0x481e3bc6d813398a), C64e(0x6d3b5e894ade879b),
+	C64e(0x63faea68d480ad2e), C64e(0x332ccb21480f8267),
+	C64e(0x98aec84d9082b928), C64e(0xd455ea3041114249),
+	C64e(0x36f555b2924847ec), C64e(0xc7250a93baf43ce1),
+	C64e(0x569b7f8a27db454c), C64e(0x9efcbd496397af0e),
+	C64e(0x589fc27d26aa80cd), C64e(0x80c08b8c9deb2eda),
+	C64e(0x8a7981e8f8d5373a), C64e(0xf43967adddd17a71),
+	C64e(0xa9b4d3bda475d394), C64e(0x976c3fba9842737f)
+};
+
+/*
+ * Sixteen 64-bit words, endian-adjusted through C64e().
+ * NOTE(review): presumably the initial state for the 512-bit output;
+ * its use is outside this part of the file -- confirm.
+ */
+static const sph_u64 IV512[] = {
+	C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543),
+	C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361),
+	C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80),
+	C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7),
+	C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a),
+	C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199),
+	C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
+	C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
+};
+
+#else
+
+/*
+ * Constant table for the 32-bit implementation: 336 words, each passed
+ * through C32e() so that it matches the internal endianness. The
+ * Ceven_ / Codd_ accessor macros read it in groups of eight
+ * consecutive words.
+ */
+static const sph_u32 C[] = {
+	C32e(0x72d5dea2), C32e(0xdf15f867), C32e(0x7b84150a),
+	C32e(0xb7231557), C32e(0x81abd690), C32e(0x4d5a87f6),
+	C32e(0x4e9f4fc5), C32e(0xc3d12b40), C32e(0xea983ae0),
+	C32e(0x5c45fa9c), C32e(0x03c5d299), C32e(0x66b2999a),
+	C32e(0x660296b4), C32e(0xf2bb538a), C32e(0xb556141a),
+	C32e(0x88dba231), C32e(0x03a35a5c), C32e(0x9a190edb),
+	C32e(0x403fb20a), C32e(0x87c14410), C32e(0x1c051980),
+	C32e(0x849e951d), C32e(0x6f33ebad), C32e(0x5ee7cddc),
+	C32e(0x10ba1392), C32e(0x02bf6b41), C32e(0xdc786515),
+	C32e(0xf7bb27d0), C32e(0x0a2c8139), C32e(0x37aa7850),
+	C32e(0x3f1abfd2), C32e(0x410091d3), C32e(0x422d5a0d),
+	C32e(0xf6cc7e90), C32e(0xdd629f9c), C32e(0x92c097ce),
+	C32e(0x185ca70b), C32e(0xc72b44ac), C32e(0xd1df65d6),
+	C32e(0x63c6fc23), C32e(0x976e6c03), C32e(0x9ee0b81a),
+	C32e(0x2105457e), C32e(0x446ceca8), C32e(0xeef103bb),
+	C32e(0x5d8e61fa), C32e(0xfd9697b2), C32e(0x94838197),
+	C32e(0x4a8e8537), C32e(0xdb03302f), C32e(0x2a678d2d),
+	C32e(0xfb9f6a95), C32e(0x8afe7381), C32e(0xf8b8696c),
+	C32e(0x8ac77246), C32e(0xc07f4214), C32e(0xc5f4158f),
+	C32e(0xbdc75ec4), C32e(0x75446fa7), C32e(0x8f11bb80),
+	C32e(0x52de75b7), C32e(0xaee488bc), C32e(0x82b8001e),
+	C32e(0x98a6a3f4), C32e(0x8ef48f33), C32e(0xa9a36315),
+	C32e(0xaa5f5624), C32e(0xd5b7f989), C32e(0xb6f1ed20),
+	C32e(0x7c5ae0fd), C32e(0x36cae95a), C32e(0x06422c36),
+	C32e(0xce293543), C32e(0x4efe983d), C32e(0x533af974),
+	C32e(0x739a4ba7), C32e(0xd0f51f59), C32e(0x6f4e8186),
+	C32e(0x0e9dad81), C32e(0xafd85a9f), C32e(0xa7050667),
+	C32e(0xee34626a), C32e(0x8b0b28be), C32e(0x6eb91727),
+	C32e(0x47740726), C32e(0xc680103f), C32e(0xe0a07e6f),
+	C32e(0xc67e487b), C32e(0x0d550aa5), C32e(0x4af8a4c0),
+	C32e(0x91e3e79f), C32e(0x978ef19e), C32e(0x86767281),
+	C32e(0x50608dd4), C32e(0x7e9e5a41), C32e(0xf3e5b062),
+	C32e(0xfc9f1fec), C32e(0x4054207a), C32e(0xe3e41a00),
+	C32e(0xcef4c984), C32e(0x4fd794f5), C32e(0x9dfa95d8),
+	C32e(0x552e7e11), C32e(0x24c354a5), C32e(0x5bdf7228),
+	C32e(0xbdfe6e28), C32e(0x78f57fe2), C32e(0x0fa5c4b2),
+	C32e(0x05897cef), C32e(0xee49d32e), C32e(0x447e9385),
+	C32e(0xeb28597f), C32e(0x705f6937), C32e(0xb324314a),
+	C32e(0x5e8628f1), C32e(0x1dd6e465), C32e(0xc71b7704),
+	C32e(0x51b920e7), C32e(0x74fe43e8), C32e(0x23d4878a),
+	C32e(0x7d29e8a3), C32e(0x927694f2), C32e(0xddcb7a09),
+	C32e(0x9b30d9c1), C32e(0x1d1b30fb), C32e(0x5bdc1be0),
+	C32e(0xda24494f), C32e(0xf29c82bf), C32e(0xa4e7ba31),
+	C32e(0xb470bfff), C32e(0x0d324405), C32e(0xdef8bc48),
+	C32e(0x3baefc32), C32e(0x53bbd339), C32e(0x459fc3c1),
+	C32e(0xe0298ba0), C32e(0xe5c905fd), C32e(0xf7ae090f),
+	C32e(0x94703412), C32e(0x4290f134), C32e(0xa271b701),
+	C32e(0xe344ed95), C32e(0xe93b8e36), C32e(0x4f2f984a),
+	C32e(0x88401d63), C32e(0xa06cf615), C32e(0x47c1444b),
+	C32e(0x8752afff), C32e(0x7ebb4af1), C32e(0xe20ac630),
+	C32e(0x4670b6c5), C32e(0xcc6e8ce6), C32e(0xa4d5a456),
+	C32e(0xbd4fca00), C32e(0xda9d844b), C32e(0xc83e18ae),
+	C32e(0x7357ce45), C32e(0x3064d1ad), C32e(0xe8a6ce68),
+	C32e(0x145c2567), C32e(0xa3da8cf2), C32e(0xcb0ee116),
+	C32e(0x33e90658), C32e(0x9a94999a), C32e(0x1f60b220),
+	C32e(0xc26f847b), C32e(0xd1ceac7f), C32e(0xa0d18518),
+	C32e(0x32595ba1), C32e(0x8ddd19d3), C32e(0x509a1cc0),
+	C32e(0xaaa5b446), C32e(0x9f3d6367), C32e(0xe4046bba),
+	C32e(0xf6ca19ab), C32e(0x0b56ee7e), C32e(0x1fb179ea),
+	C32e(0xa9282174), C32e(0xe9bdf735), C32e(0x3b3651ee),
+	C32e(0x1d57ac5a), C32e(0x7550d376), C32e(0x3a46c2fe),
+	C32e(0xa37d7001), C32e(0xf735c1af), C32e(0x98a4d842),
+	C32e(0x78edec20), C32e(0x9e6b6779), C32e(0x41836315),
+	C32e(0xea3adba8), C32e(0xfac33b4d), C32e(0x32832c83),
+	C32e(0xa7403b1f), C32e(0x1c2747f3), C32e(0x5940f034),
+	C32e(0xb72d769a), C32e(0xe73e4e6c), C32e(0xd2214ffd),
+	C32e(0xb8fd8d39), C32e(0xdc5759ef), C32e(0x8d9b0c49),
+	C32e(0x2b49ebda), C32e(0x5ba2d749), C32e(0x68f3700d),
+	C32e(0x7d3baed0), C32e(0x7a8d5584), C32e(0xf5a5e9f0),
+	C32e(0xe4f88e65), C32e(0xa0b8a2f4), C32e(0x36103b53),
+	C32e(0x0ca8079e), C32e(0x753eec5a), C32e(0x91689492),
+	C32e(0x56e8884f), C32e(0x5bb05c55), C32e(0xf8babc4c),
+	C32e(0xe3bb3b99), C32e(0xf387947b), C32e(0x75daf4d6),
+	C32e(0x726b1c5d), C32e(0x64aeac28), C32e(0xdc34b36d),
+	C32e(0x6c34a550), C32e(0xb828db71), C32e(0xf861e2f2),
+	C32e(0x108d512a), C32e(0xe3db6433), C32e(0x59dd75fc),
+	C32e(0x1cacbcf1), C32e(0x43ce3fa2), C32e(0x67bbd13c),
+	C32e(0x02e843b0), C32e(0x330a5bca), C32e(0x8829a175),
+	C32e(0x7f34194d), C32e(0xb416535c), C32e(0x923b94c3),
+	C32e(0x0e794d1e), C32e(0x797475d7), C32e(0xb6eeaf3f),
+	C32e(0xeaa8d4f7), C32e(0xbe1a3921), C32e(0x5cf47e09),
+	C32e(0x4c232751), C32e(0x26a32453), C32e(0xba323cd2),
+	C32e(0x44a3174a), C32e(0x6da6d5ad), C32e(0xb51d3ea6),
+	C32e(0xaff2c908), C32e(0x83593d98), C32e(0x916b3c56),
+	C32e(0x4cf87ca1), C32e(0x7286604d), C32e(0x46e23ecc),
+	C32e(0x086ec7f6), C32e(0x2f9833b3), C32e(0xb1bc765e),
+	C32e(0x2bd666a5), C32e(0xefc4e62a), C32e(0x06f4b6e8),
+	C32e(0xbec1d436), C32e(0x74ee8215), C32e(0xbcef2163),
+	C32e(0xfdc14e0d), C32e(0xf453c969), C32e(0xa77d5ac4),
+	C32e(0x06585826), C32e(0x7ec11416), C32e(0x06e0fa16),
+	C32e(0x7e90af3d), C32e(0x28639d3f), C32e(0xd2c9f2e3),
+	C32e(0x009bd20c), C32e(0x5faace30), C32e(0xb7d40c30),
+	C32e(0x742a5116), C32e(0xf2e03298), C32e(0x0deb30d8),
+	C32e(0xe3cef89a), C32e(0x4bc59e7b), C32e(0xb5f17992),
+	C32e(0xff51e66e), C32e(0x048668d3), C32e(0x9b234d57),
+	C32e(0xe6966731), C32e(0xcce6a6f3), C32e(0x170a7505),
+	C32e(0xb17681d9), C32e(0x13326cce), C32e(0x3c175284),
+	C32e(0xf805a262), C32e(0xf42bcbb3), C32e(0x78471547),
+	C32e(0xff465482), C32e(0x23936a48), C32e(0x38df5807),
+	C32e(0x4e5e6565), C32e(0xf2fc7c89), C32e(0xfc86508e),
+	C32e(0x31702e44), C32e(0xd00bca86), C32e(0xf04009a2),
+	C32e(0x3078474e), C32e(0x65a0ee39), C32e(0xd1f73883),
+	C32e(0xf75ee937), C32e(0xe42c3abd), C32e(0x2197b226),
+	C32e(0x0113f86f), C32e(0xa344edd1), C32e(0xef9fdee7),
+	C32e(0x8ba0df15), C32e(0x762592d9), C32e(0x3c85f7f6),
+	C32e(0x12dc42be), C32e(0xd8a7ec7c), C32e(0xab27b07e),
+	C32e(0x538d7dda), C32e(0xaa3ea8de), C32e(0xaa25ce93),
+	C32e(0xbd0269d8), C32e(0x5af643fd), C32e(0x1a7308f9),
+	C32e(0xc05fefda), C32e(0x174a19a5), C32e(0x974d6633),
+	C32e(0x4cfd216a), C32e(0x35b49831), C32e(0xdb411570),
+	C32e(0xea1e0fbb), C32e(0xedcd549b), C32e(0x9ad063a1),
+	C32e(0x51974072), C32e(0xf6759dbf), C32e(0x91476fe2)
+};
+
+/*
+ * Accessors into C[] for group index r: entries 8r .. 8r+3 are the
+ * "even" words w3..w0, entries 8r+4 .. 8r+7 the "odd" words w3..w0.
+ */
+#define Ceven_w3(r)   (C[((r) << 3) + 0])
+#define Ceven_w2(r)   (C[((r) << 3) + 1])
+#define Ceven_w1(r)   (C[((r) << 3) + 2])
+#define Ceven_w0(r)   (C[((r) << 3) + 3])
+#define Codd_w3(r)    (C[((r) << 3) + 4])
+#define Codd_w2(r)    (C[((r) << 3) + 5])
+#define Codd_w1(r)    (C[((r) << 3) + 6])
+#define Codd_w0(r)    (C[((r) << 3) + 7])
+
+/*
+ * Applies Sb to each of the four 32-bit words (suffixes 3, 2, 1, 0) of
+ * x0..x3. 'cb' is a name prefix (Ceven_ or Codd_) that is token-pasted
+ * with "w3".."w0" to pick the matching constant accessor for index r.
+ */
+#define S(x0, x1, x2, x3, cb, r)   do { \
+		Sb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, cb ## w3(r)); \
+		Sb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, cb ## w2(r)); \
+		Sb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, cb ## w1(r)); \
+		Sb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, cb ## w0(r)); \
+	} while (0)
+
+/*
+ * Applies Lb to each of the four 32-bit words (suffixes 3, 2, 1, 0) of
+ * x0..x7 (the suffix is token-pasted onto each argument name).
+ */
+#define L(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		Lb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, \
+			x4 ## 3, x5 ## 3, x6 ## 3, x7 ## 3); \
+		Lb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, \
+			x4 ## 2, x5 ## 2, x6 ## 2, x7 ## 2); \
+		Lb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, \
+			x4 ## 1, x5 ## 1, x6 ## 1, x7 ## 1); \
+		Lb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, \
+			x4 ## 0, x5 ## 0, x6 ## 0, x7 ## 0); \
+	} while (0)
+
+/*
+ * In each of the four words of x (suffixes 3, 2, 1, 0), exchanges the
+ * bits selected by mask c with the bits n positions above them: the
+ * masked bits move up by n, and the bits that were n positions higher
+ * move down by n.
+ */
+#define Wz(x, c, n)   do { \
+		sph_u32 t = (x ## 3 & (c)) << (n); \
+		x ## 3 = ((x ## 3 >> (n)) & (c)) | t; \
+		t = (x ## 2 & (c)) << (n); \
+		x ## 2 = ((x ## 2 >> (n)) & (c)) | t; \
+		t = (x ## 1 & (c)) << (n); \
+		x ## 1 = ((x ## 1 >> (n)) & (c)) | t; \
+		t = (x ## 0 & (c)) << (n); \
+		x ## 0 = ((x ## 0 >> (n)) & (c)) | t; \
+	} while (0)
+
+/*
+ * Swap operations of increasing width. W0..W4 swap adjacent groups of
+ * 1, 2, 4, 8 and 16 bits inside each 32-bit word through Wz. W5 swaps
+ * whole words pairwise (3 with 2, 1 with 0); W6 swaps word pairs
+ * (3 with 1, 2 with 0).
+ */
+#define W0(x)   Wz(x, SPH_C32(0x55555555),  1)
+#define W1(x)   Wz(x, SPH_C32(0x33333333),  2)
+#define W2(x)   Wz(x, SPH_C32(0x0F0F0F0F),  4)
+#define W3(x)   Wz(x, SPH_C32(0x00FF00FF),  8)
+#define W4(x)   Wz(x, SPH_C32(0x0000FFFF), 16)
+#define W5(x)   do { \
+		sph_u32 t = x ## 3; \
+		x ## 3 = x ## 2; \
+		x ## 2 = t; \
+		t = x ## 1; \
+		x ## 1 = x ## 0; \
+		x ## 0 = t; \
+	} while (0)
+#define W6(x)   do { \
+		sph_u32 t = x ## 3; \
+		x ## 3 = x ## 1; \
+		x ## 1 = t; \
+		t = x ## 2; \
+		x ## 2 = x ## 0; \
+		x ## 0 = t; \
+	} while (0)
+
+/*
+ * Declares the local working state: eight values h0..h7, each split
+ * into four 32-bit words (suffixes 3, 2, 1, 0), plus the scratch word
+ * 'tmp' that Sb relies on.
+ */
+#define DECL_STATE \
+	sph_u32 h03, h02, h01, h00, h13, h12, h11, h10; \
+	sph_u32 h23, h22, h21, h20, h33, h32, h31, h30; \
+	sph_u32 h43, h42, h41, h40, h53, h52, h51, h50; \
+	sph_u32 h63, h62, h61, h60, h73, h72, h71, h70; \
+	sph_u32 tmp;
+
+/*
+ * Loads the local state from (state)->H.narrow[0..31]; consecutive
+ * array entries hold words 3, 2, 1, 0 of h0, then of h1, ... up to h7.
+ */
+#define READ_STATE(state)   do { \
+		h03 = (state)->H.narrow[ 0]; \
+		h02 = (state)->H.narrow[ 1]; \
+		h01 = (state)->H.narrow[ 2]; \
+		h00 = (state)->H.narrow[ 3]; \
+		h13 = (state)->H.narrow[ 4]; \
+		h12 = (state)->H.narrow[ 5]; \
+		h11 = (state)->H.narrow[ 6]; \
+		h10 = (state)->H.narrow[ 7]; \
+		h23 = (state)->H.narrow[ 8]; \
+		h22 = (state)->H.narrow[ 9]; \
+		h21 = (state)->H.narrow[10]; \
+		h20 = (state)->H.narrow[11]; \
+		h33 = (state)->H.narrow[12]; \
+		h32 = (state)->H.narrow[13]; \
+		h31 = (state)->H.narrow[14]; \
+		h30 = (state)->H.narrow[15]; \
+		h43 = (state)->H.narrow[16]; \
+		h42 = (state)->H.narrow[17]; \
+		h41 = (state)->H.narrow[18]; \
+		h40 = (state)->H.narrow[19]; \
+		h53 = (state)->H.narrow[20]; \
+		h52 = (state)->H.narrow[21]; \
+		h51 = (state)->H.narrow[22]; \
+		h50 = (state)->H.narrow[23]; \
+		h63 = (state)->H.narrow[24]; \
+		h62 = (state)->H.narrow[25]; \
+		h61 = (state)->H.narrow[26]; \
+		h60 = (state)->H.narrow[27]; \
+		h73 = (state)->H.narrow[28]; \
+		h72 = (state)->H.narrow[29]; \
+		h71 = (state)->H.narrow[30]; \
+		h70 = (state)->H.narrow[31]; \
+	} while (0)
+
+/*
+ * Stores the local state back into (state)->H.narrow[0..31], using the
+ * same layout as READ_STATE.
+ */
+#define WRITE_STATE(state)   do { \
+		(state)->H.narrow[ 0] = h03; \
+		(state)->H.narrow[ 1] = h02; \
+		(state)->H.narrow[ 2] = h01; \
+		(state)->H.narrow[ 3] = h00; \
+		(state)->H.narrow[ 4] = h13; \
+		(state)->H.narrow[ 5] = h12; \
+		(state)->H.narrow[ 6] = h11; \
+		(state)->H.narrow[ 7] = h10; \
+		(state)->H.narrow[ 8] = h23; \
+		(state)->H.narrow[ 9] = h22; \
+		(state)->H.narrow[10] = h21; \
+		(state)->H.narrow[11] = h20; \
+		(state)->H.narrow[12] = h33; \
+		(state)->H.narrow[13] = h32; \
+		(state)->H.narrow[14] = h31; \
+		(state)->H.narrow[15] = h30; \
+		(state)->H.narrow[16] = h43; \
+		(state)->H.narrow[17] = h42; \
+		(state)->H.narrow[18] = h41; \
+		(state)->H.narrow[19] = h40; \
+		(state)->H.narrow[20] = h53; \
+		(state)->H.narrow[21] = h52; \
+		(state)->H.narrow[22] = h51; \
+		(state)->H.narrow[23] = h50; \
+		(state)->H.narrow[24] = h63; \
+		(state)->H.narrow[25] = h62; \
+		(state)->H.narrow[26] = h61; \
+		(state)->H.narrow[27] = h60; \
+		(state)->H.narrow[28] = h73; \
+		(state)->H.narrow[29] = h72; \
+		(state)->H.narrow[30] = h71; \
+		(state)->H.narrow[31] = h70; \
+	} while (0)
+
+#define INPUT_BUF1 /* declares m00..m33 from buf[0..63] and XORs them into h00..h33; expands to declarations, not one statement */ \
+	sph_u32 m03 = dec32e_aligned(buf +  0); \
+	sph_u32 m02 = dec32e_aligned(buf +  4); \
+	sph_u32 m01 = dec32e_aligned(buf +  8); \
+	sph_u32 m00 = dec32e_aligned(buf + 12); \
+	sph_u32 m13 = dec32e_aligned(buf + 16); \
+	sph_u32 m12 = dec32e_aligned(buf + 20); \
+	sph_u32 m11 = dec32e_aligned(buf + 24); \
+	sph_u32 m10 = dec32e_aligned(buf + 28); \
+	sph_u32 m23 = dec32e_aligned(buf + 32); \
+	sph_u32 m22 = dec32e_aligned(buf + 36); \
+	sph_u32 m21 = dec32e_aligned(buf + 40); \
+	sph_u32 m20 = dec32e_aligned(buf + 44); \
+	sph_u32 m33 = dec32e_aligned(buf + 48); \
+	sph_u32 m32 = dec32e_aligned(buf + 52); \
+	sph_u32 m31 = dec32e_aligned(buf + 56); \
+	sph_u32 m30 = dec32e_aligned(buf + 60); \
+	h03 ^= m03; \
+	h02 ^= m02; \
+	h01 ^= m01; \
+	h00 ^= m00; \
+	h13 ^= m13; \
+	h12 ^= m12; \
+	h11 ^= m11; \
+	h10 ^= m10; \
+	h23 ^= m23; \
+	h22 ^= m22; \
+	h21 ^= m21; \
+	h20 ^= m20; \
+	h33 ^= m33; \
+	h32 ^= m32; \
+	h31 ^= m31; \
+	h30 ^= m30;
+
+#define INPUT_BUF2 /* XORs the m00..m33 declared by INPUT_BUF1 (must be in the same scope) into h40..h73 */ \
+	h43 ^= m03; \
+	h42 ^= m02; \
+	h41 ^= m01; \
+	h40 ^= m00; \
+	h53 ^= m13; \
+	h52 ^= m12; \
+	h51 ^= m11; \
+	h50 ^= m10; \
+	h63 ^= m23; \
+	h62 ^= m22; \
+	h61 ^= m21; \
+	h60 ^= m20; \
+	h73 ^= m33; \
+	h72 ^= m32; \
+	h71 ^= m31; \
+	h70 ^= m30;
+
+static const sph_u32 IV224[] = {	/* 32 words; handed to jh_init()/jh_close() by the sph_jh224_* functions */
+	C32e(0x2dfedd62), C32e(0xf99a98ac), C32e(0xae7cacd6), C32e(0x19d634e7),
+	C32e(0xa4831005), C32e(0xbc301216), C32e(0xb86038c6), C32e(0xc9661494),
+	C32e(0x66d9899f), C32e(0x2580706f), C32e(0xce9ea31b), C32e(0x1d9b1adc),
+	C32e(0x11e8325f), C32e(0x7b366e10), C32e(0xf994857f), C32e(0x02fa06c1),
+	C32e(0x1b4f1b5c), C32e(0xd8c840b3), C32e(0x97f6a17f), C32e(0x6e738099),
+	C32e(0xdcdf93a5), C32e(0xadeaa3d3), C32e(0xa431e8de), C32e(0xc9539a68),
+	C32e(0x22b4a98a), C32e(0xec86a1e4), C32e(0xd574ac95), C32e(0x9ce56cf0),
+	C32e(0x15960dea), C32e(0xb5ab2bbf), C32e(0x9611dcf0), C32e(0xdd64ea6e)
+};
+
+static const sph_u32 IV256[] = {	/* 32 words; handed to jh_init()/jh_close() by the sph_jh256_* functions */
+	C32e(0xeb98a341), C32e(0x2c20d3eb), C32e(0x92cdbe7b), C32e(0x9cb245c1),
+	C32e(0x1c935191), C32e(0x60d4c7fa), C32e(0x260082d6), C32e(0x7e508a03),
+	C32e(0xa4239e26), C32e(0x7726b945), C32e(0xe0fb1a48), C32e(0xd41a9477),
+	C32e(0xcdb5ab26), C32e(0x026b177a), C32e(0x56f02442), C32e(0x0fff2fa8),
+	C32e(0x71a39689), C32e(0x7f2e4d75), C32e(0x1d144908), C32e(0xf77de262),
+	C32e(0x277695f7), C32e(0x76248f94), C32e(0x87d5b657), C32e(0x4780296c),
+	C32e(0x5c5e272d), C32e(0xac8e0d6c), C32e(0x518450c6), C32e(0x57057a0f),
+	C32e(0x7be4d367), C32e(0x702412ea), C32e(0x89e3ab13), C32e(0xd31cd769)
+};
+
+static const sph_u32 IV384[] = {	/* 32 words; handed to jh_init()/jh_close() by the sph_jh384_* functions */
+	C32e(0x481e3bc6), C32e(0xd813398a), C32e(0x6d3b5e89), C32e(0x4ade879b),
+	C32e(0x63faea68), C32e(0xd480ad2e), C32e(0x332ccb21), C32e(0x480f8267),
+	C32e(0x98aec84d), C32e(0x9082b928), C32e(0xd455ea30), C32e(0x41114249),
+	C32e(0x36f555b2), C32e(0x924847ec), C32e(0xc7250a93), C32e(0xbaf43ce1),
+	C32e(0x569b7f8a), C32e(0x27db454c), C32e(0x9efcbd49), C32e(0x6397af0e),
+	C32e(0x589fc27d), C32e(0x26aa80cd), C32e(0x80c08b8c), C32e(0x9deb2eda),
+	C32e(0x8a7981e8), C32e(0xf8d5373a), C32e(0xf43967ad), C32e(0xddd17a71),
+	C32e(0xa9b4d3bd), C32e(0xa475d394), C32e(0x976c3fba), C32e(0x9842737f)
+};
+
+static const sph_u32 IV512[] = {	/* 32 words; handed to jh_init()/jh_close() by the sph_jh512_* functions */
+	C32e(0x6fd14b96), C32e(0x3e00aa17), C32e(0x636a2e05), C32e(0x7a15d543),
+	C32e(0x8a225e8d), C32e(0x0c97ef0b), C32e(0xe9341259), C32e(0xf2b3c361),
+	C32e(0x891da0c1), C32e(0x536f801e), C32e(0x2aa9056b), C32e(0xea2b6d80),
+	C32e(0x588eccdb), C32e(0x2075baa6), C32e(0xa90f3a76), C32e(0xbaf83bf7),
+	C32e(0x0169e605), C32e(0x41e34a69), C32e(0x46b58a8e), C32e(0x2e6fe65a),
+	C32e(0x1047a7d0), C32e(0xc1843c24), C32e(0x3b6e71b1), C32e(0x2d5ac199),
+	C32e(0xcf57f6ec), C32e(0x9db1f856), C32e(0xa706887c), C32e(0x5716b156),
+	C32e(0xe3c2fcdf), C32e(0xe68517fb), C32e(0x545a4678), C32e(0xcc8cdd4b)
+};
+
+#endif
+
+#define SL(ro)   SLu(r + ro, ro)	/* round number (r + ro); relies on a variable 'r' in the expanding scope (the E8 loop counter) */
+
+#define SLu(r, ro)   do { /* one round: S on (h0,h2,h4,h6) and (h1,h3,h5,h7) with index r, then L, then W<ro> on h1, h3, h5, h7 */ \
+		S(h0, h2, h4, h6, Ceven_, r); \
+		S(h1, h3, h5, h7, Codd_, r); \
+		L(h0, h2, h4, h6, h1, h3, h5, h7); \
+		W ## ro(h1); \
+		W ## ro(h3); \
+		W ## ro(h5); \
+		W ## ro(h7); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_JH
+
+#if SPH_JH_64
+
+/*
+ * The "small footprint" 64-bit version just uses a partially unrolled
+ * loop.
+ */
+
+#define E8   do { /* 42 rounds: 6 loop iterations, each running SL(0)..SL(6) */ \
+		unsigned r; \
+		for (r = 0; r < 42; r += 7) { \
+			SL(0); \
+			SL(1); \
+			SL(2); \
+			SL(3); \
+			SL(4); \
+			SL(5); \
+			SL(6); \
+		} \
+	} while (0)
+
+#else
+
+#define E8   do { /* 42 rounds, one per loop iteration; g counts r modulo 7 and selects W0..W6 */ \
+		unsigned r, g; \
+		for (r = g = 0; r < 42; r ++) { \
+			S(h0, h2, h4, h6, Ceven_, r); \
+			S(h1, h3, h5, h7, Codd_, r); \
+			L(h0, h2, h4, h6, h1, h3, h5, h7); \
+			switch (g) { \
+			case 0: \
+				W0(h1); \
+				W0(h3); \
+				W0(h5); \
+				W0(h7); \
+				break; \
+			case 1: \
+				W1(h1); \
+				W1(h3); \
+				W1(h5); \
+				W1(h7); \
+				break; \
+			case 2: \
+				W2(h1); \
+				W2(h3); \
+				W2(h5); \
+				W2(h7); \
+				break; \
+			case 3: \
+				W3(h1); \
+				W3(h3); \
+				W3(h5); \
+				W3(h7); \
+				break; \
+			case 4: \
+				W4(h1); \
+				W4(h3); \
+				W4(h5); \
+				W4(h7); \
+				break; \
+			case 5: \
+				W5(h1); \
+				W5(h3); \
+				W5(h5); \
+				W5(h7); \
+				break; \
+			case 6: \
+				W6(h1); \
+				W6(h3); \
+				W6(h5); \
+				W6(h7); \
+				break; \
+			} \
+			if (++ g == 7) \
+				g = 0; \
+		} \
+	} while (0)
+
+#endif
+
+#else
+
+#if SPH_JH_64
+
+/*
+ * On a "true 64-bit" architecture, we can unroll at will.
+ */
+
+#define E8   do { /* 42 fully unrolled rounds; second argument = round number modulo 7 (selects W0..W6) */ \
+		SLu( 0, 0); \
+		SLu( 1, 1); \
+		SLu( 2, 2); \
+		SLu( 3, 3); \
+		SLu( 4, 4); \
+		SLu( 5, 5); \
+		SLu( 6, 6); \
+		SLu( 7, 0); \
+		SLu( 8, 1); \
+		SLu( 9, 2); \
+		SLu(10, 3); \
+		SLu(11, 4); \
+		SLu(12, 5); \
+		SLu(13, 6); \
+		SLu(14, 0); \
+		SLu(15, 1); \
+		SLu(16, 2); \
+		SLu(17, 3); \
+		SLu(18, 4); \
+		SLu(19, 5); \
+		SLu(20, 6); \
+		SLu(21, 0); \
+		SLu(22, 1); \
+		SLu(23, 2); \
+		SLu(24, 3); \
+		SLu(25, 4); \
+		SLu(26, 5); \
+		SLu(27, 6); \
+		SLu(28, 0); \
+		SLu(29, 1); \
+		SLu(30, 2); \
+		SLu(31, 3); \
+		SLu(32, 4); \
+		SLu(33, 5); \
+		SLu(34, 6); \
+		SLu(35, 0); \
+		SLu(36, 1); \
+		SLu(37, 2); \
+		SLu(38, 3); \
+		SLu(39, 4); \
+		SLu(40, 5); \
+		SLu(41, 6); \
+	} while (0)
+
+#else
+
+/*
+ * We are not aiming at a small footprint, but we are still using a
+ * 32-bit implementation. Full loop unrolling would smash the L1
+ * cache on some "big" architectures (32 kB L1 cache).
+ */
+
+#define E8   do { /* 42 rounds: 6 loop iterations, each running SL(0)..SL(6) */ \
+		unsigned r; \
+		for (r = 0; r < 42; r += 7) { \
+			SL(0); \
+			SL(1); \
+			SL(2); \
+			SL(3); \
+			SL(4); \
+			SL(5); \
+			SL(6); \
+		} \
+	} while (0)
+
+#endif
+
+#endif
+
+static void	/* put 'sc' in the start-of-message state for the IV at 'iv' */
+jh_init(sph_jh_context *sc, const void *iv)
+{
+#if SPH_64
+	sc->block_count = 0;		/* no block processed yet */
+#else
+	sc->block_count_low = 0;	/* no block processed yet */
+	sc->block_count_high = 0;
+#endif
+#if SPH_JH_64
+	memcpy(sc->H.wide, iv, sizeof sc->H.wide);
+#else
+	memcpy(sc->H.narrow, iv, sizeof sc->H.narrow);
+#endif
+	sc->ptr = 0;			/* nothing buffered */
+}
+
+static void	/* absorb 'len' bytes of 'data' into 'sc', processing every completed block */
+jh_core(sph_jh_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		if (len > 0)	/* memcpy() with a null 'data' is undefined, even for 0 bytes */
+			memcpy(buf + ptr, data, len);
+		sc->ptr = ptr + len;	/* block still incomplete: only buffer the input */
+		return;
+	}
+
+	READ_STATE(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;	/* room left in the block buffer */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			INPUT_BUF1;	/* XOR the block into the state before E8 */
+			E8;
+			INPUT_BUF2;	/* XOR the same block into the state after E8 */
+#if SPH_64
+			sc->block_count ++;
+#else
+			if ((sc->block_count_low = SPH_T32(
+				sc->block_count_low + 1)) == 0)
+				sc->block_count_high ++;	/* carry into the high word */
+#endif
+			ptr = 0;
+		}
+	}
+	WRITE_STATE(sc);
+	sc->ptr = ptr;	/* bytes left buffered for the next call */
+}
+
+static void	/* pad and absorb the message tail, write the digest to 'dst', re-init 'sc' */
+jh_close(sph_jh_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32, const void *iv)
+{
+	unsigned char pad[128];
+	size_t zeros, i;
+	unsigned marker;
+#if SPH_64
+	sph_u64 len0, len1;
+#else
+	sph_u32 len0, len1, len2, len3;
+#endif
+
+	/*
+	 * First byte: the n extra bits taken from the top of ub, then a 1 bit.
+	 * Next come 'zeros' zero bytes and a 16-byte message bit length.
+	 */
+	marker = 0x80 >> n;
+	pad[0] = ((ub & -marker) | marker) & 0xFF;
+	zeros = (sc->ptr == 0 && n == 0) ? 47 : 111 - sc->ptr;
+	memset(pad + 1, 0, zeros);
+#if SPH_64
+	len0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3) + n;
+	len1 = SPH_T64(sc->block_count >> 55);
+	sph_enc64be(pad + zeros + 1, len1);
+	sph_enc64be(pad + zeros + 9, len0);
+#else
+	len0 = SPH_T32(sc->block_count_low << 9) + (sc->ptr << 3) + n;
+	len1 = SPH_T32(sc->block_count_low >> 23)
+		+ SPH_T32(sc->block_count_high << 9);
+	len2 = SPH_T32(sc->block_count_high >> 23);
+	len3 = 0;
+	sph_enc32be(pad + zeros +  1, len3);
+	sph_enc32be(pad + zeros +  5, len2);
+	sph_enc32be(pad + zeros +  9, len1);
+	sph_enc32be(pad + zeros + 13, len0);
+#endif
+	jh_core(sc, pad, zeros + 17);
+#if SPH_JH_64
+	for (i = 0; i < 8; i ++)
+		enc64e(pad + (i << 3), sc->H.wide[i + 8]);
+#else
+	for (i = 0; i < 16; i ++)
+		enc32e(pad + (i << 2), sc->H.narrow[i + 16]);
+#endif
+	memcpy(dst, pad + ((16 - out_size_w32) << 2), out_size_w32 << 2);	/* last out_size_w32 words */
+	jh_init(sc, iv);
+}
+
+/* see sph_jh.h; resets the context 'cc' through jh_init() with IV224 */
+void
+sph_jh224_init(void *cc)
+{
+	jh_init(cc, IV224);
+}
+
+/* see sph_jh.h; feeds 'len' bytes of 'data' into the context 'cc' through jh_core() */
+void
+sph_jh224(void *cc, const void *data, size_t len)
+{
+	jh_core(cc, data, len);
+}
+
+/* see sph_jh.h; finishes with no extra bits, writes 7 words (28 bytes) to 'dst', re-inits 'cc' with IV224 */
+void
+sph_jh224_close(void *cc, void *dst)
+{
+	jh_close(cc, 0, 0, dst, 7, IV224);
+}
+
+/* see sph_jh.h; like sph_jh224_close(), but passes 'ub' and 'n' (extra bits and their count) to jh_close() */
+void
+sph_jh224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	jh_close(cc, ub, n, dst, 7, IV224);
+}
+
+/* see sph_jh.h; resets the context 'cc' through jh_init() with IV256 */
+void
+sph_jh256_init(void *cc)
+{
+	jh_init(cc, IV256);
+}
+
+/* see sph_jh.h; feeds 'len' bytes of 'data' into the context 'cc' through jh_core() */
+void
+sph_jh256(void *cc, const void *data, size_t len)
+{
+	jh_core(cc, data, len);
+}
+
+/* see sph_jh.h; finishes with no extra bits, writes 8 words (32 bytes) to 'dst', re-inits 'cc' with IV256 */
+void
+sph_jh256_close(void *cc, void *dst)
+{
+	jh_close(cc, 0, 0, dst, 8, IV256);
+}
+
+/* see sph_jh.h; like sph_jh256_close(), but passes 'ub' and 'n' (extra bits and their count) to jh_close() */
+void
+sph_jh256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	jh_close(cc, ub, n, dst, 8, IV256);
+}
+
+/* see sph_jh.h; resets the context 'cc' through jh_init() with IV384 */
+void
+sph_jh384_init(void *cc)
+{
+	jh_init(cc, IV384);
+}
+
+/* see sph_jh.h; feeds 'len' bytes of 'data' into the context 'cc' through jh_core() */
+void
+sph_jh384(void *cc, const void *data, size_t len)
+{
+	jh_core(cc, data, len);
+}
+
+/* see sph_jh.h; finishes with no extra bits, writes 12 words (48 bytes) to 'dst', re-inits 'cc' with IV384 */
+void
+sph_jh384_close(void *cc, void *dst)
+{
+	jh_close(cc, 0, 0, dst, 12, IV384);
+}
+
+/* see sph_jh.h; like sph_jh384_close(), but passes 'ub' and 'n' (extra bits and their count) to jh_close() */
+void
+sph_jh384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	jh_close(cc, ub, n, dst, 12, IV384);
+}
+
+/* see sph_jh.h; resets the context 'cc' through jh_init() with IV512 */
+void
+sph_jh512_init(void *cc)
+{
+	jh_init(cc, IV512);
+}
+
+/* see sph_jh.h; feeds 'len' bytes of 'data' into the context 'cc' through jh_core() */
+void
+sph_jh512(void *cc, const void *data, size_t len)
+{
+	jh_core(cc, data, len);
+}
+
+/* see sph_jh.h; finishes with no extra bits, writes 16 words (64 bytes) to 'dst', re-inits 'cc' with IV512 */
+void
+sph_jh512_close(void *cc, void *dst)
+{
+	jh_close(cc, 0, 0, dst, 16, IV512);
+}
+
+/* see sph_jh.h; like sph_jh512_close(), but passes 'ub' and 'n' (extra bits and their count) to jh_close() */
+void
+sph_jh512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	jh_close(cc, ub, n, dst, 16, IV512);
+}
diff --git a/sph/keccak.c b/sph/keccak.c
new file mode 100644
index 00000000..8dc74759
--- /dev/null
+++ b/sph/keccak.c
@@ -0,0 +1,1815 @@
+/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */
+/*
+ * Keccak implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_keccak.h"
+
+/*
+ * Parameters:
+ *
+ *  SPH_KECCAK_64          use a 64-bit type
+ *  SPH_KECCAK_UNROLL      number of rounds to unroll (0 for full unroll; if undefined, defaults to 8, or 2 for small footprint)
+ *  SPH_KECCAK_INTERLEAVE  use bit-interleaving (32-bit type only)
+ *  SPH_KECCAK_NOCOPY      do not copy the state into local variables
+ * 
+ * If there is no usable 64-bit type, the code automatically switches
+ * back to the 32-bit implementation.
+ *
+ * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1
+ * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core
+ * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302,
+ * 8 kB L1 code cache), seem to show that the following are optimal:
+ *
+ * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds,
+ * do not copy the state; unrolling 2, 6 or all rounds also provides
+ * near-optimal performance.
+ * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds,
+ * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds
+ * also provides near-optimal performance.
+ * -- PowerPC: use the 64-bit implementation, unroll 8 rounds,
+ * copy the state. Unrolling 4 or 6 rounds is near-optimal.
+ * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds,
+ * copy the state.
+ * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy
+ * the state. Unrolling only 1 round is also near-optimal.
+ *
+ * Also, interleaving does not always yield actual improvements when
+ * using a 32-bit implementation; in particular when the architecture
+ * does not offer a native rotation opcode (interleaving replaces one
+ * 64-bit rotation with two 32-bit rotations, which is a gain only if
+ * there is a native 32-bit rotation opcode and not a native 64-bit
+ * rotation opcode; also, interleaving implies a small overhead when
+ * processing input words).
+ *
+ * To sum up:
+ * -- when possible, use the 64-bit code
+ * -- exception: on 32-bit x86, use 32-bit code
+ * -- when using 32-bit code, use interleaving
+ * -- copy the state, except on x86
+ * -- unroll 8 rounds on "big" machines, 2 rounds on "small" machines
+ */
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK /* inherit the global small-footprint setting unless overridden */
+#define SPH_SMALL_FOOTPRINT_KECCAK   1
+#endif
+
+/*
+ * By default, we select the 64-bit implementation if a 64-bit type
+ * is available, unless a 32-bit x86 is detected.
+ */
+#if !defined SPH_KECCAK_64 && SPH_64 \
+	&& !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC)
+#define SPH_KECCAK_64   1
+#endif
+
+/*
+ * If using a 32-bit implementation, we prefer to interleave.
+ */
+#if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE
+#define SPH_KECCAK_INTERLEAVE   1
+#endif
+
+/*
+ * Unroll 8 rounds on big systems, 2 rounds on small systems.
+ */
+#ifndef SPH_KECCAK_UNROLL
+#if SPH_SMALL_FOOTPRINT_KECCAK
+#define SPH_KECCAK_UNROLL   2
+#else
+#define SPH_KECCAK_UNROLL   8
+#endif
+#endif
+
+/*
+ * We do not want to copy the state to local variables on x86 (32-bit
+ * and 64-bit alike).
+ */
+#ifndef SPH_KECCAK_NOCOPY
+#if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC
+#define SPH_KECCAK_NOCOPY   1
+#else
+#define SPH_KECCAK_NOCOPY   0
+#endif
+#endif
+
+#ifdef _MSC_VER	/* MSVC only: silence warning C4146 (unary minus applied to an unsigned type) */
+#pragma warning (disable: 4146)
+#endif
+
+#if SPH_KECCAK_64
+
+static const sph_u64 RC[] = {	/* 24 64-bit round constants (64-bit implementation) */
+	SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
+	SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
+	SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
+	SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
+	SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
+	SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
+	SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
+	SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
+	SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
+	SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
+	SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
+	SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
+};
+
+#if SPH_KECCAK_NOCOPY
+
+#define a00   (kc->u.wide[ 0])	/* no-copy mode: lane aXY aliases kc->u.wide[X + 5 * Y]; needs 'kc' in scope */
+#define a10   (kc->u.wide[ 1])
+#define a20   (kc->u.wide[ 2])
+#define a30   (kc->u.wide[ 3])
+#define a40   (kc->u.wide[ 4])
+#define a01   (kc->u.wide[ 5])
+#define a11   (kc->u.wide[ 6])
+#define a21   (kc->u.wide[ 7])
+#define a31   (kc->u.wide[ 8])
+#define a41   (kc->u.wide[ 9])
+#define a02   (kc->u.wide[10])
+#define a12   (kc->u.wide[11])
+#define a22   (kc->u.wide[12])
+#define a32   (kc->u.wide[13])
+#define a42   (kc->u.wide[14])
+#define a03   (kc->u.wide[15])
+#define a13   (kc->u.wide[16])
+#define a23   (kc->u.wide[17])
+#define a33   (kc->u.wide[18])
+#define a43   (kc->u.wide[19])
+#define a04   (kc->u.wide[20])
+#define a14   (kc->u.wide[21])
+#define a24   (kc->u.wide[22])
+#define a34   (kc->u.wide[23])
+#define a44   (kc->u.wide[24])
+
+#define DECL_STATE	/* empty in no-copy mode: the lanes are aliases of kc->u.wide[], nothing to declare */
+#define READ_STATE(sc)	/* empty: nothing to load */
+#define WRITE_STATE(sc)	/* empty: nothing to store back */
+
+#define INPUT_BUF(size)   do { /* XOR the first 'size' bytes of buf, 8 at a time, into kc->u.wide[0..] */ \
+		size_t j; \
+		for (j = 0; j < (size); j += 8) { \
+			kc->u.wide[j >> 3] ^= sph_dec64le_aligned(buf + j); \
+		} \
+	} while (0)
+
+#define INPUT_BUF144   INPUT_BUF(144)	/* fixed-size forms of INPUT_BUF for 144-, 136-, 104- and 72-byte blocks */
+#define INPUT_BUF136   INPUT_BUF(136)
+#define INPUT_BUF104   INPUT_BUF(104)
+#define INPUT_BUF72    INPUT_BUF(72)
+
+#else
+
+#define DECL_STATE /* declares 25 local 64-bit variables a00..a44 that hold a copy of the state */ \
+	sph_u64 a00, a01, a02, a03, a04; \
+	sph_u64 a10, a11, a12, a13, a14; \
+	sph_u64 a20, a21, a22, a23, a24; \
+	sph_u64 a30, a31, a32, a33, a34; \
+	sph_u64 a40, a41, a42, a43, a44;
+
+#define READ_STATE(state)   do { /* load the locals: aXY = (state)->u.wide[X + 5 * Y] */ \
+		a00 = (state)->u.wide[ 0]; \
+		a10 = (state)->u.wide[ 1]; \
+		a20 = (state)->u.wide[ 2]; \
+		a30 = (state)->u.wide[ 3]; \
+		a40 = (state)->u.wide[ 4]; \
+		a01 = (state)->u.wide[ 5]; \
+		a11 = (state)->u.wide[ 6]; \
+		a21 = (state)->u.wide[ 7]; \
+		a31 = (state)->u.wide[ 8]; \
+		a41 = (state)->u.wide[ 9]; \
+		a02 = (state)->u.wide[10]; \
+		a12 = (state)->u.wide[11]; \
+		a22 = (state)->u.wide[12]; \
+		a32 = (state)->u.wide[13]; \
+		a42 = (state)->u.wide[14]; \
+		a03 = (state)->u.wide[15]; \
+		a13 = (state)->u.wide[16]; \
+		a23 = (state)->u.wide[17]; \
+		a33 = (state)->u.wide[18]; \
+		a43 = (state)->u.wide[19]; \
+		a04 = (state)->u.wide[20]; \
+		a14 = (state)->u.wide[21]; \
+		a24 = (state)->u.wide[22]; \
+		a34 = (state)->u.wide[23]; \
+		a44 = (state)->u.wide[24]; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { /* store the locals back: (state)->u.wide[X + 5 * Y] = aXY */ \
+		(state)->u.wide[ 0] = a00; \
+		(state)->u.wide[ 1] = a10; \
+		(state)->u.wide[ 2] = a20; \
+		(state)->u.wide[ 3] = a30; \
+		(state)->u.wide[ 4] = a40; \
+		(state)->u.wide[ 5] = a01; \
+		(state)->u.wide[ 6] = a11; \
+		(state)->u.wide[ 7] = a21; \
+		(state)->u.wide[ 8] = a31; \
+		(state)->u.wide[ 9] = a41; \
+		(state)->u.wide[10] = a02; \
+		(state)->u.wide[11] = a12; \
+		(state)->u.wide[12] = a22; \
+		(state)->u.wide[13] = a32; \
+		(state)->u.wide[14] = a42; \
+		(state)->u.wide[15] = a03; \
+		(state)->u.wide[16] = a13; \
+		(state)->u.wide[17] = a23; \
+		(state)->u.wide[18] = a33; \
+		(state)->u.wide[19] = a43; \
+		(state)->u.wide[20] = a04; \
+		(state)->u.wide[21] = a14; \
+		(state)->u.wide[22] = a24; \
+		(state)->u.wide[23] = a34; \
+		(state)->u.wide[24] = a44; \
+	} while (0)
+
+#define INPUT_BUF144   do { /* XOR 18 words (buf[0..143]) into the first 18 lanes, a00 through a23 */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+		a41 ^= sph_dec64le_aligned(buf +  72); \
+		a02 ^= sph_dec64le_aligned(buf +  80); \
+		a12 ^= sph_dec64le_aligned(buf +  88); \
+		a22 ^= sph_dec64le_aligned(buf +  96); \
+		a32 ^= sph_dec64le_aligned(buf + 104); \
+		a42 ^= sph_dec64le_aligned(buf + 112); \
+		a03 ^= sph_dec64le_aligned(buf + 120); \
+		a13 ^= sph_dec64le_aligned(buf + 128); \
+		a23 ^= sph_dec64le_aligned(buf + 136); \
+	} while (0)
+
+#define INPUT_BUF136   do { /* XOR 17 words (buf[0..135]) into the first 17 lanes, a00 through a13 */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+		a41 ^= sph_dec64le_aligned(buf +  72); \
+		a02 ^= sph_dec64le_aligned(buf +  80); \
+		a12 ^= sph_dec64le_aligned(buf +  88); \
+		a22 ^= sph_dec64le_aligned(buf +  96); \
+		a32 ^= sph_dec64le_aligned(buf + 104); \
+		a42 ^= sph_dec64le_aligned(buf + 112); \
+		a03 ^= sph_dec64le_aligned(buf + 120); \
+		a13 ^= sph_dec64le_aligned(buf + 128); \
+	} while (0)
+
+#define INPUT_BUF104   do { /* XOR 13 words (buf[0..103]) into the first 13 lanes, a00 through a22 */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+		a41 ^= sph_dec64le_aligned(buf +  72); \
+		a02 ^= sph_dec64le_aligned(buf +  80); \
+		a12 ^= sph_dec64le_aligned(buf +  88); \
+		a22 ^= sph_dec64le_aligned(buf +  96); \
+	} while (0)
+
+#define INPUT_BUF72   do { /* XOR 9 words (buf[0..71]) into the first 9 lanes, a00 through a31 */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+	} while (0)
+
+#define INPUT_BUF(lim)   do { /* XOR the first 'lim' bytes of buf into the lanes; lim is 72, 104, 136 or 144 (any other value absorbs 144) */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+		if ((lim) == 72) \
+			break; /* leaves the enclosing do/while(0) */ \
+		a41 ^= sph_dec64le_aligned(buf +  72); \
+		a02 ^= sph_dec64le_aligned(buf +  80); \
+		a12 ^= sph_dec64le_aligned(buf +  88); \
+		a22 ^= sph_dec64le_aligned(buf +  96); \
+		if ((lim) == 104) \
+			break; \
+		a32 ^= sph_dec64le_aligned(buf + 104); \
+		a42 ^= sph_dec64le_aligned(buf + 112); \
+		a03 ^= sph_dec64le_aligned(buf + 120); \
+		a13 ^= sph_dec64le_aligned(buf + 128); \
+		if ((lim) == 136) \
+			break; \
+		a23 ^= sph_dec64le_aligned(buf + 136); \
+	} while (0)
+
+#endif
+
+#define DECL64(x)        sph_u64 x	/* 64-bit build: each word operation below is a single C expression */
+#define MOV64(d, s)      (d = s)	/* arguments are not parenthesised: pass simple operands only */
+#define XOR64(d, a, b)   (d = a ^ b)
+#define AND64(d, a, b)   (d = a & b)
+#define OR64(d, a, b)    (d = a | b)
+#define NOT64(d, s)      (d = SPH_T64(~s))
+#define ROL64(d, v, n)   (d = SPH_ROTL64(v, n))
+#define XOR64_IOTA       XOR64	/* in this build identical to XOR64 */
+
+#else
+
+static const struct {	/* 24 round constants stored as two 32-bit halves (32-bit implementation) */
+	sph_u32 high, low;
+} RC[] = {
+#if SPH_KECCAK_INTERLEAVE	/* interleaved form: odd-numbered bits in .high, even-numbered bits in .low */
+	{ SPH_C32(0x00000000), SPH_C32(0x00000001) },
+	{ SPH_C32(0x00000089), SPH_C32(0x00000000) },
+	{ SPH_C32(0x8000008B), SPH_C32(0x00000000) },
+	{ SPH_C32(0x80008080), SPH_C32(0x00000000) },
+	{ SPH_C32(0x0000008B), SPH_C32(0x00000001) },
+	{ SPH_C32(0x00008000), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80008088), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80000082), SPH_C32(0x00000001) },
+	{ SPH_C32(0x0000000B), SPH_C32(0x00000000) },
+	{ SPH_C32(0x0000000A), SPH_C32(0x00000000) },
+	{ SPH_C32(0x00008082), SPH_C32(0x00000001) },
+	{ SPH_C32(0x00008003), SPH_C32(0x00000000) },
+	{ SPH_C32(0x0000808B), SPH_C32(0x00000001) },
+	{ SPH_C32(0x8000000B), SPH_C32(0x00000001) },
+	{ SPH_C32(0x8000008A), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80000081), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80000081), SPH_C32(0x00000000) },
+	{ SPH_C32(0x80000008), SPH_C32(0x00000000) },
+	{ SPH_C32(0x00000083), SPH_C32(0x00000000) },
+	{ SPH_C32(0x80008003), SPH_C32(0x00000000) },
+	{ SPH_C32(0x80008088), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80000088), SPH_C32(0x00000000) },
+	{ SPH_C32(0x00008000), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80008082), SPH_C32(0x00000000) }
+#else	/* plain form: upper 32 bits in .high, lower 32 bits in .low */
+	{ SPH_C32(0x00000000), SPH_C32(0x00000001) },
+	{ SPH_C32(0x00000000), SPH_C32(0x00008082) },
+	{ SPH_C32(0x80000000), SPH_C32(0x0000808A) },
+	{ SPH_C32(0x80000000), SPH_C32(0x80008000) },
+	{ SPH_C32(0x00000000), SPH_C32(0x0000808B) },
+	{ SPH_C32(0x00000000), SPH_C32(0x80000001) },
+	{ SPH_C32(0x80000000), SPH_C32(0x80008081) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008009) },
+	{ SPH_C32(0x00000000), SPH_C32(0x0000008A) },
+	{ SPH_C32(0x00000000), SPH_C32(0x00000088) },
+	{ SPH_C32(0x00000000), SPH_C32(0x80008009) },
+	{ SPH_C32(0x00000000), SPH_C32(0x8000000A) },
+	{ SPH_C32(0x00000000), SPH_C32(0x8000808B) },
+	{ SPH_C32(0x80000000), SPH_C32(0x0000008B) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008089) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008003) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008002) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00000080) },
+	{ SPH_C32(0x00000000), SPH_C32(0x0000800A) },
+	{ SPH_C32(0x80000000), SPH_C32(0x8000000A) },
+	{ SPH_C32(0x80000000), SPH_C32(0x80008081) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008080) },
+	{ SPH_C32(0x00000000), SPH_C32(0x80000001) },
+	{ SPH_C32(0x80000000), SPH_C32(0x80008008) }
+#endif
+};
+
+#if SPH_KECCAK_INTERLEAVE
+
+#define INTERLEAVE(xl, xh)   do { /* in place: even-numbered bits of the 64-bit value xh:xl end up in xl, odd-numbered bits in xh */ \
+		sph_u32 l, h, t; /* NB: the macro arguments must not be named l, h or t */ \
+		l = (xl); h = (xh); \
+		t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \
+		t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \
+		t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \
+		t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \
+		t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \
+		t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \
+		t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \
+		t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \
+		t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \
+		l ^= t; h ^= t >> 16; \
+		(xl) = l; (xh) = h; \
+	} while (0)
+
+#define UNINTERLEAVE(xl, xh)   do { /* inverse of INTERLEAVE: the same bit-swap steps applied in reverse order */ \
+		sph_u32 l, h, t; /* NB: the macro arguments must not be named l, h or t */ \
+		l = (xl); h = (xh); \
+		t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \
+		l ^= t; h ^= t >> 16; \
+		t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \
+		t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \
+		t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \
+		t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \
+		t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \
+		t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \
+		t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \
+		t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \
+		(xl) = l; (xh) = h; \
+	} while (0)
+
+#else
+
+#define INTERLEAVE(l, h)	/* interleaving disabled: expands to nothing, words stay as plain low/high halves */
+#define UNINTERLEAVE(l, h)	/* interleaving disabled: expands to nothing */
+
+#endif
+
+#if SPH_KECCAK_NOCOPY
+
+#define a00l   (kc->u.narrow[2 *  0 + 0])	/* no-copy mode: aXYl / aXYh alias kc->u.narrow[2 * (X + 5 * Y) + 0 / + 1]; needs 'kc' in scope */
+#define a00h   (kc->u.narrow[2 *  0 + 1])
+#define a10l   (kc->u.narrow[2 *  1 + 0])
+#define a10h   (kc->u.narrow[2 *  1 + 1])
+#define a20l   (kc->u.narrow[2 *  2 + 0])
+#define a20h   (kc->u.narrow[2 *  2 + 1])
+#define a30l   (kc->u.narrow[2 *  3 + 0])
+#define a30h   (kc->u.narrow[2 *  3 + 1])
+#define a40l   (kc->u.narrow[2 *  4 + 0])
+#define a40h   (kc->u.narrow[2 *  4 + 1])
+#define a01l   (kc->u.narrow[2 *  5 + 0])
+#define a01h   (kc->u.narrow[2 *  5 + 1])
+#define a11l   (kc->u.narrow[2 *  6 + 0])
+#define a11h   (kc->u.narrow[2 *  6 + 1])
+#define a21l   (kc->u.narrow[2 *  7 + 0])
+#define a21h   (kc->u.narrow[2 *  7 + 1])
+#define a31l   (kc->u.narrow[2 *  8 + 0])
+#define a31h   (kc->u.narrow[2 *  8 + 1])
+#define a41l   (kc->u.narrow[2 *  9 + 0])
+#define a41h   (kc->u.narrow[2 *  9 + 1])
+#define a02l   (kc->u.narrow[2 * 10 + 0])
+#define a02h   (kc->u.narrow[2 * 10 + 1])
+#define a12l   (kc->u.narrow[2 * 11 + 0])
+#define a12h   (kc->u.narrow[2 * 11 + 1])
+#define a22l   (kc->u.narrow[2 * 12 + 0])
+#define a22h   (kc->u.narrow[2 * 12 + 1])
+#define a32l   (kc->u.narrow[2 * 13 + 0])
+#define a32h   (kc->u.narrow[2 * 13 + 1])
+#define a42l   (kc->u.narrow[2 * 14 + 0])
+#define a42h   (kc->u.narrow[2 * 14 + 1])
+#define a03l   (kc->u.narrow[2 * 15 + 0])
+#define a03h   (kc->u.narrow[2 * 15 + 1])
+#define a13l   (kc->u.narrow[2 * 16 + 0])
+#define a13h   (kc->u.narrow[2 * 16 + 1])
+#define a23l   (kc->u.narrow[2 * 17 + 0])
+#define a23h   (kc->u.narrow[2 * 17 + 1])
+#define a33l   (kc->u.narrow[2 * 18 + 0])
+#define a33h   (kc->u.narrow[2 * 18 + 1])
+#define a43l   (kc->u.narrow[2 * 19 + 0])
+#define a43h   (kc->u.narrow[2 * 19 + 1])
+#define a04l   (kc->u.narrow[2 * 20 + 0])
+#define a04h   (kc->u.narrow[2 * 20 + 1])
+#define a14l   (kc->u.narrow[2 * 21 + 0])
+#define a14h   (kc->u.narrow[2 * 21 + 1])
+#define a24l   (kc->u.narrow[2 * 22 + 0])
+#define a24h   (kc->u.narrow[2 * 22 + 1])
+#define a34l   (kc->u.narrow[2 * 23 + 0])
+#define a34h   (kc->u.narrow[2 * 23 + 1])
+#define a44l   (kc->u.narrow[2 * 24 + 0])
+#define a44h   (kc->u.narrow[2 * 24 + 1])
+
+#define DECL_STATE	/* empty in no-copy mode: the lanes are aliases of kc->u.narrow[], nothing to declare */
+#define READ_STATE(state)	/* empty: nothing to load */
+#define WRITE_STATE(state)	/* empty: nothing to store back */
+
+#define INPUT_BUF(size)   do { /* XOR the first 'size' bytes of buf into kc->u.narrow[], 8 bytes (two 32-bit words) per step */ \
+		size_t j; \
+		for (j = 0; j < (size); j += 8) { \
+			sph_u32 tl, th; \
+			tl = sph_dec32le_aligned(buf + j + 0); \
+			th = sph_dec32le_aligned(buf + j + 4); \
+			INTERLEAVE(tl, th); /* expands to nothing when interleaving is disabled */ \
+			kc->u.narrow[(j >> 2) + 0] ^= tl; \
+			kc->u.narrow[(j >> 2) + 1] ^= th; \
+		} \
+	} while (0)
+
+#define INPUT_BUF144   INPUT_BUF(144)	/* fixed-size forms of INPUT_BUF for 144-, 136-, 104- and 72-byte blocks */
+#define INPUT_BUF136   INPUT_BUF(136)
+#define INPUT_BUF104   INPUT_BUF(104)
+#define INPUT_BUF72    INPUT_BUF(72)
+
+#else
+
+#define DECL_STATE /* declares 50 local 32-bit variables: a low (l) and a high (h) half for each lane a00..a44 */ \
+	sph_u32 a00l, a00h, a01l, a01h, a02l, a02h, a03l, a03h, a04l, a04h; \
+	sph_u32 a10l, a10h, a11l, a11h, a12l, a12h, a13l, a13h, a14l, a14h; \
+	sph_u32 a20l, a20h, a21l, a21h, a22l, a22h, a23l, a23h, a24l, a24h; \
+	sph_u32 a30l, a30h, a31l, a31h, a32l, a32h, a33l, a33h, a34l, a34h; \
+	sph_u32 a40l, a40h, a41l, a41h, a42l, a42h, a43l, a43h, a44l, a44h;
+
+#define READ_STATE(state)   do { /* load the locals: aXYl / aXYh = (state)->u.narrow[2 * (X + 5 * Y) + 0 / + 1] */ \
+		a00l = (state)->u.narrow[2 *  0 + 0]; \
+		a00h = (state)->u.narrow[2 *  0 + 1]; \
+		a10l = (state)->u.narrow[2 *  1 + 0]; \
+		a10h = (state)->u.narrow[2 *  1 + 1]; \
+		a20l = (state)->u.narrow[2 *  2 + 0]; \
+		a20h = (state)->u.narrow[2 *  2 + 1]; \
+		a30l = (state)->u.narrow[2 *  3 + 0]; \
+		a30h = (state)->u.narrow[2 *  3 + 1]; \
+		a40l = (state)->u.narrow[2 *  4 + 0]; \
+		a40h = (state)->u.narrow[2 *  4 + 1]; \
+		a01l = (state)->u.narrow[2 *  5 + 0]; \
+		a01h = (state)->u.narrow[2 *  5 + 1]; \
+		a11l = (state)->u.narrow[2 *  6 + 0]; \
+		a11h = (state)->u.narrow[2 *  6 + 1]; \
+		a21l = (state)->u.narrow[2 *  7 + 0]; \
+		a21h = (state)->u.narrow[2 *  7 + 1]; \
+		a31l = (state)->u.narrow[2 *  8 + 0]; \
+		a31h = (state)->u.narrow[2 *  8 + 1]; \
+		a41l = (state)->u.narrow[2 *  9 + 0]; \
+		a41h = (state)->u.narrow[2 *  9 + 1]; \
+		a02l = (state)->u.narrow[2 * 10 + 0]; \
+		a02h = (state)->u.narrow[2 * 10 + 1]; \
+		a12l = (state)->u.narrow[2 * 11 + 0]; \
+		a12h = (state)->u.narrow[2 * 11 + 1]; \
+		a22l = (state)->u.narrow[2 * 12 + 0]; \
+		a22h = (state)->u.narrow[2 * 12 + 1]; \
+		a32l = (state)->u.narrow[2 * 13 + 0]; \
+		a32h = (state)->u.narrow[2 * 13 + 1]; \
+		a42l = (state)->u.narrow[2 * 14 + 0]; \
+		a42h = (state)->u.narrow[2 * 14 + 1]; \
+		a03l = (state)->u.narrow[2 * 15 + 0]; \
+		a03h = (state)->u.narrow[2 * 15 + 1]; \
+		a13l = (state)->u.narrow[2 * 16 + 0]; \
+		a13h = (state)->u.narrow[2 * 16 + 1]; \
+		a23l = (state)->u.narrow[2 * 17 + 0]; \
+		a23h = (state)->u.narrow[2 * 17 + 1]; \
+		a33l = (state)->u.narrow[2 * 18 + 0]; \
+		a33h = (state)->u.narrow[2 * 18 + 1]; \
+		a43l = (state)->u.narrow[2 * 19 + 0]; \
+		a43h = (state)->u.narrow[2 * 19 + 1]; \
+		a04l = (state)->u.narrow[2 * 20 + 0]; \
+		a04h = (state)->u.narrow[2 * 20 + 1]; \
+		a14l = (state)->u.narrow[2 * 21 + 0]; \
+		a14h = (state)->u.narrow[2 * 21 + 1]; \
+		a24l = (state)->u.narrow[2 * 22 + 0]; \
+		a24h = (state)->u.narrow[2 * 22 + 1]; \
+		a34l = (state)->u.narrow[2 * 23 + 0]; \
+		a34h = (state)->u.narrow[2 * 23 + 1]; \
+		a44l = (state)->u.narrow[2 * 24 + 0]; \
+		a44h = (state)->u.narrow[2 * 24 + 1]; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { /* inverse of READ_STATE: store the local lanes into (state)->u.narrow[0..49] */ \
+		(state)->u.narrow[2 *  0 + 0] = a00l; \
+		(state)->u.narrow[2 *  0 + 1] = a00h; \
+		(state)->u.narrow[2 *  1 + 0] = a10l; \
+		(state)->u.narrow[2 *  1 + 1] = a10h; \
+		(state)->u.narrow[2 *  2 + 0] = a20l; \
+		(state)->u.narrow[2 *  2 + 1] = a20h; \
+		(state)->u.narrow[2 *  3 + 0] = a30l; \
+		(state)->u.narrow[2 *  3 + 1] = a30h; \
+		(state)->u.narrow[2 *  4 + 0] = a40l; \
+		(state)->u.narrow[2 *  4 + 1] = a40h; \
+		(state)->u.narrow[2 *  5 + 0] = a01l; \
+		(state)->u.narrow[2 *  5 + 1] = a01h; \
+		(state)->u.narrow[2 *  6 + 0] = a11l; \
+		(state)->u.narrow[2 *  6 + 1] = a11h; \
+		(state)->u.narrow[2 *  7 + 0] = a21l; \
+		(state)->u.narrow[2 *  7 + 1] = a21h; \
+		(state)->u.narrow[2 *  8 + 0] = a31l; \
+		(state)->u.narrow[2 *  8 + 1] = a31h; \
+		(state)->u.narrow[2 *  9 + 0] = a41l; \
+		(state)->u.narrow[2 *  9 + 1] = a41h; \
+		(state)->u.narrow[2 * 10 + 0] = a02l; \
+		(state)->u.narrow[2 * 10 + 1] = a02h; \
+		(state)->u.narrow[2 * 11 + 0] = a12l; \
+		(state)->u.narrow[2 * 11 + 1] = a12h; \
+		(state)->u.narrow[2 * 12 + 0] = a22l; \
+		(state)->u.narrow[2 * 12 + 1] = a22h; \
+		(state)->u.narrow[2 * 13 + 0] = a32l; \
+		(state)->u.narrow[2 * 13 + 1] = a32h; \
+		(state)->u.narrow[2 * 14 + 0] = a42l; \
+		(state)->u.narrow[2 * 14 + 1] = a42h; \
+		(state)->u.narrow[2 * 15 + 0] = a03l; \
+		(state)->u.narrow[2 * 15 + 1] = a03h; \
+		(state)->u.narrow[2 * 16 + 0] = a13l; \
+		(state)->u.narrow[2 * 16 + 1] = a13h; \
+		(state)->u.narrow[2 * 17 + 0] = a23l; \
+		(state)->u.narrow[2 * 17 + 1] = a23h; \
+		(state)->u.narrow[2 * 18 + 0] = a33l; \
+		(state)->u.narrow[2 * 18 + 1] = a33h; \
+		(state)->u.narrow[2 * 19 + 0] = a43l; \
+		(state)->u.narrow[2 * 19 + 1] = a43h; \
+		(state)->u.narrow[2 * 20 + 0] = a04l; \
+		(state)->u.narrow[2 * 20 + 1] = a04h; \
+		(state)->u.narrow[2 * 21 + 0] = a14l; \
+		(state)->u.narrow[2 * 21 + 1] = a14h; \
+		(state)->u.narrow[2 * 22 + 0] = a24l; \
+		(state)->u.narrow[2 * 22 + 1] = a24h; \
+		(state)->u.narrow[2 * 23 + 0] = a34l; \
+		(state)->u.narrow[2 * 23 + 1] = a34h; \
+		(state)->u.narrow[2 * 24 + 0] = a44l; \
+		(state)->u.narrow[2 * 24 + 1] = a44h; \
+	} while (0)
+
+#define READ64(d, off)   do { /* XOR the 8 bytes at buf + off into lane d; needs a local named "buf" */ \
+		sph_u32 tl, th; \
+		tl = sph_dec32le_aligned(buf + (off)); /* first 4 bytes -> low word */ \
+		th = sph_dec32le_aligned(buf + (off) + 4); /* next 4 bytes -> high word */ \
+		INTERLEAVE(tl, th); /* defined earlier in the file */ \
+		d ## l ^= tl; \
+		d ## h ^= th; \
+	} while (0)
+
+#define INPUT_BUF144   do { /* XOR buf[0..143] into the first 18 lanes (memory order a00, a10, ...) */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+		READ64(a41,  72); \
+		READ64(a02,  80); \
+		READ64(a12,  88); \
+		READ64(a22,  96); \
+		READ64(a32, 104); \
+		READ64(a42, 112); \
+		READ64(a03, 120); \
+		READ64(a13, 128); \
+		READ64(a23, 136); \
+	} while (0)
+
+#define INPUT_BUF136   do { /* XOR buf[0..135] into the first 17 lanes */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+		READ64(a41,  72); \
+		READ64(a02,  80); \
+		READ64(a12,  88); \
+		READ64(a22,  96); \
+		READ64(a32, 104); \
+		READ64(a42, 112); \
+		READ64(a03, 120); \
+		READ64(a13, 128); \
+	} while (0)
+
+#define INPUT_BUF104   do { /* XOR buf[0..103] into the first 13 lanes */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+		READ64(a41,  72); \
+		READ64(a02,  80); \
+		READ64(a12,  88); \
+		READ64(a22,  96); \
+	} while (0)
+
+#define INPUT_BUF72   do { /* XOR buf[0..71] into the first 9 lanes */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+	} while (0)
+
+#define INPUT_BUF(lim)   do { /* run-time selected block absorb; lim is 72, 104, 136, else treated as 144 */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+		if ((lim) == 72) \
+			break; /* leaves the enclosing do { } while (0) */ \
+		READ64(a41,  72); \
+		READ64(a02,  80); \
+		READ64(a12,  88); \
+		READ64(a22,  96); \
+		if ((lim) == 104) \
+			break; \
+		READ64(a32, 104); \
+		READ64(a42, 112); \
+		READ64(a03, 120); \
+		READ64(a13, 128); \
+		if ((lim) == 136) \
+			break; \
+		READ64(a23, 136); \
+	} while (0)
+
+#endif
+
+#define DECL64(x)        sph_u32 x ## l, x ## h   /* temporary lane as two 32-bit halves, same word type as the state lanes */
+#define MOV64(d, s)      (d ## l = s ## l, d ## h = s ## h)   /* d = s */
+#define XOR64(d, a, b)   (d ## l = a ## l ^ b ## l, d ## h = a ## h ^ b ## h)   /* d = a ^ b */
+#define AND64(d, a, b)   (d ## l = a ## l & b ## l, d ## h = a ## h & b ## h)   /* d = a & b */
+#define OR64(d, a, b)    (d ## l = a ## l | b ## l, d ## h = a ## h | b ## h)   /* d = a | b */
+#define NOT64(d, s)      (d ## l = SPH_T32(~s ## l), d ## h = SPH_T32(~s ## h))   /* d = ~s, each half passed through SPH_T32 */
+#define ROL64(d, v, n)   ROL64_ ## n(d, v)   /* pastes n into a macro name, so n must be a literal 0..63 */
+
+#if SPH_KECCAK_INTERLEAVE
+
+#define ROL64_odd1(d, v)   do { /* backs ROL64_1 when SPH_KECCAK_INTERLEAVE is set */ \
+		sph_u32 tmp; \
+		tmp = v ## l; /* saved first so that d and v may be the same lane */ \
+		d ## l = SPH_T32(v ## h << 1) | (v ## h >> 31); /* new l = old h rotated left by 1 */ \
+		d ## h = tmp; /* new h = old l, unrotated */ \
+	} while (0)
+
+#define ROL64_odd63(d, v)   do { /* backs ROL64_63 when SPH_KECCAK_INTERLEAVE is set */ \
+		sph_u32 tmp; \
+		tmp = SPH_T32(v ## l << 31) | (v ## l >> 1); /* old l rotated left by 31 */ \
+		d ## l = v ## h; /* new l = old h, unrotated */ \
+		d ## h = tmp; \
+	} while (0)
+
+#define ROL64_odd(d, v, n)   do { /* backs ROL64_(2n-1) for n = 2..31 in the table below */ \
+		sph_u32 tmp; \
+		tmp = SPH_T32(v ## l << (n - 1)) | (v ## l >> (33 - n)); /* old l rotated left by n - 1 */ \
+		d ## l = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); /* new l = old h rotated left by n */ \
+		d ## h = tmp; \
+	} while (0)
+
+#define ROL64_even(d, v, n)   do { /* backs ROL64_(2n) for n = 1..31: both halves rotate left by n */ \
+		d ## l = SPH_T32(v ## l << n) | (v ## l >> (32 - n)); \
+		d ## h = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \
+	} while (0)
+
+#define ROL64_0(d, v)   /* empty; RHO keeps its ROL64(b00, b00, 0) commented out */
+#define ROL64_1(d, v)    ROL64_odd1(d, v)
+#define ROL64_2(d, v)    ROL64_even(d, v,  1)   /* even count 2k -> ROL64_even(k) */
+#define ROL64_3(d, v)    ROL64_odd( d, v,  2)   /* odd count 2k-1 -> ROL64_odd(k) */
+#define ROL64_4(d, v)    ROL64_even(d, v,  2)
+#define ROL64_5(d, v)    ROL64_odd( d, v,  3)
+#define ROL64_6(d, v)    ROL64_even(d, v,  3)
+#define ROL64_7(d, v)    ROL64_odd( d, v,  4)
+#define ROL64_8(d, v)    ROL64_even(d, v,  4)
+#define ROL64_9(d, v)    ROL64_odd( d, v,  5)
+#define ROL64_10(d, v)   ROL64_even(d, v,  5)
+#define ROL64_11(d, v)   ROL64_odd( d, v,  6)
+#define ROL64_12(d, v)   ROL64_even(d, v,  6)
+#define ROL64_13(d, v)   ROL64_odd( d, v,  7)
+#define ROL64_14(d, v)   ROL64_even(d, v,  7)
+#define ROL64_15(d, v)   ROL64_odd( d, v,  8)
+#define ROL64_16(d, v)   ROL64_even(d, v,  8)
+#define ROL64_17(d, v)   ROL64_odd( d, v,  9)
+#define ROL64_18(d, v)   ROL64_even(d, v,  9)
+#define ROL64_19(d, v)   ROL64_odd( d, v, 10)
+#define ROL64_20(d, v)   ROL64_even(d, v, 10)
+#define ROL64_21(d, v)   ROL64_odd( d, v, 11)
+#define ROL64_22(d, v)   ROL64_even(d, v, 11)
+#define ROL64_23(d, v)   ROL64_odd( d, v, 12)
+#define ROL64_24(d, v)   ROL64_even(d, v, 12)
+#define ROL64_25(d, v)   ROL64_odd( d, v, 13)
+#define ROL64_26(d, v)   ROL64_even(d, v, 13)
+#define ROL64_27(d, v)   ROL64_odd( d, v, 14)
+#define ROL64_28(d, v)   ROL64_even(d, v, 14)
+#define ROL64_29(d, v)   ROL64_odd( d, v, 15)
+#define ROL64_30(d, v)   ROL64_even(d, v, 15)
+#define ROL64_31(d, v)   ROL64_odd( d, v, 16)
+#define ROL64_32(d, v)   ROL64_even(d, v, 16)
+#define ROL64_33(d, v)   ROL64_odd( d, v, 17)
+#define ROL64_34(d, v)   ROL64_even(d, v, 17)
+#define ROL64_35(d, v)   ROL64_odd( d, v, 18)
+#define ROL64_36(d, v)   ROL64_even(d, v, 18)
+#define ROL64_37(d, v)   ROL64_odd( d, v, 19)
+#define ROL64_38(d, v)   ROL64_even(d, v, 19)
+#define ROL64_39(d, v)   ROL64_odd( d, v, 20)
+#define ROL64_40(d, v)   ROL64_even(d, v, 20)
+#define ROL64_41(d, v)   ROL64_odd( d, v, 21)
+#define ROL64_42(d, v)   ROL64_even(d, v, 21)
+#define ROL64_43(d, v)   ROL64_odd( d, v, 22)
+#define ROL64_44(d, v)   ROL64_even(d, v, 22)
+#define ROL64_45(d, v)   ROL64_odd( d, v, 23)
+#define ROL64_46(d, v)   ROL64_even(d, v, 23)
+#define ROL64_47(d, v)   ROL64_odd( d, v, 24)
+#define ROL64_48(d, v)   ROL64_even(d, v, 24)
+#define ROL64_49(d, v)   ROL64_odd( d, v, 25)
+#define ROL64_50(d, v)   ROL64_even(d, v, 25)
+#define ROL64_51(d, v)   ROL64_odd( d, v, 26)
+#define ROL64_52(d, v)   ROL64_even(d, v, 26)
+#define ROL64_53(d, v)   ROL64_odd( d, v, 27)
+#define ROL64_54(d, v)   ROL64_even(d, v, 27)
+#define ROL64_55(d, v)   ROL64_odd( d, v, 28)
+#define ROL64_56(d, v)   ROL64_even(d, v, 28)
+#define ROL64_57(d, v)   ROL64_odd( d, v, 29)
+#define ROL64_58(d, v)   ROL64_even(d, v, 29)
+#define ROL64_59(d, v)   ROL64_odd( d, v, 30)
+#define ROL64_60(d, v)   ROL64_even(d, v, 30)
+#define ROL64_61(d, v)   ROL64_odd( d, v, 31)
+#define ROL64_62(d, v)   ROL64_even(d, v, 31)
+#define ROL64_63(d, v)   ROL64_odd63(d, v)
+
+#else
+
+#define ROL64_small(d, v, n)   do { /* d = v rotated left by n as a 64-bit (h:l) pair; used with n = 1..31 */ \
+		sph_u32 tmp; \
+		tmp = SPH_T32(v ## l << n) | (v ## h >> (32 - n)); /* new low word, held back so d may alias v */ \
+		d ## h = SPH_T32(v ## h << n) | (v ## l >> (32 - n)); \
+		d ## l = tmp; \
+	} while (0)
+
+#define ROL64_0(d, v)    0   /* placeholder; RHO keeps its ROL64(b00, b00, 0) commented out */
+#define ROL64_1(d, v)    ROL64_small(d, v, 1)   /* counts 1..31 map straight onto ROL64_small */
+#define ROL64_2(d, v)    ROL64_small(d, v, 2)
+#define ROL64_3(d, v)    ROL64_small(d, v, 3)
+#define ROL64_4(d, v)    ROL64_small(d, v, 4)
+#define ROL64_5(d, v)    ROL64_small(d, v, 5)
+#define ROL64_6(d, v)    ROL64_small(d, v, 6)
+#define ROL64_7(d, v)    ROL64_small(d, v, 7)
+#define ROL64_8(d, v)    ROL64_small(d, v, 8)
+#define ROL64_9(d, v)    ROL64_small(d, v, 9)
+#define ROL64_10(d, v)   ROL64_small(d, v, 10)
+#define ROL64_11(d, v)   ROL64_small(d, v, 11)
+#define ROL64_12(d, v)   ROL64_small(d, v, 12)
+#define ROL64_13(d, v)   ROL64_small(d, v, 13)
+#define ROL64_14(d, v)   ROL64_small(d, v, 14)
+#define ROL64_15(d, v)   ROL64_small(d, v, 15)
+#define ROL64_16(d, v)   ROL64_small(d, v, 16)
+#define ROL64_17(d, v)   ROL64_small(d, v, 17)
+#define ROL64_18(d, v)   ROL64_small(d, v, 18)
+#define ROL64_19(d, v)   ROL64_small(d, v, 19)
+#define ROL64_20(d, v)   ROL64_small(d, v, 20)
+#define ROL64_21(d, v)   ROL64_small(d, v, 21)
+#define ROL64_22(d, v)   ROL64_small(d, v, 22)
+#define ROL64_23(d, v)   ROL64_small(d, v, 23)
+#define ROL64_24(d, v)   ROL64_small(d, v, 24)
+#define ROL64_25(d, v)   ROL64_small(d, v, 25)
+#define ROL64_26(d, v)   ROL64_small(d, v, 26)
+#define ROL64_27(d, v)   ROL64_small(d, v, 27)
+#define ROL64_28(d, v)   ROL64_small(d, v, 28)
+#define ROL64_29(d, v)   ROL64_small(d, v, 29)
+#define ROL64_30(d, v)   ROL64_small(d, v, 30)
+#define ROL64_31(d, v)   ROL64_small(d, v, 31)
+
+#define ROL64_32(d, v)   do { /* rotation by 32 = swap of the two halves */ \
+		sph_u32 tmp; \
+		tmp = v ## l; /* saved first so that d and v may be the same lane */ \
+		d ## l = v ## h; \
+		d ## h = tmp; \
+	} while (0)
+
+#define ROL64_big(d, v, n)   do { /* rotation by 32 + n: rotate by n into tr, then swap halves */ \
+		sph_u32 trl, trh; \
+		ROL64_small(tr, v, n); \
+		d ## h = trl; \
+		d ## l = trh; \
+	} while (0)
+
+#define ROL64_33(d, v)   ROL64_big(d, v, 1)   /* counts 33..63 map onto ROL64_big(count - 32) */
+#define ROL64_34(d, v)   ROL64_big(d, v, 2)
+#define ROL64_35(d, v)   ROL64_big(d, v, 3)
+#define ROL64_36(d, v)   ROL64_big(d, v, 4)
+#define ROL64_37(d, v)   ROL64_big(d, v, 5)
+#define ROL64_38(d, v)   ROL64_big(d, v, 6)
+#define ROL64_39(d, v)   ROL64_big(d, v, 7)
+#define ROL64_40(d, v)   ROL64_big(d, v, 8)
+#define ROL64_41(d, v)   ROL64_big(d, v, 9)
+#define ROL64_42(d, v)   ROL64_big(d, v, 10)
+#define ROL64_43(d, v)   ROL64_big(d, v, 11)
+#define ROL64_44(d, v)   ROL64_big(d, v, 12)
+#define ROL64_45(d, v)   ROL64_big(d, v, 13)
+#define ROL64_46(d, v)   ROL64_big(d, v, 14)
+#define ROL64_47(d, v)   ROL64_big(d, v, 15)
+#define ROL64_48(d, v)   ROL64_big(d, v, 16)
+#define ROL64_49(d, v)   ROL64_big(d, v, 17)
+#define ROL64_50(d, v)   ROL64_big(d, v, 18)
+#define ROL64_51(d, v)   ROL64_big(d, v, 19)
+#define ROL64_52(d, v)   ROL64_big(d, v, 20)
+#define ROL64_53(d, v)   ROL64_big(d, v, 21)
+#define ROL64_54(d, v)   ROL64_big(d, v, 22)
+#define ROL64_55(d, v)   ROL64_big(d, v, 23)
+#define ROL64_56(d, v)   ROL64_big(d, v, 24)
+#define ROL64_57(d, v)   ROL64_big(d, v, 25)
+#define ROL64_58(d, v)   ROL64_big(d, v, 26)
+#define ROL64_59(d, v)   ROL64_big(d, v, 27)
+#define ROL64_60(d, v)   ROL64_big(d, v, 28)
+#define ROL64_61(d, v)   ROL64_big(d, v, 29)
+#define ROL64_62(d, v)   ROL64_big(d, v, 30)
+#define ROL64_63(d, v)   ROL64_big(d, v, 31)
+
+#endif
+
+#define XOR64_IOTA(d, s, k) /* d = s ^ k, where k is an object with members .low and .high */ \
+	(d ## l = s ## l ^ k.low, d ## h = s ## h ^ k.high)
+
+#endif
+
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { /* t = (c0^c1^c2^c3^c4) ^ ROL1(d0^d1^d2^d3^d4) */ \
+		DECL64(tt0); \
+		DECL64(tt1); \
+		DECL64(tt2); \
+		DECL64(tt3); \
+		XOR64(tt0, d0, d1); \
+		XOR64(tt1, d2, d3); \
+		XOR64(tt0, tt0, d4); \
+		XOR64(tt0, tt0, tt1); /* tt0 = XOR of the five d lanes */ \
+		ROL64(tt0, tt0, 1); \
+		XOR64(tt2, c0, c1); \
+		XOR64(tt3, c2, c3); \
+		XOR64(tt0, tt0, c4); \
+		XOR64(tt2, tt2, tt3); \
+		XOR64(t, tt0, tt2); \
+	} while (0)
+
+#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+	b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+	b40, b41, b42, b43, b44) \
+	do { /* theta step: XOR tX into all five lanes bX0..bX4 */ \
+		DECL64(t0); \
+		DECL64(t1); \
+		DECL64(t2); \
+		DECL64(t3); \
+		DECL64(t4); \
+		TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); /* tX mixes group X-1 with rotated group X+1 (mod 5) */ \
+		TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
+		TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
+		TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
+		TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
+		XOR64(b00, b00, t0); /* all five tX are computed before any lane is modified */ \
+		XOR64(b01, b01, t0); \
+		XOR64(b02, b02, t0); \
+		XOR64(b03, b03, t0); \
+		XOR64(b04, b04, t0); \
+		XOR64(b10, b10, t1); \
+		XOR64(b11, b11, t1); \
+		XOR64(b12, b12, t1); \
+		XOR64(b13, b13, t1); \
+		XOR64(b14, b14, t1); \
+		XOR64(b20, b20, t2); \
+		XOR64(b21, b21, t2); \
+		XOR64(b22, b22, t2); \
+		XOR64(b23, b23, t2); \
+		XOR64(b24, b24, t2); \
+		XOR64(b30, b30, t3); \
+		XOR64(b31, b31, t3); \
+		XOR64(b32, b32, t3); \
+		XOR64(b33, b33, t3); \
+		XOR64(b34, b34, t3); \
+		XOR64(b40, b40, t4); \
+		XOR64(b41, b41, t4); \
+		XOR64(b42, b42, t4); \
+		XOR64(b43, b43, t4); \
+		XOR64(b44, b44, t4); \
+	} while (0)
+
+#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+	b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+	b40, b41, b42, b43, b44) \
+	do { /* rho step: rotate each lane in place by its own fixed count */ \
+		/* ROL64(b00, b00,  0); */ \
+		ROL64(b01, b01, 36); \
+		ROL64(b02, b02,  3); \
+		ROL64(b03, b03, 41); \
+		ROL64(b04, b04, 18); \
+		ROL64(b10, b10,  1); \
+		ROL64(b11, b11, 44); \
+		ROL64(b12, b12, 10); \
+		ROL64(b13, b13, 45); \
+		ROL64(b14, b14,  2); \
+		ROL64(b20, b20, 62); \
+		ROL64(b21, b21,  6); \
+		ROL64(b22, b22, 43); \
+		ROL64(b23, b23, 15); \
+		ROL64(b24, b24, 61); \
+		ROL64(b30, b30, 28); \
+		ROL64(b31, b31, 55); \
+		ROL64(b32, b32, 25); \
+		ROL64(b33, b33, 21); \
+		ROL64(b34, b34, 56); \
+		ROL64(b40, b40, 27); \
+		ROL64(b41, b41, 20); \
+		ROL64(b42, b42, 39); \
+		ROL64(b43, b43,  8); \
+		ROL64(b44, b44, 14); \
+	} while (0)
+
+/*
+ * The KHI macro integrates the "lane complement" optimization. On input,
+ * some words are complemented:
+ *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
+ * On output, the following words are complemented:
+ *    a04 a10 a20 a22 a23 a31
+ *
+ * The (implicit) permutation and the theta expansion will bring back
+ * the input mask for the next round.
+ */
+
+#define KHI_XO(d, a, b, c)   do { /* d = a ^ (b | c) */ \
+		DECL64(kt); \
+		OR64(kt, b, c); \
+		XOR64(d, a, kt); \
+	} while (0)
+
+#define KHI_XA(d, a, b, c)   do { /* d = a ^ (b & c) */ \
+		DECL64(kt); \
+		AND64(kt, b, c); \
+		XOR64(d, a, kt); \
+	} while (0)
+
+#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+	b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+	b40, b41, b42, b43, b44) \
+	do { /* chi step on five groups b0Y..b4Y (Y = 0..4), with the lane-complement masks built in */ \
+		DECL64(c0); \
+		DECL64(c1); \
+		DECL64(c2); \
+		DECL64(c3); \
+		DECL64(c4); \
+		DECL64(bnn); /* scratch: complement of one lane of the current group */ \
+		NOT64(bnn, b20); \
+		KHI_XO(c0, b00, b10, b20); /* results go to c0..c4 first so every input is read before being overwritten */ \
+		KHI_XO(c1, b10, bnn, b30); \
+		KHI_XA(c2, b20, b30, b40); \
+		KHI_XO(c3, b30, b40, b00); \
+		KHI_XA(c4, b40, b00, b10); \
+		MOV64(b00, c0); \
+		MOV64(b10, c1); \
+		MOV64(b20, c2); \
+		MOV64(b30, c3); \
+		MOV64(b40, c4); \
+		NOT64(bnn, b41); /* group Y = 1 */ \
+		KHI_XO(c0, b01, b11, b21); \
+		KHI_XA(c1, b11, b21, b31); \
+		KHI_XO(c2, b21, b31, bnn); \
+		KHI_XO(c3, b31, b41, b01); \
+		KHI_XA(c4, b41, b01, b11); \
+		MOV64(b01, c0); \
+		MOV64(b11, c1); \
+		MOV64(b21, c2); \
+		MOV64(b31, c3); \
+		MOV64(b41, c4); \
+		NOT64(bnn, b32); /* group Y = 2 */ \
+		KHI_XO(c0, b02, b12, b22); \
+		KHI_XA(c1, b12, b22, b32); \
+		KHI_XA(c2, b22, bnn, b42); \
+		KHI_XO(c3, bnn, b42, b02); \
+		KHI_XA(c4, b42, b02, b12); \
+		MOV64(b02, c0); \
+		MOV64(b12, c1); \
+		MOV64(b22, c2); \
+		MOV64(b32, c3); \
+		MOV64(b42, c4); \
+		NOT64(bnn, b33); /* group Y = 3 */ \
+		KHI_XA(c0, b03, b13, b23); \
+		KHI_XO(c1, b13, b23, b33); \
+		KHI_XO(c2, b23, bnn, b43); \
+		KHI_XA(c3, bnn, b43, b03); \
+		KHI_XO(c4, b43, b03, b13); \
+		MOV64(b03, c0); \
+		MOV64(b13, c1); \
+		MOV64(b23, c2); \
+		MOV64(b33, c3); \
+		MOV64(b43, c4); \
+		NOT64(bnn, b14); /* group Y = 4 */ \
+		KHI_XA(c0, b04, bnn, b24); \
+		KHI_XO(c1, bnn, b24, b34); \
+		KHI_XA(c2, b24, b34, b44); \
+		KHI_XO(c3, b34, b44, b04); \
+		KHI_XA(c4, b44, b04, b14); \
+		MOV64(b04, c0); \
+		MOV64(b14, c1); \
+		MOV64(b24, c2); \
+		MOV64(b34, c3); \
+		MOV64(b44, c4); \
+	} while (0)
+
+#define IOTA(r)   XOR64_IOTA(a00, a00, r)   /* iota step: XOR the round constant r into lane a00 */
+
+#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
+              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44   /* identity: same order as the THETA/RHO/KHI parameters */
+#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
+              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14   /* Pn: 25-name argument list pasted in by KF_ELT; presumably the lane renaming after n rounds -- TODO confirm */
+#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
+              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
+#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
+              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
+#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
+              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
+#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
+              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
+#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
+              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
+#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
+              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
+#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
+              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
+#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
+              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
+#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
+              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
+#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
+              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
+#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
+              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
+#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
+              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
+#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
+              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
+#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
+              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
+#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
+              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
+#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
+              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
+#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
+              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
+#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
+              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
+#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
+              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
+#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
+              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
+#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
+              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
+#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
+              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
+
+#define P1_TO_P0   do { /* run after one KF_ELT(0, 1, ..): moves lanes in one 24-cycle through t; presumably restores P0 naming -- TODO confirm */ \
+		DECL64(t); \
+		MOV64(t, a01); \
+		MOV64(a01, a30); \
+		MOV64(a30, a33); \
+		MOV64(a33, a23); \
+		MOV64(a23, a12); \
+		MOV64(a12, a21); \
+		MOV64(a21, a02); \
+		MOV64(a02, a10); \
+		MOV64(a10, a11); \
+		MOV64(a11, a41); \
+		MOV64(a41, a24); \
+		MOV64(a24, a42); \
+		MOV64(a42, a04); \
+		MOV64(a04, a20); \
+		MOV64(a20, a22); \
+		MOV64(a22, a32); \
+		MOV64(a32, a43); \
+		MOV64(a43, a34); \
+		MOV64(a34, a03); \
+		MOV64(a03, a40); \
+		MOV64(a40, a44); \
+		MOV64(a44, a14); \
+		MOV64(a14, a31); \
+		MOV64(a31, a13); \
+		MOV64(a13, t); \
+	} while (0)
+
+#define P2_TO_P0   do { /* run after two rounds: two 12-cycles of lane moves; presumably restores P0 naming -- TODO confirm */ \
+		DECL64(t); \
+		MOV64(t, a01); \
+		MOV64(a01, a33); \
+		MOV64(a33, a12); \
+		MOV64(a12, a02); \
+		MOV64(a02, a11); \
+		MOV64(a11, a24); \
+		MOV64(a24, a04); \
+		MOV64(a04, a22); \
+		MOV64(a22, a43); \
+		MOV64(a43, a03); \
+		MOV64(a03, a44); \
+		MOV64(a44, a31); \
+		MOV64(a31, t); \
+		MOV64(t, a10); \
+		MOV64(a10, a41); \
+		MOV64(a41, a42); \
+		MOV64(a42, a20); \
+		MOV64(a20, a32); \
+		MOV64(a32, a34); \
+		MOV64(a34, a40); \
+		MOV64(a40, a14); \
+		MOV64(a14, a13); \
+		MOV64(a13, a30); \
+		MOV64(a30, a23); \
+		MOV64(a23, a21); \
+		MOV64(a21, t); \
+	} while (0)
+
+#define P4_TO_P0   do { /* run after four rounds: four 6-cycles of lane moves; presumably restores P0 naming -- TODO confirm */ \
+		DECL64(t); \
+		MOV64(t, a01); \
+		MOV64(a01, a12); \
+		MOV64(a12, a11); \
+		MOV64(a11, a04); \
+		MOV64(a04, a43); \
+		MOV64(a43, a44); \
+		MOV64(a44, t); \
+		MOV64(t, a02); \
+		MOV64(a02, a24); \
+		MOV64(a24, a22); \
+		MOV64(a22, a03); \
+		MOV64(a03, a31); \
+		MOV64(a31, a33); \
+		MOV64(a33, t); \
+		MOV64(t, a10); \
+		MOV64(a10, a42); \
+		MOV64(a42, a32); \
+		MOV64(a32, a40); \
+		MOV64(a40, a13); \
+		MOV64(a13, a23); \
+		MOV64(a23, t); \
+		MOV64(t, a14); \
+		MOV64(a14, a30); \
+		MOV64(a30, a21); \
+		MOV64(a21, a41); \
+		MOV64(a41, a20); \
+		MOV64(a20, a34); \
+		MOV64(a34, t); \
+	} while (0)
+
+#define P6_TO_P0   do { /* run after six rounds: six 4-cycles of lane moves; presumably restores P0 naming -- TODO confirm */ \
+		DECL64(t); \
+		MOV64(t, a01); \
+		MOV64(a01, a02); \
+		MOV64(a02, a04); \
+		MOV64(a04, a03); \
+		MOV64(a03, t); \
+		MOV64(t, a10); \
+		MOV64(a10, a20); \
+		MOV64(a20, a40); \
+		MOV64(a40, a30); \
+		MOV64(a30, t); \
+		MOV64(t, a11); \
+		MOV64(a11, a22); \
+		MOV64(a22, a44); \
+		MOV64(a44, a33); \
+		MOV64(a33, t); \
+		MOV64(t, a12); \
+		MOV64(a12, a24); \
+		MOV64(a24, a43); \
+		MOV64(a43, a31); \
+		MOV64(a31, t); \
+		MOV64(t, a13); \
+		MOV64(a13, a21); \
+		MOV64(a21, a42); \
+		MOV64(a42, a34); \
+		MOV64(a34, t); \
+		MOV64(t, a14); \
+		MOV64(a14, a23); \
+		MOV64(a23, a41); \
+		MOV64(a41, a32); \
+		MOV64(a32, t); \
+	} while (0)
+
+#define P8_TO_P0   do { /* run after eight rounds: eight 3-cycles of lane moves; presumably restores P0 naming -- TODO confirm */ \
+		DECL64(t); \
+		MOV64(t, a01); \
+		MOV64(a01, a11); \
+		MOV64(a11, a43); \
+		MOV64(a43, t); \
+		MOV64(t, a02); \
+		MOV64(a02, a22); \
+		MOV64(a22, a31); \
+		MOV64(a31, t); \
+		MOV64(t, a03); \
+		MOV64(a03, a33); \
+		MOV64(a33, a24); \
+		MOV64(a24, t); \
+		MOV64(t, a04); \
+		MOV64(a04, a44); \
+		MOV64(a44, a12); \
+		MOV64(a12, t); \
+		MOV64(t, a10); \
+		MOV64(a10, a32); \
+		MOV64(a32, a13); \
+		MOV64(a13, t); \
+		MOV64(t, a14); \
+		MOV64(a14, a21); \
+		MOV64(a21, a20); \
+		MOV64(a20, t); \
+		MOV64(t, a23); \
+		MOV64(a23, a42); \
+		MOV64(a42, a40); \
+		MOV64(a40, t); \
+		MOV64(t, a30); \
+		MOV64(a30, a41); \
+		MOV64(a41, a34); \
+		MOV64(a34, t); \
+	} while (0)
+
+#define P12_TO_P0   do { /* run after twelve rounds: twelve pairwise lane swaps; presumably restores P0 naming -- TODO confirm */ \
+		DECL64(t); \
+		MOV64(t, a01); \
+		MOV64(a01, a04); \
+		MOV64(a04, t); \
+		MOV64(t, a02); \
+		MOV64(a02, a03); \
+		MOV64(a03, t); \
+		MOV64(t, a10); \
+		MOV64(a10, a40); \
+		MOV64(a40, t); \
+		MOV64(t, a11); \
+		MOV64(a11, a44); \
+		MOV64(a44, t); \
+		MOV64(t, a12); \
+		MOV64(a12, a43); \
+		MOV64(a43, t); \
+		MOV64(t, a13); \
+		MOV64(a13, a42); \
+		MOV64(a42, t); \
+		MOV64(t, a14); \
+		MOV64(a14, a41); \
+		MOV64(a41, t); \
+		MOV64(t, a20); \
+		MOV64(a20, a30); \
+		MOV64(a30, t); \
+		MOV64(t, a21); \
+		MOV64(a21, a34); \
+		MOV64(a34, t); \
+		MOV64(t, a22); \
+		MOV64(a22, a33); \
+		MOV64(a33, t); \
+		MOV64(t, a23); \
+		MOV64(a23, a32); \
+		MOV64(a32, t); \
+		MOV64(t, a24); \
+		MOV64(a24, a31); \
+		MOV64(a31, t); \
+	} while (0)
+
+#define LPAR   (   /* parentheses as macros: KF_ELT writes "THETA LPAR Pn RPAR" so Pn becomes 25 arguments before THETA is invoked */
+#define RPAR   )
+
+#define KF_ELT(r, s, k)   do { /* one round: r and s are literal indices of P lists, k the round constant */ \
+		THETA LPAR P ## r RPAR; \
+		RHO LPAR P ## r RPAR; \
+		KHI LPAR P ## s RPAR; /* uses the next naming, P<s>; callers pass s = r + 1 (0 after 23) */ \
+		IOTA(k); \
+	} while (0)
+
+#define DO(x)   x   /* identity wrapper; presumably forces the extra rescan that the LPAR/RPAR trick needs -- TODO confirm */
+
+#define KECCAK_F_1600   DO(KECCAK_F_1600_)   /* the 24-round permutation as used by keccak_core */
+
+#if SPH_KECCAK_UNROLL == 1   /* one round per loop iteration */
+
+#define KECCAK_F_1600_   do { \
+		int j; \
+		for (j = 0; j < 24; j ++) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			P1_TO_P0; \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 2   /* two rounds per iteration */
+
+#define KECCAK_F_1600_   do { \
+		int j; \
+		for (j = 0; j < 24; j += 2) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			KF_ELT( 1,  2, RC[j + 1]); \
+			P2_TO_P0; \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 4   /* four rounds per iteration */
+
+#define KECCAK_F_1600_   do { \
+		int j; \
+		for (j = 0; j < 24; j += 4) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			KF_ELT( 1,  2, RC[j + 1]); \
+			KF_ELT( 2,  3, RC[j + 2]); \
+			KF_ELT( 3,  4, RC[j + 3]); \
+			P4_TO_P0; \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 6   /* six rounds per iteration */
+
+#define KECCAK_F_1600_   do { \
+		int j; \
+		for (j = 0; j < 24; j += 6) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			KF_ELT( 1,  2, RC[j + 1]); \
+			KF_ELT( 2,  3, RC[j + 2]); \
+			KF_ELT( 3,  4, RC[j + 3]); \
+			KF_ELT( 4,  5, RC[j + 4]); \
+			KF_ELT( 5,  6, RC[j + 5]); \
+			P6_TO_P0; \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 8   /* eight rounds per iteration */
+
+#define KECCAK_F_1600_   do { \
+		int j; \
+		for (j = 0; j < 24; j += 8) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			KF_ELT( 1,  2, RC[j + 1]); \
+			KF_ELT( 2,  3, RC[j + 2]); \
+			KF_ELT( 3,  4, RC[j + 3]); \
+			KF_ELT( 4,  5, RC[j + 4]); \
+			KF_ELT( 5,  6, RC[j + 5]); \
+			KF_ELT( 6,  7, RC[j + 6]); \
+			KF_ELT( 7,  8, RC[j + 7]); \
+			P8_TO_P0; \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 12   /* twelve rounds per iteration */
+
+#define KECCAK_F_1600_   do { \
+		int j; \
+		for (j = 0; j < 24; j += 12) { \
+			KF_ELT( 0,  1, RC[j +  0]); \
+			KF_ELT( 1,  2, RC[j +  1]); \
+			KF_ELT( 2,  3, RC[j +  2]); \
+			KF_ELT( 3,  4, RC[j +  3]); \
+			KF_ELT( 4,  5, RC[j +  4]); \
+			KF_ELT( 5,  6, RC[j +  5]); \
+			KF_ELT( 6,  7, RC[j +  6]); \
+			KF_ELT( 7,  8, RC[j +  7]); \
+			KF_ELT( 8,  9, RC[j +  8]); \
+			KF_ELT( 9, 10, RC[j +  9]); \
+			KF_ELT(10, 11, RC[j + 10]); \
+			KF_ELT(11, 12, RC[j + 11]); \
+			P12_TO_P0; \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 0   /* fully unrolled: 24 rounds, no loop and no Pn_TO_P0 step */
+
+#define KECCAK_F_1600_   do { \
+		KF_ELT( 0,  1, RC[ 0]); \
+		KF_ELT( 1,  2, RC[ 1]); \
+		KF_ELT( 2,  3, RC[ 2]); \
+		KF_ELT( 3,  4, RC[ 3]); \
+		KF_ELT( 4,  5, RC[ 4]); \
+		KF_ELT( 5,  6, RC[ 5]); \
+		KF_ELT( 6,  7, RC[ 6]); \
+		KF_ELT( 7,  8, RC[ 7]); \
+		KF_ELT( 8,  9, RC[ 8]); \
+		KF_ELT( 9, 10, RC[ 9]); \
+		KF_ELT(10, 11, RC[10]); \
+		KF_ELT(11, 12, RC[11]); \
+		KF_ELT(12, 13, RC[12]); \
+		KF_ELT(13, 14, RC[13]); \
+		KF_ELT(14, 15, RC[14]); \
+		KF_ELT(15, 16, RC[15]); \
+		KF_ELT(16, 17, RC[16]); \
+		KF_ELT(17, 18, RC[17]); \
+		KF_ELT(18, 19, RC[18]); \
+		KF_ELT(19, 20, RC[19]); \
+		KF_ELT(20, 21, RC[20]); \
+		KF_ELT(21, 22, RC[21]); \
+		KF_ELT(22, 23, RC[22]); \
+		KF_ELT(23,  0, RC[23]); /* last KHI already uses P0 */ \
+	} while (0)
+
+#else
+
+#error Unimplemented unroll count for Keccak.
+
+#endif
+
+/*
+ * Put a Keccak context back into its empty-message state.
+ *
+ *   kc        context to (re)initialize
+ *   out_size  digest length in bits (this file passes 224/256/384/512)
+ *
+ * Every lane is cleared, then the six lanes named in comp_lanes are set
+ * to all-ones. That is the start mask of the "lane complement" scheme
+ * that KHI relies on; the close functions flip the same lanes back.
+ */
+static void
+keccak_init(sph_keccak_context *kc, unsigned out_size)
+{
+	/* Lane numbers (0..24) that start out complemented. */
+	static const unsigned char comp_lanes[6] = { 1, 2, 8, 12, 17, 20 };
+	int k;
+
+#if SPH_KECCAK_64
+	/* One 64-bit word per lane. */
+	for (k = 0; k < 25; k ++)
+		kc->u.wide[k] = 0;
+	for (k = 0; k < 6; k ++)
+		kc->u.wide[comp_lanes[k]] = SPH_C64(0xFFFFFFFFFFFFFFFF);
+#else
+	/*
+	 * Two 32-bit words per lane. An all-ones lane looks the same with
+	 * or without bit interleaving, so no INTERLEAVE call is needed.
+	 */
+	for (k = 0; k < 50; k ++)
+		kc->u.narrow[k] = 0;
+	for (k = 0; k < 6; k ++) {
+		int w = 2 * comp_lanes[k];
+
+		kc->u.narrow[w + 0] = SPH_C32(0xFFFFFFFF);
+		kc->u.narrow[w + 1] = SPH_C32(0xFFFFFFFF);
+	}
+#endif
+
+	kc->ptr = 0;
+	/* Equals 200 - 2 * (out_size / 8) for the sizes used here. */
+	kc->lim = 200 - (out_size >> 2);
+}
+
+/* Absorb len bytes from data into kc; lim is the block size in bytes. */
+static void
+keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE
+	/* Empty input: nothing to do, and memcpy() must not see NULL. */
+	if (len == 0)
+		return;
+	buf = kc->buf;	/* INPUT_BUF refers to this local by name */
+	ptr = kc->ptr;	/* bytes already waiting in buf */
+	if (len < (lim - ptr)) {
+		/* Block stays incomplete: buffer the bytes, leave the state. */
+		memcpy(buf + ptr, data, len);
+		kc->ptr = ptr + len;
+		return;
+	}
+	READ_STATE(kc);
+	while (len > 0) {
+		size_t clen = lim - ptr;	/* room left in the block */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == lim) {
+			INPUT_BUF(lim);	/* XOR the full block into the state */
+			KECCAK_F_1600;	/* then permute */
+			ptr = 0;
+		}
+	}
+	WRITE_STATE(kc);
+	kc->ptr = ptr;
+}
+
+#if SPH_KECCAK_64
+
+#define DEFCLOSE(d, lim) /* 64-bit state: defines keccak_close<d>; d = digest bytes, lim = block bytes */ \
+	static void keccak_close ## d( \
+		sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \
+	{ \
+		unsigned eb; \
+		union { \
+			unsigned char tmp[lim + 1]; \
+			sph_u64 dummy;   /* for alignment */ \
+		} u; \
+		size_t j; \
+ \
+		eb = (0x100 | (ub & 0xFF)) >> (8 - n); /* top n bits of ub in the low n bits, with a 1 bit just above them */ \
+		if (kc->ptr == (lim - 1)) { /* exactly one byte left in the current block */ \
+			if (n == 7) { \
+				u.tmp[0] = eb; /* eb fills the byte, so the final 0x80 needs one more block */ \
+				memset(u.tmp + 1, 0, lim - 1); \
+				u.tmp[lim] = 0x80; \
+				j = 1 + lim; \
+			} else { \
+				u.tmp[0] = eb | 0x80; /* both padding marks fit in the one byte */ \
+				j = 1; \
+			} \
+		} else { \
+			j = lim - kc->ptr; /* pad to the end of the block: eb, zeros, 0x80 */ \
+			u.tmp[0] = eb; \
+			memset(u.tmp + 1, 0, j - 2); \
+			u.tmp[j - 1] = 0x80; \
+		} \
+		keccak_core(kc, u.tmp, j, lim); \
+		/* Finalize the "lane complement" */ \
+		kc->u.wide[ 1] = ~kc->u.wide[ 1]; \
+		kc->u.wide[ 2] = ~kc->u.wide[ 2]; \
+		kc->u.wide[ 8] = ~kc->u.wide[ 8]; \
+		kc->u.wide[12] = ~kc->u.wide[12]; \
+		kc->u.wide[17] = ~kc->u.wide[17]; \
+		kc->u.wide[20] = ~kc->u.wide[20]; \
+		for (j = 0; j < d; j += 8) /* encode whole 8-byte words into tmp; only d bytes are copied out */ \
+			sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \
+		memcpy(dst, u.tmp, d); \
+		keccak_init(kc, (unsigned)d << 3); /* leave the context ready for a new message */ \
+	} \
+
+#else
+
+#define DEFCLOSE(d, lim) /* 32-bit state: defines keccak_close<d>; d = digest bytes, lim = block bytes */ \
+	static void keccak_close ## d( \
+		sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \
+	{ \
+		unsigned eb; \
+		union { \
+			unsigned char tmp[lim + 1]; \
+			sph_u64 dummy;   /* for alignment */ \
+		} u; \
+		size_t j; \
+ \
+		eb = (0x100 | (ub & 0xFF)) >> (8 - n); /* top n bits of ub in the low n bits, with a 1 bit just above them */ \
+		if (kc->ptr == (lim - 1)) { /* exactly one byte left in the current block */ \
+			if (n == 7) { \
+				u.tmp[0] = eb; /* eb fills the byte, so the final 0x80 needs one more block */ \
+				memset(u.tmp + 1, 0, lim - 1); \
+				u.tmp[lim] = 0x80; \
+				j = 1 + lim; \
+			} else { \
+				u.tmp[0] = eb | 0x80; /* both padding marks fit in the one byte */ \
+				j = 1; \
+			} \
+		} else { \
+			j = lim - kc->ptr; /* pad to the end of the block: eb, zeros, 0x80 */ \
+			u.tmp[0] = eb; \
+			memset(u.tmp + 1, 0, j - 2); \
+			u.tmp[j - 1] = 0x80; \
+		} \
+		keccak_core(kc, u.tmp, j, lim); \
+		/* Finalize the "lane complement" */ \
+		kc->u.narrow[ 2] = ~kc->u.narrow[ 2]; \
+		kc->u.narrow[ 3] = ~kc->u.narrow[ 3]; \
+		kc->u.narrow[ 4] = ~kc->u.narrow[ 4]; \
+		kc->u.narrow[ 5] = ~kc->u.narrow[ 5]; \
+		kc->u.narrow[16] = ~kc->u.narrow[16]; \
+		kc->u.narrow[17] = ~kc->u.narrow[17]; \
+		kc->u.narrow[24] = ~kc->u.narrow[24]; \
+		kc->u.narrow[25] = ~kc->u.narrow[25]; \
+		kc->u.narrow[34] = ~kc->u.narrow[34]; \
+		kc->u.narrow[35] = ~kc->u.narrow[35]; \
+		kc->u.narrow[40] = ~kc->u.narrow[40]; \
+		kc->u.narrow[41] = ~kc->u.narrow[41]; \
+		/* un-interleave */ \
+		for (j = 0; j < 50; j += 2) \
+			UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]); \
+		for (j = 0; j < d; j += 4) /* encode the first d bytes as little-endian 32-bit words */ \
+			sph_enc32le_aligned(u.tmp + j, kc->u.narrow[j >> 2]); \
+		memcpy(dst, u.tmp, d); \
+		keccak_init(kc, (unsigned)d << 3); /* leave the context ready for a new message */ \
+	} \
+
+#endif
+
+DEFCLOSE(28, 144)   /* keccak_close28: 28-byte digest, 144-byte blocks */
+DEFCLOSE(32, 136)   /* keccak_close32: 32-byte digest, 136-byte blocks */
+DEFCLOSE(48, 104)   /* keccak_close48: 48-byte digest, 104-byte blocks */
+DEFCLOSE(64, 72)    /* keccak_close64: 64-byte digest, 72-byte blocks */
+
+/* see sph_keccak.h -- cc must point to a sph_keccak_context */
+void sph_keccak224_init(void *cc)
+{
+	sph_keccak_context *kc = cc;
+	keccak_init(kc, 224);	/* 224-bit output */
+}
+
+/* see sph_keccak.h -- absorb len bytes of data into context cc */
+void sph_keccak224(void *cc, const void *data, size_t len)
+{
+	sph_keccak_context *kc = cc;
+	keccak_core(kc, data, len, 144);	/* 144-byte blocks */
+}
+
+/* see sph_keccak.h -- finish with no extra bits; 28 bytes go to dst */
+void sph_keccak224_close(void *cc, void *dst)
+{
+	/* exactly what sph_keccak224_addbits_and_close(cc, 0, 0, dst) does */
+	keccak_close28(cc, 0, 0, dst);
+}
+
+/* see sph_keccak.h -- append the top n bits of ub, then write 28 bytes */
+void sph_keccak224_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst)
+{
+	keccak_close28((sph_keccak_context *)cc, ub, n, dst);
+}
+
+/* see sph_keccak.h -- cc must point to a sph_keccak_context */
+void sph_keccak256_init(void *cc)
+{
+	sph_keccak_context *kc = cc;
+	keccak_init(kc, 256);	/* 256-bit output */
+}
+
+/* see sph_keccak.h -- absorb len bytes of data into context cc */
+void sph_keccak256(void *cc, const void *data, size_t len)
+{
+	sph_keccak_context *kc = cc;
+	keccak_core(kc, data, len, 136);	/* 136-byte blocks */
+}
+
+/* see sph_keccak.h -- finish with no extra bits; 32 bytes go to dst */
+void sph_keccak256_close(void *cc, void *dst)
+{
+	/* exactly what sph_keccak256_addbits_and_close(cc, 0, 0, dst) does */
+	keccak_close32(cc, 0, 0, dst);
+}
+
+/* see sph_keccak.h -- append the top n bits of ub, then write 32 bytes */
+void sph_keccak256_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst)
+{
+	keccak_close32((sph_keccak_context *)cc, ub, n, dst);
+}
+
+/* see sph_keccak.h -- cc must point to a sph_keccak_context */
+void sph_keccak384_init(void *cc)
+{
+	sph_keccak_context *kc = cc;
+	keccak_init(kc, 384);	/* 384-bit output */
+}
+
+/* see sph_keccak.h -- absorb len bytes of data into context cc */
+void sph_keccak384(void *cc, const void *data, size_t len)
+{
+	sph_keccak_context *kc = cc;
+	keccak_core(kc, data, len, 104);	/* 104-byte blocks */
+}
+
+/* see sph_keccak.h -- finish with no extra bits; 48 bytes go to dst */
+void sph_keccak384_close(void *cc, void *dst)
+{
+	/* exactly what sph_keccak384_addbits_and_close(cc, 0, 0, dst) does */
+	keccak_close48(cc, 0, 0, dst);
+}
+
+/* see sph_keccak.h -- append the top n bits of ub, then write 48 bytes */
+void sph_keccak384_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst)
+{
+	keccak_close48((sph_keccak_context *)cc, ub, n, dst);
+}
+
+/* see sph_keccak.h -- cc must point to a sph_keccak_context */
+void sph_keccak512_init(void *cc)
+{
+	sph_keccak_context *kc = cc;
+	keccak_init(kc, 512);	/* 512-bit output */
+}
+
+/* see sph_keccak.h -- absorb len bytes of data into context cc */
+void sph_keccak512(void *cc, const void *data, size_t len)
+{
+	sph_keccak_context *kc = cc;
+	keccak_core(kc, data, len, 72);	/* 72-byte blocks */
+}
+
+/* see sph_keccak.h -- finish with no extra bits; 64 bytes go to dst */
+void sph_keccak512_close(void *cc, void *dst)
+{
+	/* exactly what sph_keccak512_addbits_and_close(cc, 0, 0, dst) does */
+	keccak_close64(cc, 0, 0, dst);
+}
+
+/* see sph_keccak.h -- append the top n bits of ub, then write 64 bytes */
+void sph_keccak512_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst)
+{
+	keccak_close64((sph_keccak_context *)cc, ub, n, dst);
+}
diff --git a/sph/luffa.c b/sph/luffa.c
new file mode 100644
index 00000000..64e2c907
--- /dev/null
+++ b/sph/luffa.c
@@ -0,0 +1,1418 @@
+/* $Id: luffa.c 219 2010-06-08 17:24:41Z tp $ */
+/*
+ * Luffa implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_luffa.h"
+
+#if SPH_64_TRUE && !defined SPH_LUFFA_PARALLEL
+#define SPH_LUFFA_PARALLEL   1 /* default when SPH_64_TRUE is set and the builder has not defined the macro: selects the sph_u64 code paths below */
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+static const sph_u32 V_INIT[5][8] = { /* 5 rows of 8 words; presumably the initial V[][] state -- its users are outside this excerpt */
+	{
+		SPH_C32(0x6d251e69), SPH_C32(0x44b051e0),
+		SPH_C32(0x4eaa6fb4), SPH_C32(0xdbf78465),
+		SPH_C32(0x6e292011), SPH_C32(0x90152df4),
+		SPH_C32(0xee058139), SPH_C32(0xdef610bb)
+	}, {
+		SPH_C32(0xc3b44b95), SPH_C32(0xd9d2f256),
+		SPH_C32(0x70eee9a0), SPH_C32(0xde099fa3),
+		SPH_C32(0x5d9b0557), SPH_C32(0x8fc944b3),
+		SPH_C32(0xcf1ccf0e), SPH_C32(0x746cd581)
+	}, {
+		SPH_C32(0xf7efc89d), SPH_C32(0x5dba5781),
+		SPH_C32(0x04016ce5), SPH_C32(0xad659c05),
+		SPH_C32(0x0306194f), SPH_C32(0x666d1836),
+		SPH_C32(0x24aa230a), SPH_C32(0x8b264ae7)
+	}, {
+		SPH_C32(0x858075d5), SPH_C32(0x36d79cce),
+		SPH_C32(0xe571f7d7), SPH_C32(0x204b1f67),
+		SPH_C32(0x35870c6a), SPH_C32(0x57e9e923),
+		SPH_C32(0x14bcb808), SPH_C32(0x7cde72ce)
+	}, {
+		SPH_C32(0x6c68e9be), SPH_C32(0x5ec41e22),
+		SPH_C32(0xc825b7c7), SPH_C32(0xaffb4363),
+		SPH_C32(0xf5df3999), SPH_C32(0x0fc688f1),
+		SPH_C32(0xb07224cc), SPH_C32(0x03e86cea)
+	}
+};
+
+static const sph_u32 RC00[8] = { /* per-round constants: V00 ^= RC00[r] in the 32-bit round loops */
+	SPH_C32(0x303994a6), SPH_C32(0xc0e65299),
+	SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e),
+	SPH_C32(0x1e00108f), SPH_C32(0x7800423d),
+	SPH_C32(0x8f5b7882), SPH_C32(0x96e1db12)
+};
+
+static const sph_u32 RC04[8] = { /* per-round constants: V04 ^= RC04[r] in the 32-bit round loops */
+	SPH_C32(0xe0337818), SPH_C32(0x441ba90d),
+	SPH_C32(0x7f34d442), SPH_C32(0x9389217f),
+	SPH_C32(0xe5a8bce6), SPH_C32(0x5274baf4),
+	SPH_C32(0x26889ba7), SPH_C32(0x9a226e9d)
+};
+
+static const sph_u32 RC10[8] = { /* per-round constants: V10 ^= RC10[r] in the 32-bit round loops */
+	SPH_C32(0xb6de10ed), SPH_C32(0x70f47aae),
+	SPH_C32(0x0707a3d4), SPH_C32(0x1c1e8f51),
+	SPH_C32(0x707a3d45), SPH_C32(0xaeb28562),
+	SPH_C32(0xbaca1589), SPH_C32(0x40a46f3e)
+};
+
+static const sph_u32 RC14[8] = { /* per-round constants: V14 ^= RC14[r] in the 32-bit round loops */
+	SPH_C32(0x01685f3d), SPH_C32(0x05a17cf4),
+	SPH_C32(0xbd09caca), SPH_C32(0xf4272b28),
+	SPH_C32(0x144ae5cc), SPH_C32(0xfaa7ae2b),
+	SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704)
+};
+
+#if SPH_LUFFA_PARALLEL
+
+static const sph_u64 RCW010[8] = { /* packed pair: RC10[r] in the high 32 bits, RC00[r] in the low 32 bits; W0 ^= RCW010[r] */
+	SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
+	SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
+	SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
+	SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
+};
+
+static const sph_u64 RCW014[8] = { /* packed pair: RC14[r] in the high 32 bits, RC04[r] in the low 32 bits; W4 ^= RCW014[r] */
+	SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
+	SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
+	SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
+	SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
+};
+
+#endif
+
+static const sph_u32 RC20[8] = { /* per-round constants: V20 ^= RC20[r] in the 32-bit round loops */
+	SPH_C32(0xfc20d9d2), SPH_C32(0x34552e25),
+	SPH_C32(0x7ad8818f), SPH_C32(0x8438764a),
+	SPH_C32(0xbb6de032), SPH_C32(0xedb780c8),
+	SPH_C32(0xd9847356), SPH_C32(0xa2c78434)
+};
+
+static const sph_u32 RC24[8] = { /* per-round constants: V24 ^= RC24[r] in the 32-bit round loops */
+	SPH_C32(0xe25e72c1), SPH_C32(0xe623bb72),
+	SPH_C32(0x5c58a4a4), SPH_C32(0x1e38e2e7),
+	SPH_C32(0x78e38b9d), SPH_C32(0x27586719),
+	SPH_C32(0x36eda57f), SPH_C32(0x703aace7)
+};
+
+static const sph_u32 RC30[8] = { /* per-round constants: V30 ^= RC30[r] in the 32-bit round loops */
+	SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
+	SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
+	SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
+	SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
+};
+
+static const sph_u32 RC34[8] = { /* per-round constants: V34 ^= RC34[r] in the 32-bit round loops */
+	SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
+	SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
+	SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
+	SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
+};
+
+#if SPH_LUFFA_PARALLEL
+
+static const sph_u64 RCW230[8] = { /* packed pair: RC30[r] in the high 32 bits, RC20[r] in the low 32 bits; W0 ^= RCW230[r] */
+	SPH_C64(0xb213afa5fc20d9d2), SPH_C64(0xc84ebe9534552e25),
+	SPH_C64(0x4e608a227ad8818f), SPH_C64(0x56d858fe8438764a),
+	SPH_C64(0x343b138fbb6de032), SPH_C64(0xd0ec4e3dedb780c8),
+	SPH_C64(0x2ceb4882d9847356), SPH_C64(0xb3ad2208a2c78434)
+};
+
+
+static const sph_u64 RCW234[8] = { /* packed pair: RC34[r] in the high 32 bits, RC24[r] in the low 32 bits; W4 ^= RCW234[r] */
+	SPH_C64(0xe028c9bfe25e72c1), SPH_C64(0x44756f91e623bb72),
+	SPH_C64(0x7e8fce325c58a4a4), SPH_C64(0x956548be1e38e2e7),
+	SPH_C64(0xfe191be278e38b9d), SPH_C64(0x3cb226e527586719),
+	SPH_C64(0x5944a28e36eda57f), SPH_C64(0xa1c4c355703aace7)
+};
+
+#endif
+
+static const sph_u32 RC40[8] = { /* per-round constants: V40 ^= RC40[r] in the 32-bit round loops of P5 */
+	SPH_C32(0xf0d2e9e3), SPH_C32(0xac11d7fa),
+	SPH_C32(0x1bcb66f2), SPH_C32(0x6f2d9bc9),
+	SPH_C32(0x78602649), SPH_C32(0x8edae952),
+	SPH_C32(0x3b6ba548), SPH_C32(0xedae9520)
+};
+
+static const sph_u32 RC44[8] = { /* per-round constants: V44 ^= RC44[r] in the 32-bit round loops of P5 */
+	SPH_C32(0x5090d577), SPH_C32(0x2d1925ab),
+	SPH_C32(0xb46496ac), SPH_C32(0xd1925ab0),
+	SPH_C32(0x29131ab6), SPH_C32(0x0fc053c3),
+	SPH_C32(0x3f014f0c), SPH_C32(0xfc053c31)
+};
+
+#define DECL_TMP8(w) /* declares eight sph_u32 locals w0..w7; the expansion already ends with ';' */ \
+	sph_u32 w ## 0, w ## 1, w ## 2, w ## 3, w ## 4, w ## 5, w ## 6, w ## 7;
+
+#define M2(d, s)   do { /* d(i+1) = s(i) for i = 0..6, d0 = s7, and s7 is also XORed into d1, d3, d4 */ \
+		sph_u32 tmp = s ## 7; /* saved first, so M2(x, x) works in place */ \
+		d ## 7 = s ## 6; /* descending order: each s word is read before it is overwritten */ \
+		d ## 6 = s ## 5; \
+		d ## 5 = s ## 4; \
+		d ## 4 = s ## 3 ^ tmp; \
+		d ## 3 = s ## 2 ^ tmp; \
+		d ## 2 = s ## 1; \
+		d ## 1 = s ## 0 ^ tmp; \
+		d ## 0 = tmp; \
+	} while (0)
+
+#define XOR(d, s1, s2)   do { /* d(i) = s1(i) ^ s2(i) for i = 0..7; d may be the same prefix as s1 or s2 */ \
+		d ## 0 = s1 ## 0 ^ s2 ## 0; \
+		d ## 1 = s1 ## 1 ^ s2 ## 1; \
+		d ## 2 = s1 ## 2 ^ s2 ## 2; \
+		d ## 3 = s1 ## 3 ^ s2 ## 3; \
+		d ## 4 = s1 ## 4 ^ s2 ## 4; \
+		d ## 5 = s1 ## 5 ^ s2 ## 5; \
+		d ## 6 = s1 ## 6 ^ s2 ## 6; \
+		d ## 7 = s1 ## 7 ^ s2 ## 7; \
+	} while (0)
+
+#if SPH_LUFFA_PARALLEL
+
+#define SUB_CRUMB_GEN(a0, a1, a2, a3, width)   do { /* in-place AND/OR/XOR/NOT network on four sph_u<width> words; presumably Luffa's SubCrumb step (name-based) */ \
+		sph_u ## width tmp; \
+		tmp = (a0); \
+		(a0) |= (a1); \
+		(a2) ^= (a3); \
+		(a1) = SPH_T ## width(~(a1)); \
+		(a0) ^= (a3); \
+		(a3) &= tmp; \
+		(a1) ^= (a3); \
+		(a3) ^= (a2); \
+		(a2) &= (a0); \
+		(a0) = SPH_T ## width(~(a0)); \
+		(a2) ^= (a1); \
+		(a1) |= (a3); \
+		tmp ^= (a1); \
+		(a3) ^= (a2); \
+		(a2) &= (a1); \
+		(a1) ^= (a0); \
+		(a0) = tmp; \
+	} while (0)
+
+#define SUB_CRUMB(a0, a1, a2, a3)    SUB_CRUMB_GEN(a0, a1, a2, a3, 32) /* 32-bit instance */
+#define SUB_CRUMBW(a0, a1, a2, a3)   SUB_CRUMB_GEN(a0, a1, a2, a3, 64) /* 64-bit instance, used on packed row pairs */
+
+
+#if 0 /* compiled out: an all-64-bit MIX_WORDW built on ROL32W; the active MIX_WORDW is defined right after this block */
+
+#define ROL32W(x, n)   SPH_T64( \
+                       (((x) << (n)) \
+                       & ~((SPH_C64(0xFFFFFFFF) >> (32 - (n))) << 32)) \
+                       | (((x) >> (32 - (n))) \
+                       & ~((SPH_C64(0xFFFFFFFF) >> (n)) << (n))))
+
+#define MIX_WORDW(u, v)   do { \
+		(v) ^= (u); \
+		(u) = ROL32W((u), 2) ^ (v); \
+		(v) = ROL32W((v), 14) ^ (u); \
+		(u) = ROL32W((u), 10) ^ (v); \
+		(v) = ROL32W((v), 1); \
+	} while (0)
+
+#endif
+
+#define MIX_WORDW(u, v)   do { /* runs the MIX_WORD sequence independently on the low and high 32-bit halves of the 64-bit u and v */ \
+		sph_u32 ul, uh, vl, vh; \
+		(v) ^= (u); /* first MIX_WORD step, done on both halves at once */ \
+		ul = SPH_T32((sph_u32)(u)); \
+		uh = SPH_T32((sph_u32)((u) >> 32)); \
+		vl = SPH_T32((sph_u32)(v)); \
+		vh = SPH_T32((sph_u32)((v) >> 32)); \
+		ul = SPH_ROTL32(ul, 2) ^ vl; \
+		vl = SPH_ROTL32(vl, 14) ^ ul; \
+		ul = SPH_ROTL32(ul, 10) ^ vl; \
+		vl = SPH_ROTL32(vl, 1); \
+		uh = SPH_ROTL32(uh, 2) ^ vh; \
+		vh = SPH_ROTL32(vh, 14) ^ uh; \
+		uh = SPH_ROTL32(uh, 10) ^ vh; \
+		vh = SPH_ROTL32(vh, 1); \
+		(u) = (sph_u64)ul | ((sph_u64)uh << 32); /* repack the halves */ \
+		(v) = (sph_u64)vl | ((sph_u64)vh << 32); \
+	} while (0)
+
+#else
+
+#define SUB_CRUMB(a0, a1, a2, a3)   do { /* non-parallel build: same statement sequence as SUB_CRUMB_GEN at width 32 */ \
+		sph_u32 tmp; \
+		tmp = (a0); \
+		(a0) |= (a1); \
+		(a2) ^= (a3); \
+		(a1) = SPH_T32(~(a1)); \
+		(a0) ^= (a3); \
+		(a3) &= tmp; \
+		(a1) ^= (a3); \
+		(a3) ^= (a2); \
+		(a2) &= (a0); \
+		(a0) = SPH_T32(~(a0)); \
+		(a2) ^= (a1); \
+		(a1) |= (a3); \
+		tmp ^= (a1); \
+		(a3) ^= (a2); \
+		(a2) &= (a1); \
+		(a1) ^= (a0); \
+		(a0) = tmp; \
+	} while (0)
+
+#endif
+
+#define MIX_WORD(u, v)   do { /* in-place XOR/rotate mix of two 32-bit words; both u and v are rewritten */ \
+		(v) ^= (u); \
+		(u) = SPH_ROTL32((u), 2) ^ (v); \
+		(v) = SPH_ROTL32((v), 14) ^ (u); \
+		(u) = SPH_ROTL32((u), 10) ^ (v); \
+		(v) = SPH_ROTL32((v), 1); \
+	} while (0)
+
+#define DECL_STATE3 /* declares locals Vji (row j = 0..2, word i = 0..7); the expansion already ends with ';' */ \
+	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
+	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
+	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27;
+
+#define READ_STATE3(state)   do { /* copies (state)->V[j][i] into the local Vji for j = 0..2, i = 0..7 */ \
+		V00 = (state)->V[0][0]; \
+		V01 = (state)->V[0][1]; \
+		V02 = (state)->V[0][2]; \
+		V03 = (state)->V[0][3]; \
+		V04 = (state)->V[0][4]; \
+		V05 = (state)->V[0][5]; \
+		V06 = (state)->V[0][6]; \
+		V07 = (state)->V[0][7]; \
+		V10 = (state)->V[1][0]; \
+		V11 = (state)->V[1][1]; \
+		V12 = (state)->V[1][2]; \
+		V13 = (state)->V[1][3]; \
+		V14 = (state)->V[1][4]; \
+		V15 = (state)->V[1][5]; \
+		V16 = (state)->V[1][6]; \
+		V17 = (state)->V[1][7]; \
+		V20 = (state)->V[2][0]; \
+		V21 = (state)->V[2][1]; \
+		V22 = (state)->V[2][2]; \
+		V23 = (state)->V[2][3]; \
+		V24 = (state)->V[2][4]; \
+		V25 = (state)->V[2][5]; \
+		V26 = (state)->V[2][6]; \
+		V27 = (state)->V[2][7]; \
+	} while (0)
+
+#define WRITE_STATE3(state)   do { /* stores the local Vji back into (state)->V[j][i] for j = 0..2, i = 0..7 */ \
+		(state)->V[0][0] = V00; \
+		(state)->V[0][1] = V01; \
+		(state)->V[0][2] = V02; \
+		(state)->V[0][3] = V03; \
+		(state)->V[0][4] = V04; \
+		(state)->V[0][5] = V05; \
+		(state)->V[0][6] = V06; \
+		(state)->V[0][7] = V07; \
+		(state)->V[1][0] = V10; \
+		(state)->V[1][1] = V11; \
+		(state)->V[1][2] = V12; \
+		(state)->V[1][3] = V13; \
+		(state)->V[1][4] = V14; \
+		(state)->V[1][5] = V15; \
+		(state)->V[1][6] = V16; \
+		(state)->V[1][7] = V17; \
+		(state)->V[2][0] = V20; \
+		(state)->V[2][1] = V21; \
+		(state)->V[2][2] = V22; \
+		(state)->V[2][3] = V23; \
+		(state)->V[2][4] = V24; \
+		(state)->V[2][5] = V25; \
+		(state)->V[2][6] = V26; \
+		(state)->V[2][7] = V27; \
+	} while (0)
+
+#define MI3   do { /* mixes the 32-byte block at `buf` into rows V0..V2; uses the caller's locals `buf` and Vji by name */ \
+		DECL_TMP8(M) \
+		DECL_TMP8(a) \
+		M0 = sph_dec32be_aligned(buf +  0); /* block read as eight big-endian 32-bit words */ \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		XOR(a, V0, V1); \
+		XOR(a, a, V2); \
+		M2(a, a); /* a = M2(V0 ^ V1 ^ V2), added to every row below */ \
+		XOR(V0, a, V0); \
+		XOR(V0, M, V0); \
+		M2(M, M); /* each further row receives M with one more M2 applied */ \
+		XOR(V1, a, V1); \
+		XOR(V1, M, V1); \
+		M2(M, M); \
+		XOR(V2, a, V2); \
+		XOR(V2, M, V2); \
+	} while (0)
+
+#define TWEAK3   do { /* rotates words 4..7 of row j left by j bits (rows 1 and 2; row 0 is untouched) */ \
+		V14 = SPH_ROTL32(V14, 1); \
+		V15 = SPH_ROTL32(V15, 1); \
+		V16 = SPH_ROTL32(V16, 1); \
+		V17 = SPH_ROTL32(V17, 1); \
+		V24 = SPH_ROTL32(V24, 2); \
+		V25 = SPH_ROTL32(V25, 2); \
+		V26 = SPH_ROTL32(V26, 2); \
+		V27 = SPH_ROTL32(V27, 2); \
+	} while (0)
+
+#if SPH_LUFFA_PARALLEL
+
+#define P3   do { /* parallel build: TWEAK3, 8 rounds on rows 0+1 packed into 64-bit words, then 8 rounds on row 2 in 32-bit words */ \
+		int r; \
+		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
+		TWEAK3; \
+		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); /* row 0 in the low half, row 1 in the high half */ \
+		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
+		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
+		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
+		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
+		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
+		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
+		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); /* note the rotated argument order 5, 6, 7, 4 */ \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW010[r]; \
+			W4 ^= RCW014[r]; \
+		} \
+		V00 = SPH_T32((sph_u32)W0); /* unpack back into rows 0 and 1 */ \
+		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V01 = SPH_T32((sph_u32)W1); \
+		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V02 = SPH_T32((sph_u32)W2); \
+		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V03 = SPH_T32((sph_u32)W3); \
+		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V04 = SPH_T32((sph_u32)W4); \
+		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V05 = SPH_T32((sph_u32)W5); \
+		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V06 = SPH_T32((sph_u32)W6); \
+		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V07 = SPH_T32((sph_u32)W7); \
+		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
+		for (r = 0; r < 8; r ++) { /* row 2 has no partner row, so it stays in 32-bit words */ \
+			SUB_CRUMB(V20, V21, V22, V23); \
+			SUB_CRUMB(V25, V26, V27, V24); \
+			MIX_WORD(V20, V24); \
+			MIX_WORD(V21, V25); \
+			MIX_WORD(V22, V26); \
+			MIX_WORD(V23, V27); \
+			V20 ^= RC20[r]; \
+			V24 ^= RC24[r]; \
+		} \
+	} while (0)
+
+#else
+
+#define P3   do { /* non-parallel build: TWEAK3, then 8 rounds on each of rows 0, 1, 2 with that row's RCj0/RCj4 constants */ \
+		int r; \
+		TWEAK3; \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V00, V01, V02, V03); \
+			SUB_CRUMB(V05, V06, V07, V04); /* note the rotated argument order 5, 6, 7, 4 */ \
+			MIX_WORD(V00, V04); \
+			MIX_WORD(V01, V05); \
+			MIX_WORD(V02, V06); \
+			MIX_WORD(V03, V07); \
+			V00 ^= RC00[r]; \
+			V04 ^= RC04[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V10, V11, V12, V13); \
+			SUB_CRUMB(V15, V16, V17, V14); \
+			MIX_WORD(V10, V14); \
+			MIX_WORD(V11, V15); \
+			MIX_WORD(V12, V16); \
+			MIX_WORD(V13, V17); \
+			V10 ^= RC10[r]; \
+			V14 ^= RC14[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V20, V21, V22, V23); \
+			SUB_CRUMB(V25, V26, V27, V24); \
+			MIX_WORD(V20, V24); \
+			MIX_WORD(V21, V25); \
+			MIX_WORD(V22, V26); \
+			MIX_WORD(V23, V27); \
+			V20 ^= RC20[r]; \
+			V24 ^= RC24[r]; \
+		} \
+	} while (0)
+
+#endif
+
+#define DECL_STATE4 /* declares locals Vji (row j = 0..3, word i = 0..7); the expansion already ends with ';' */ \
+	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
+	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
+	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \
+	sph_u32 V30, V31, V32, V33, V34, V35, V36, V37;
+
+#define READ_STATE4(state)   do { /* copies (state)->V[j][i] into the local Vji for j = 0..3, i = 0..7 */ \
+		V00 = (state)->V[0][0]; \
+		V01 = (state)->V[0][1]; \
+		V02 = (state)->V[0][2]; \
+		V03 = (state)->V[0][3]; \
+		V04 = (state)->V[0][4]; \
+		V05 = (state)->V[0][5]; \
+		V06 = (state)->V[0][6]; \
+		V07 = (state)->V[0][7]; \
+		V10 = (state)->V[1][0]; \
+		V11 = (state)->V[1][1]; \
+		V12 = (state)->V[1][2]; \
+		V13 = (state)->V[1][3]; \
+		V14 = (state)->V[1][4]; \
+		V15 = (state)->V[1][5]; \
+		V16 = (state)->V[1][6]; \
+		V17 = (state)->V[1][7]; \
+		V20 = (state)->V[2][0]; \
+		V21 = (state)->V[2][1]; \
+		V22 = (state)->V[2][2]; \
+		V23 = (state)->V[2][3]; \
+		V24 = (state)->V[2][4]; \
+		V25 = (state)->V[2][5]; \
+		V26 = (state)->V[2][6]; \
+		V27 = (state)->V[2][7]; \
+		V30 = (state)->V[3][0]; \
+		V31 = (state)->V[3][1]; \
+		V32 = (state)->V[3][2]; \
+		V33 = (state)->V[3][3]; \
+		V34 = (state)->V[3][4]; \
+		V35 = (state)->V[3][5]; \
+		V36 = (state)->V[3][6]; \
+		V37 = (state)->V[3][7]; \
+	} while (0)
+
+#define WRITE_STATE4(state)   do { /* stores the local Vji back into (state)->V[j][i] for j = 0..3, i = 0..7 */ \
+		(state)->V[0][0] = V00; \
+		(state)->V[0][1] = V01; \
+		(state)->V[0][2] = V02; \
+		(state)->V[0][3] = V03; \
+		(state)->V[0][4] = V04; \
+		(state)->V[0][5] = V05; \
+		(state)->V[0][6] = V06; \
+		(state)->V[0][7] = V07; \
+		(state)->V[1][0] = V10; \
+		(state)->V[1][1] = V11; \
+		(state)->V[1][2] = V12; \
+		(state)->V[1][3] = V13; \
+		(state)->V[1][4] = V14; \
+		(state)->V[1][5] = V15; \
+		(state)->V[1][6] = V16; \
+		(state)->V[1][7] = V17; \
+		(state)->V[2][0] = V20; \
+		(state)->V[2][1] = V21; \
+		(state)->V[2][2] = V22; \
+		(state)->V[2][3] = V23; \
+		(state)->V[2][4] = V24; \
+		(state)->V[2][5] = V25; \
+		(state)->V[2][6] = V26; \
+		(state)->V[2][7] = V27; \
+		(state)->V[3][0] = V30; \
+		(state)->V[3][1] = V31; \
+		(state)->V[3][2] = V32; \
+		(state)->V[3][3] = V33; \
+		(state)->V[3][4] = V34; \
+		(state)->V[3][5] = V35; \
+		(state)->V[3][6] = V36; \
+		(state)->V[3][7] = V37; \
+	} while (0)
+
+#define MI4   do { /* mixes the 32-byte block at `buf` into rows V0..V3; uses the caller's locals `buf` and Vji by name */ \
+		DECL_TMP8(M) \
+		DECL_TMP8(a) \
+		DECL_TMP8(b) \
+		M0 = sph_dec32be_aligned(buf +  0); /* block read as eight big-endian 32-bit words */ \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		XOR(a, V0, V1); \
+		XOR(b, V2, V3); \
+		XOR(a, a, b); \
+		M2(a, a); /* a = M2(V0 ^ V1 ^ V2 ^ V3), added to every row below */ \
+		XOR(V0, a, V0); \
+		XOR(V1, a, V1); \
+		XOR(V2, a, V2); \
+		XOR(V3, a, V3); \
+		M2(b, V0); /* b is built from V0 and V3 before V3 is overwritten; it becomes the new V0 below */ \
+		XOR(b, b, V3); \
+		M2(V3, V3); \
+		XOR(V3, V3, V2); \
+		M2(V2, V2); \
+		XOR(V2, V2, V1); \
+		M2(V1, V1); \
+		XOR(V1, V1, V0); \
+		XOR(V0, b, M); \
+		M2(M, M); /* each further row receives M with one more M2 applied */ \
+		XOR(V1, V1, M); \
+		M2(M, M); \
+		XOR(V2, V2, M); \
+		M2(M, M); \
+		XOR(V3, V3, M); \
+	} while (0)
+
+#define TWEAK4   do { /* rotates words 4..7 of row j left by j bits (rows 1..3; row 0 is untouched) */ \
+		V14 = SPH_ROTL32(V14, 1); \
+		V15 = SPH_ROTL32(V15, 1); \
+		V16 = SPH_ROTL32(V16, 1); \
+		V17 = SPH_ROTL32(V17, 1); \
+		V24 = SPH_ROTL32(V24, 2); \
+		V25 = SPH_ROTL32(V25, 2); \
+		V26 = SPH_ROTL32(V26, 2); \
+		V27 = SPH_ROTL32(V27, 2); \
+		V34 = SPH_ROTL32(V34, 3); \
+		V35 = SPH_ROTL32(V35, 3); \
+		V36 = SPH_ROTL32(V36, 3); \
+		V37 = SPH_ROTL32(V37, 3); \
+	} while (0)
+
+#if SPH_LUFFA_PARALLEL
+
+#define P4   do { /* parallel build: TWEAK4, 8 rounds on rows 0+1 packed into 64-bit words, then 8 rounds on rows 2+3 packed the same way */ \
+		int r; \
+		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
+		TWEAK4; \
+		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); /* row 0 in the low half, row 1 in the high half */ \
+		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
+		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
+		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
+		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
+		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
+		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
+		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); /* note the rotated argument order 5, 6, 7, 4 */ \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW010[r]; \
+			W4 ^= RCW014[r]; \
+		} \
+		V00 = SPH_T32((sph_u32)W0); /* unpack back into rows 0 and 1 */ \
+		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V01 = SPH_T32((sph_u32)W1); \
+		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V02 = SPH_T32((sph_u32)W2); \
+		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V03 = SPH_T32((sph_u32)W3); \
+		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V04 = SPH_T32((sph_u32)W4); \
+		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V05 = SPH_T32((sph_u32)W5); \
+		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V06 = SPH_T32((sph_u32)W6); \
+		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V07 = SPH_T32((sph_u32)W7); \
+		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
+		W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); /* W0..W7 reused: row 2 in the low half, row 3 in the high half */ \
+		W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \
+		W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \
+		W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \
+		W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \
+		W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \
+		W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \
+		W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW230[r]; \
+			W4 ^= RCW234[r]; \
+		} \
+		V20 = SPH_T32((sph_u32)W0); /* unpack back into rows 2 and 3 */ \
+		V30 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V21 = SPH_T32((sph_u32)W1); \
+		V31 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V22 = SPH_T32((sph_u32)W2); \
+		V32 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V23 = SPH_T32((sph_u32)W3); \
+		V33 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V24 = SPH_T32((sph_u32)W4); \
+		V34 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V25 = SPH_T32((sph_u32)W5); \
+		V35 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V26 = SPH_T32((sph_u32)W6); \
+		V36 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V27 = SPH_T32((sph_u32)W7); \
+		V37 = SPH_T32((sph_u32)(W7 >> 32)); \
+	} while (0)
+
+#else
+
+#define P4   do { /* non-parallel build: TWEAK4, then 8 rounds on each of rows 0..3 with that row's RCj0/RCj4 constants */ \
+		int r; \
+		TWEAK4; \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V00, V01, V02, V03); \
+			SUB_CRUMB(V05, V06, V07, V04); /* note the rotated argument order 5, 6, 7, 4 */ \
+			MIX_WORD(V00, V04); \
+			MIX_WORD(V01, V05); \
+			MIX_WORD(V02, V06); \
+			MIX_WORD(V03, V07); \
+			V00 ^= RC00[r]; \
+			V04 ^= RC04[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V10, V11, V12, V13); \
+			SUB_CRUMB(V15, V16, V17, V14); \
+			MIX_WORD(V10, V14); \
+			MIX_WORD(V11, V15); \
+			MIX_WORD(V12, V16); \
+			MIX_WORD(V13, V17); \
+			V10 ^= RC10[r]; \
+			V14 ^= RC14[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V20, V21, V22, V23); \
+			SUB_CRUMB(V25, V26, V27, V24); \
+			MIX_WORD(V20, V24); \
+			MIX_WORD(V21, V25); \
+			MIX_WORD(V22, V26); \
+			MIX_WORD(V23, V27); \
+			V20 ^= RC20[r]; \
+			V24 ^= RC24[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V30, V31, V32, V33); \
+			SUB_CRUMB(V35, V36, V37, V34); \
+			MIX_WORD(V30, V34); \
+			MIX_WORD(V31, V35); \
+			MIX_WORD(V32, V36); \
+			MIX_WORD(V33, V37); \
+			V30 ^= RC30[r]; \
+			V34 ^= RC34[r]; \
+		} \
+	} while (0)
+
+#endif
+
+#define DECL_STATE5 /* declares locals Vji (row j = 0..4, word i = 0..7); the expansion already ends with ';' */ \
+	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
+	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
+	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \
+	sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; \
+	sph_u32 V40, V41, V42, V43, V44, V45, V46, V47;
+
+#define READ_STATE5(state)   do { /* copies (state)->V[j][i] into the local Vji for j = 0..4, i = 0..7 */ \
+		V00 = (state)->V[0][0]; \
+		V01 = (state)->V[0][1]; \
+		V02 = (state)->V[0][2]; \
+		V03 = (state)->V[0][3]; \
+		V04 = (state)->V[0][4]; \
+		V05 = (state)->V[0][5]; \
+		V06 = (state)->V[0][6]; \
+		V07 = (state)->V[0][7]; \
+		V10 = (state)->V[1][0]; \
+		V11 = (state)->V[1][1]; \
+		V12 = (state)->V[1][2]; \
+		V13 = (state)->V[1][3]; \
+		V14 = (state)->V[1][4]; \
+		V15 = (state)->V[1][5]; \
+		V16 = (state)->V[1][6]; \
+		V17 = (state)->V[1][7]; \
+		V20 = (state)->V[2][0]; \
+		V21 = (state)->V[2][1]; \
+		V22 = (state)->V[2][2]; \
+		V23 = (state)->V[2][3]; \
+		V24 = (state)->V[2][4]; \
+		V25 = (state)->V[2][5]; \
+		V26 = (state)->V[2][6]; \
+		V27 = (state)->V[2][7]; \
+		V30 = (state)->V[3][0]; \
+		V31 = (state)->V[3][1]; \
+		V32 = (state)->V[3][2]; \
+		V33 = (state)->V[3][3]; \
+		V34 = (state)->V[3][4]; \
+		V35 = (state)->V[3][5]; \
+		V36 = (state)->V[3][6]; \
+		V37 = (state)->V[3][7]; \
+		V40 = (state)->V[4][0]; \
+		V41 = (state)->V[4][1]; \
+		V42 = (state)->V[4][2]; \
+		V43 = (state)->V[4][3]; \
+		V44 = (state)->V[4][4]; \
+		V45 = (state)->V[4][5]; \
+		V46 = (state)->V[4][6]; \
+		V47 = (state)->V[4][7]; \
+	} while (0)
+
+#define WRITE_STATE5(state)   do { /* stores the local Vji back into (state)->V[j][i] for j = 0..4, i = 0..7 */ \
+		(state)->V[0][0] = V00; \
+		(state)->V[0][1] = V01; \
+		(state)->V[0][2] = V02; \
+		(state)->V[0][3] = V03; \
+		(state)->V[0][4] = V04; \
+		(state)->V[0][5] = V05; \
+		(state)->V[0][6] = V06; \
+		(state)->V[0][7] = V07; \
+		(state)->V[1][0] = V10; \
+		(state)->V[1][1] = V11; \
+		(state)->V[1][2] = V12; \
+		(state)->V[1][3] = V13; \
+		(state)->V[1][4] = V14; \
+		(state)->V[1][5] = V15; \
+		(state)->V[1][6] = V16; \
+		(state)->V[1][7] = V17; \
+		(state)->V[2][0] = V20; \
+		(state)->V[2][1] = V21; \
+		(state)->V[2][2] = V22; \
+		(state)->V[2][3] = V23; \
+		(state)->V[2][4] = V24; \
+		(state)->V[2][5] = V25; \
+		(state)->V[2][6] = V26; \
+		(state)->V[2][7] = V27; \
+		(state)->V[3][0] = V30; \
+		(state)->V[3][1] = V31; \
+		(state)->V[3][2] = V32; \
+		(state)->V[3][3] = V33; \
+		(state)->V[3][4] = V34; \
+		(state)->V[3][5] = V35; \
+		(state)->V[3][6] = V36; \
+		(state)->V[3][7] = V37; \
+		(state)->V[4][0] = V40; \
+		(state)->V[4][1] = V41; \
+		(state)->V[4][2] = V42; \
+		(state)->V[4][3] = V43; \
+		(state)->V[4][4] = V44; \
+		(state)->V[4][5] = V45; \
+		(state)->V[4][6] = V46; \
+		(state)->V[4][7] = V47; \
+	} while (0)
+
+#define MI5   do { /* mixes the 32-byte block at `buf` into rows V0..V4; uses the caller's locals `buf` and Vji by name */ \
+		DECL_TMP8(M) \
+		DECL_TMP8(a) \
+		DECL_TMP8(b) \
+		M0 = sph_dec32be_aligned(buf +  0); /* block read as eight big-endian 32-bit words */ \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		XOR(a, V0, V1); \
+		XOR(b, V2, V3); \
+		XOR(a, a, b); \
+		XOR(a, a, V4); \
+		M2(a, a); /* a = M2(V0 ^ V1 ^ V2 ^ V3 ^ V4), added to every row below */ \
+		XOR(V0, a, V0); \
+		XOR(V1, a, V1); \
+		XOR(V2, a, V2); \
+		XOR(V3, a, V3); \
+		XOR(V4, a, V4); \
+		M2(b, V0); /* first sweep: each row becomes M2(row) ^ next row; b stands in for the new row 0 */ \
+		XOR(b, b, V1); \
+		M2(V1, V1); \
+		XOR(V1, V1, V2); \
+		M2(V2, V2); \
+		XOR(V2, V2, V3); \
+		M2(V3, V3); \
+		XOR(V3, V3, V4); \
+		M2(V4, V4); \
+		XOR(V4, V4, V0); \
+		M2(V0, b); /* second sweep: each row becomes M2(row) ^ previous row, wrapping around */ \
+		XOR(V0, V0, V4); \
+		M2(V4, V4); \
+		XOR(V4, V4, V3); \
+		M2(V3, V3); \
+		XOR(V3, V3, V2); \
+		M2(V2, V2); \
+		XOR(V2, V2, V1); \
+		M2(V1, V1); \
+		XOR(V1, V1, b); \
+		XOR(V0, V0, M); \
+		M2(M, M); /* each further row receives M with one more M2 applied */ \
+		XOR(V1, V1, M); \
+		M2(M, M); \
+		XOR(V2, V2, M); \
+		M2(M, M); \
+		XOR(V3, V3, M); \
+		M2(M, M); \
+		XOR(V4, V4, M); \
+	} while (0)
+
+#define TWEAK5   do { /* rotates words 4..7 of row j left by j bits (rows 1..4; row 0 is untouched) */ \
+		V14 = SPH_ROTL32(V14, 1); \
+		V15 = SPH_ROTL32(V15, 1); \
+		V16 = SPH_ROTL32(V16, 1); \
+		V17 = SPH_ROTL32(V17, 1); \
+		V24 = SPH_ROTL32(V24, 2); \
+		V25 = SPH_ROTL32(V25, 2); \
+		V26 = SPH_ROTL32(V26, 2); \
+		V27 = SPH_ROTL32(V27, 2); \
+		V34 = SPH_ROTL32(V34, 3); \
+		V35 = SPH_ROTL32(V35, 3); \
+		V36 = SPH_ROTL32(V36, 3); \
+		V37 = SPH_ROTL32(V37, 3); \
+		V44 = SPH_ROTL32(V44, 4); \
+		V45 = SPH_ROTL32(V45, 4); \
+		V46 = SPH_ROTL32(V46, 4); \
+		V47 = SPH_ROTL32(V47, 4); \
+	} while (0)
+
+#if SPH_LUFFA_PARALLEL
+
+#define P5   do { /* parallel build: TWEAK5, 8 rounds on packed rows 0+1, 8 rounds on packed rows 2+3, then 8 rounds on row 4 in 32-bit words */ \
+		int r; \
+		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
+		TWEAK5; \
+		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); /* row 0 in the low half, row 1 in the high half */ \
+		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
+		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
+		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
+		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
+		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
+		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
+		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); /* note the rotated argument order 5, 6, 7, 4 */ \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW010[r]; \
+			W4 ^= RCW014[r]; \
+		} \
+		V00 = SPH_T32((sph_u32)W0); /* unpack back into rows 0 and 1 */ \
+		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V01 = SPH_T32((sph_u32)W1); \
+		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V02 = SPH_T32((sph_u32)W2); \
+		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V03 = SPH_T32((sph_u32)W3); \
+		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V04 = SPH_T32((sph_u32)W4); \
+		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V05 = SPH_T32((sph_u32)W5); \
+		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V06 = SPH_T32((sph_u32)W6); \
+		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V07 = SPH_T32((sph_u32)W7); \
+		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
+		W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); /* W0..W7 reused: row 2 in the low half, row 3 in the high half */ \
+		W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \
+		W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \
+		W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \
+		W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \
+		W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \
+		W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \
+		W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW230[r]; \
+			W4 ^= RCW234[r]; \
+		} \
+		V20 = SPH_T32((sph_u32)W0); /* unpack back into rows 2 and 3 */ \
+		V30 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V21 = SPH_T32((sph_u32)W1); \
+		V31 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V22 = SPH_T32((sph_u32)W2); \
+		V32 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V23 = SPH_T32((sph_u32)W3); \
+		V33 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V24 = SPH_T32((sph_u32)W4); \
+		V34 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V25 = SPH_T32((sph_u32)W5); \
+		V35 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V26 = SPH_T32((sph_u32)W6); \
+		V36 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V27 = SPH_T32((sph_u32)W7); \
+		V37 = SPH_T32((sph_u32)(W7 >> 32)); \
+		for (r = 0; r < 8; r ++) { /* row 4 has no partner row, so it stays in 32-bit words */ \
+			SUB_CRUMB(V40, V41, V42, V43); \
+			SUB_CRUMB(V45, V46, V47, V44); \
+			MIX_WORD(V40, V44); \
+			MIX_WORD(V41, V45); \
+			MIX_WORD(V42, V46); \
+			MIX_WORD(V43, V47); \
+			V40 ^= RC40[r]; \
+			V44 ^= RC44[r]; \
+		} \
+	} while (0)
+
+#else
+
+#define P5   do { /* non-parallel build: TWEAK5, then 8 rounds on each of rows 0..4 with that row's RCj0/RCj4 constants */ \
+		int r; \
+		TWEAK5; \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V00, V01, V02, V03); \
+			SUB_CRUMB(V05, V06, V07, V04); /* note the rotated argument order 5, 6, 7, 4 */ \
+			MIX_WORD(V00, V04); \
+			MIX_WORD(V01, V05); \
+			MIX_WORD(V02, V06); \
+			MIX_WORD(V03, V07); \
+			V00 ^= RC00[r]; \
+			V04 ^= RC04[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V10, V11, V12, V13); \
+			SUB_CRUMB(V15, V16, V17, V14); \
+			MIX_WORD(V10, V14); \
+			MIX_WORD(V11, V15); \
+			MIX_WORD(V12, V16); \
+			MIX_WORD(V13, V17); \
+			V10 ^= RC10[r]; \
+			V14 ^= RC14[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V20, V21, V22, V23); \
+			SUB_CRUMB(V25, V26, V27, V24); \
+			MIX_WORD(V20, V24); \
+			MIX_WORD(V21, V25); \
+			MIX_WORD(V22, V26); \
+			MIX_WORD(V23, V27); \
+			V20 ^= RC20[r]; \
+			V24 ^= RC24[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V30, V31, V32, V33); \
+			SUB_CRUMB(V35, V36, V37, V34); \
+			MIX_WORD(V30, V34); \
+			MIX_WORD(V31, V35); \
+			MIX_WORD(V32, V36); \
+			MIX_WORD(V33, V37); \
+			V30 ^= RC30[r]; \
+			V34 ^= RC34[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { \
+			SUB_CRUMB(V40, V41, V42, V43); \
+			SUB_CRUMB(V45, V46, V47, V44); \
+			MIX_WORD(V40, V44); \
+			MIX_WORD(V41, V45); \
+			MIX_WORD(V42, V46); \
+			MIX_WORD(V43, V47); \
+			V40 ^= RC40[r]; \
+			V44 ^= RC44[r]; \
+		} \
+	} while (0)
+
+#endif
+
+/* Buffer `len` input bytes; run MI3 + P3 each time sc->buf fills up. */
+static void
+luffa3(sph_luffa224_context *sc, const void *data, size_t len)
+{
+	/* `buf` and the Vxy locals are used by name inside MI3/P3. */
+	unsigned char *buf = sc->buf;
+	const unsigned char *src = data;
+	size_t fill = sc->ptr;
+	DECL_STATE3
+
+	/* Too little to complete a block: stash it, leave V untouched. */
+	if (len < sizeof sc->buf - fill) {
+		memcpy(buf + fill, src, len);
+		sc->ptr = fill + len;
+		return;
+	}
+
+	READ_STATE3(sc);
+	while (len != 0) {
+		size_t take = sizeof sc->buf - fill;
+
+		if (take > len)
+			take = len;
+		memcpy(buf + fill, src, take);
+		src += take;
+		len -= take;
+		fill += take;
+		if (fill != sizeof sc->buf)
+			continue;
+		MI3;
+		P3;
+		fill = 0;
+	}
+	WRITE_STATE3(sc);
+	sc->ptr = fill;
+}
+
+/* Pad the pending block, run two MI3+P3 passes, emit 7 or 8 output words. */
+static void
+luffa3_close(sph_luffa224_context *sc, unsigned ub, unsigned n,
+	void *dst, unsigned out_size_w32)
+{
+	unsigned char *buf = sc->buf;
+	unsigned char *out = dst;
+	size_t fill = sc->ptr;
+	unsigned marker = 0x80 >> n;
+	int pass;
+	DECL_STATE3
+
+	/* Keep the top n bits of ub, append a single 1 bit, zero the rest. */
+	buf[fill ++] = ((ub & -marker) | marker) & 0xFF;
+	memset(buf + fill, 0, sizeof sc->buf - fill);
+	READ_STATE3(sc);
+	/* Pass 0 absorbs the padded block, pass 1 an all-zero block. */
+	for (pass = 0; pass != 2; pass ++) {
+		MI3;
+		P3;
+		memset(buf, 0, sizeof sc->buf);
+	}
+	sph_enc32be(out +  0, V00 ^ V10 ^ V20);
+	sph_enc32be(out +  4, V01 ^ V11 ^ V21);
+	sph_enc32be(out +  8, V02 ^ V12 ^ V22);
+	sph_enc32be(out + 12, V03 ^ V13 ^ V23);
+	sph_enc32be(out + 16, V04 ^ V14 ^ V24);
+	sph_enc32be(out + 20, V05 ^ V15 ^ V25);
+	sph_enc32be(out + 24, V06 ^ V16 ^ V26);
+	if (out_size_w32 > 7)
+		sph_enc32be(out + 28, V07 ^ V17 ^ V27);
+}
+
+/* Buffer `len` input bytes; run MI4 + P4 each time sc->buf fills up. */
+static void
+luffa4(sph_luffa384_context *sc, const void *data, size_t len)
+{
+	/* `buf` and the Vxy locals are used by name inside MI4/P4. */
+	unsigned char *buf = sc->buf;
+	const unsigned char *src = data;
+	size_t fill = sc->ptr;
+	DECL_STATE4
+
+	/* Too little to complete a block: stash it, leave V untouched. */
+	if (len < sizeof sc->buf - fill) {
+		memcpy(buf + fill, src, len);
+		sc->ptr = fill + len;
+		return;
+	}
+
+	READ_STATE4(sc);
+	while (len != 0) {
+		size_t take = sizeof sc->buf - fill;
+
+		if (take > len)
+			take = len;
+		memcpy(buf + fill, src, take);
+		src += take;
+		len -= take;
+		fill += take;
+		if (fill != sizeof sc->buf)
+			continue;
+		MI4;
+		P4;
+		fill = 0;
+	}
+	WRITE_STATE4(sc);
+	sc->ptr = fill;
+}
+
+/*
+ * Finish a 4-lane Luffa hash. After padding, three MI4/P4 passes are run
+ * (the pending block, then two all-zero blocks); output bytes 0-31 come
+ * from the state after the second pass, bytes 32-47 after the third.
+ */
+static void
+luffa4_close(sph_luffa384_context *sc, unsigned ub, unsigned n, void *dst)
+{
+	unsigned char *buf = sc->buf, *out = dst;
+	size_t ptr = sc->ptr;
+	unsigned pad_bit = 0x80 >> n;
+	int pass;
+	DECL_STATE4
+
+	/* Keep the top n bits of ub, set the next bit, zero-fill the rest. */
+	buf[ptr ++] = ((ub & -pad_bit) | pad_bit) & 0xFF;
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	READ_STATE4(sc);
+	for (pass = 0; pass < 3; pass ++) {
+		MI4;
+		P4;
+		if (pass == 0) {
+			/* The remaining passes process an all-zero block. */
+			memset(buf, 0, sizeof sc->buf);
+		} else if (pass == 1) {
+			sph_enc32be(out +  0, V00 ^ V10 ^ V20 ^ V30);
+			sph_enc32be(out +  4, V01 ^ V11 ^ V21 ^ V31);
+			sph_enc32be(out +  8, V02 ^ V12 ^ V22 ^ V32);
+			sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33);
+			sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34);
+			sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35);
+			sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36);
+			sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37);
+		} else {
+			/* Third pass: only four more words are emitted. */
+			sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30);
+			sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31);
+			sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32);
+			sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33);
+		}
+	}
+}
+
+/*
+ * Absorb len bytes of data into a 5-lane Luffa context, running the
+ * MI5/P5 step each time the context buffer becomes full.
+ */
+static void
+luffa5(sph_luffa512_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *buf = sc->buf;
+	size_t ptr = sc->ptr;
+	DECL_STATE5
+
+	/* Fast path: the input does not fill the buffer, so just store it. */
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, src, len);
+		sc->ptr = ptr + len;
+		return;
+	}
+	READ_STATE5(sc);
+	while (len != 0) {
+		size_t room = (sizeof sc->buf) - ptr;
+		size_t chunk = (len < room) ? len : room;
+
+		memcpy(buf + ptr, src, chunk);
+		src += chunk;
+		len -= chunk;
+		ptr += chunk;
+		if (ptr != sizeof sc->buf)
+			continue;
+		MI5;
+		P5;
+		ptr = 0;
+	}
+	WRITE_STATE5(sc);
+	sc->ptr = ptr;
+}
+
+/*
+ * Finish a 5-lane Luffa hash. After padding, three MI5/P5 passes are run
+ * (the pending block, then two all-zero blocks); output bytes 0-31 come
+ * from the state after the second pass, bytes 32-63 after the third.
+ */
+static void
+luffa5_close(sph_luffa512_context *sc, unsigned ub, unsigned n, void *dst)
+{
+	unsigned char *buf = sc->buf, *out = dst;
+	size_t ptr = sc->ptr;
+	unsigned pad_bit = 0x80 >> n;
+	int pass;
+	DECL_STATE5
+
+	/* Keep the top n bits of ub, set the next bit, zero-fill the rest. */
+	buf[ptr ++] = ((ub & -pad_bit) | pad_bit) & 0xFF;
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	READ_STATE5(sc);
+	for (pass = 0; pass < 3; pass ++) {
+		MI5;
+		P5;
+		if (pass == 0) {
+			/* The remaining passes process an all-zero block. */
+			memset(buf, 0, sizeof sc->buf);
+		} else if (pass == 1) {
+			sph_enc32be(out +  0, V00 ^ V10 ^ V20 ^ V30 ^ V40);
+			sph_enc32be(out +  4, V01 ^ V11 ^ V21 ^ V31 ^ V41);
+			sph_enc32be(out +  8, V02 ^ V12 ^ V22 ^ V32 ^ V42);
+			sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33 ^ V43);
+			sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34 ^ V44);
+			sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35 ^ V45);
+			sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36 ^ V46);
+			sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37 ^ V47);
+		} else {
+			/* Third pass: the second half of the output. */
+			sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30 ^ V40);
+			sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31 ^ V41);
+			sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32 ^ V42);
+			sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33 ^ V43);
+			sph_enc32be(out + 48, V04 ^ V14 ^ V24 ^ V34 ^ V44);
+			sph_enc32be(out + 52, V05 ^ V15 ^ V25 ^ V35 ^ V45);
+			sph_enc32be(out + 56, V06 ^ V16 ^ V26 ^ V36 ^ V46);
+			sph_enc32be(out + 60, V07 ^ V17 ^ V27 ^ V37 ^ V47);
+		}
+	}
+}
+
+/* Reset cc for a new Luffa-224 computation; see sph_luffa.h. */
+void
+sph_luffa224_init(void *cc)
+{
+	sph_luffa224_context *ctx = cc;
+
+	/* Empty input buffer; state words are seeded from the start of V_INIT. */
+	ctx->ptr = 0;
+	memcpy(ctx->V, V_INIT, sizeof ctx->V);
+}
+
+/* Feed len bytes from data into the Luffa-224 context cc; see sph_luffa.h. */
+void
+sph_luffa224(void *cc, const void *data, size_t len)
+{
+	luffa3(cc, data, len);	/* sph_luffa256() forwards to the same routine */
+}
+
+/* Finalize without extra bits and write the result to dst; see sph_luffa.h. */
+void
+sph_luffa224_close(void *cc, void *dst)
+{
+	sph_luffa224_addbits_and_close(cc, 0, 0, dst);	/* ub = 0, n = 0 */
+}
+
+/* Finalize with n extra bits taken from ub, write the result to dst, then reset cc; see sph_luffa.h. */
+void
+sph_luffa224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	luffa3_close(cc, ub, n, dst, 7);	/* 7 output words: the eighth is skipped */
+	sph_luffa224_init(cc);	/* leave the context ready for a new computation */
+}
+
+/* Reset cc for a new Luffa-256 computation; see sph_luffa.h. */
+void
+sph_luffa256_init(void *cc)
+{
+	sph_luffa256_context *ctx = cc;
+
+	/* Empty input buffer; state words are seeded from the start of V_INIT. */
+	ctx->ptr = 0;
+	memcpy(ctx->V, V_INIT, sizeof ctx->V);
+}
+
+/* Feed len bytes from data into the Luffa-256 context cc; see sph_luffa.h. */
+void
+sph_luffa256(void *cc, const void *data, size_t len)
+{
+	luffa3(cc, data, len);	/* sph_luffa224() forwards to the same routine */
+}
+
+/* Finalize without extra bits and write the result to dst; see sph_luffa.h. */
+void
+sph_luffa256_close(void *cc, void *dst)
+{
+	sph_luffa256_addbits_and_close(cc, 0, 0, dst);	/* ub = 0, n = 0 */
+}
+
+/* Finalize with n extra bits taken from ub, write the result to dst, then reset cc; see sph_luffa.h. */
+void
+sph_luffa256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	luffa3_close(cc, ub, n, dst, 8);	/* 8 output words: the eighth is written too */
+	sph_luffa256_init(cc);	/* leave the context ready for a new computation */
+}
+
+/* Reset cc for a new Luffa-384 computation; see sph_luffa.h. */
+void
+sph_luffa384_init(void *cc)
+{
+	sph_luffa384_context *ctx = cc;
+
+	/* Empty input buffer; state words are seeded from the start of V_INIT. */
+	ctx->ptr = 0;
+	memcpy(ctx->V, V_INIT, sizeof ctx->V);
+}
+
+/* Feed len bytes from data into the Luffa-384 context cc; see sph_luffa.h. */
+void
+sph_luffa384(void *cc, const void *data, size_t len)
+{
+	luffa4(cc, data, len);	/* cc is converted to sph_luffa384_context * by the call */
+}
+
+/* Finalize without extra bits and write the result to dst; see sph_luffa.h. */
+void
+sph_luffa384_close(void *cc, void *dst)
+{
+	sph_luffa384_addbits_and_close(cc, 0, 0, dst);	/* ub = 0, n = 0 */
+}
+
+/* Finalize with n extra bits taken from ub, write 48 bytes to dst, then reset cc; see sph_luffa.h. */
+void
+sph_luffa384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	luffa4_close(cc, ub, n, dst);	/* writes output offsets 0..47 */
+	sph_luffa384_init(cc);	/* leave the context ready for a new computation */
+}
+
+/* Reset cc for a new Luffa-512 computation; see sph_luffa.h. */
+void
+sph_luffa512_init(void *cc)
+{
+	sph_luffa512_context *ctx = cc;
+
+	/* Empty input buffer; state words are seeded from the start of V_INIT. */
+	ctx->ptr = 0;
+	memcpy(ctx->V, V_INIT, sizeof ctx->V);
+}
+
+/* Feed len bytes from data into the Luffa-512 context cc; see sph_luffa.h. */
+void
+sph_luffa512(void *cc, const void *data, size_t len)
+{
+	luffa5(cc, data, len);	/* cc is converted to sph_luffa512_context * by the call */
+}
+
+/* Finalize without extra bits and write the result to dst; see sph_luffa.h. */
+void
+sph_luffa512_close(void *cc, void *dst)
+{
+	sph_luffa512_addbits_and_close(cc, 0, 0, dst);	/* ub = 0, n = 0 */
+}
+
+/* Finalize with n extra bits taken from ub, write 64 bytes to dst, then reset cc; see sph_luffa.h. */
+void
+sph_luffa512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	luffa5_close(cc, ub, n, dst);	/* writes output offsets 0..63 */
+	sph_luffa512_init(cc);	/* leave the context ready for a new computation */
+}
diff --git a/sph/shavite.c b/sph/shavite.c
new file mode 100644
index 00000000..b465e35d
--- /dev/null
+++ b/sph/shavite.c
@@ -0,0 +1,1756 @@
+/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * SHAvite-3 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_shavite.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE	/* follow the global setting unless already set for SHAvite */
+#define SPH_SMALL_FOOTPRINT_SHAVITE   1
+#endif
+
+#ifdef _MSC_VER	/* MSVC only: silence warning C4146 (unary minus applied to an unsigned type) */
+#pragma warning (disable: 4146)
+#endif
+
+#define C32   SPH_C32	/* shorthand used by the IV tables below */
+
+/*
+ * As of round 2 of the SHA-3 competition, the published reference
+ * implementation and test vectors are wrong, because they use
+ * big-endian AES tables while the internal decoding uses little-endian.
+ * The code below follows the specification. To turn it into code
+ * that follows the reference implementation (the one called "BugFix"
+ * on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
+ * the code below (from the '#define AES_BIG_ENDIAN...' to the definition
+ * of the AES_ROUND_NOKEY macro) and replace it with the version which
+ * is commented out afterwards.
+ */
+
+#define AES_BIG_ENDIAN   0
+#include "aes_helper.c"
+
+static const sph_u32 IV224[] = {	/* 8 words; presumably the initial h[] for the 224-bit variant (use is outside this chunk) */
+	C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371),
+	C32(0x62B2AEA8), C32(0x4B5801D8), C32(0x1B702860), C32(0x842F3017)
+};
+
+static const sph_u32 IV256[] = {	/* 8 words; presumably the initial h[] for the 256-bit variant (use is outside this chunk) */
+	C32(0x49BB3E47), C32(0x2674860D), C32(0xA8B392AC), C32(0x021AC4E6),
+	C32(0x409283CF), C32(0x620E5D86), C32(0x6D929DCB), C32(0x96CC2A8B)
+};
+
+static const sph_u32 IV384[] = {	/* 16 words; presumably the initial h[] for the 384-bit variant (use is outside this chunk) */
+	C32(0x83DF1545), C32(0xF9AAEC13), C32(0xF4803CB0), C32(0x11FE1F47),
+	C32(0xDA6CD269), C32(0x4F53FCD7), C32(0x950529A2), C32(0x97908147),
+	C32(0xB0A4D7AF), C32(0x2B9132BF), C32(0x226E607D), C32(0x3C0F8D7C),
+	C32(0x487B3F0F), C32(0x04363E22), C32(0x0155C99C), C32(0xEC2E20D3)
+};
+
+static const sph_u32 IV512[] = {	/* 16 words; presumably the initial h[] for the 512-bit variant (use is outside this chunk) */
+	C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
+	C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
+	C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
+	C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
+};
+
+#define AES_ROUND_NOKEY(x0, x1, x2, x3)   do { /* in-place keyless AES round on four 32-bit words */ \
+		sph_u32 t0 = (x0); /* copy the inputs first: the results are stored back into x0..x3 */ \
+		sph_u32 t1 = (x1); \
+		sph_u32 t2 = (x2); \
+		sph_u32 t3 = (x3); \
+		AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); /* little-endian tables from aes_helper.c */ \
+	} while (0)
+
+/*
+ * This is the code needed to match the "reference implementation" as
+ * published on Nov 23rd, 2009, instead of the published specification.
+ * 
+
+#define AES_BIG_ENDIAN   1
+#include "aes_helper.c"
+
+static const sph_u32 IV224[] = {
+	C32(0xC4C67795), C32(0xC0B1817F), C32(0xEAD88924), C32(0x1ABB1BB0),
+	C32(0xE0C29152), C32(0xBDE046BA), C32(0xAEEECF99), C32(0x58D509D8)
+};
+
+static const sph_u32 IV256[] = {
+	C32(0x3EECF551), C32(0xBF10819B), C32(0xE6DC8559), C32(0xF3E23FD5),
+	C32(0x431AEC73), C32(0x79E3F731), C32(0x98325F05), C32(0xA92A31F1)
+};
+
+static const sph_u32 IV384[] = {
+	C32(0x71F48510), C32(0xA903A8AC), C32(0xFE3216DD), C32(0x0B2D2AD4),
+	C32(0x6672900A), C32(0x41032819), C32(0x15A7D780), C32(0xB3CAB8D9),
+	C32(0x34EF4711), C32(0xDE019FE8), C32(0x4D674DC4), C32(0xE056D96B),
+	C32(0xA35C016B), C32(0xDD903BA7), C32(0x8C1B09B4), C32(0x2C3E9F25)
+};
+
+static const sph_u32 IV512[] = {
+	C32(0xD5652B63), C32(0x25F1E6EA), C32(0xB18F48FA), C32(0xA1EE3A47),
+	C32(0xC8B67B07), C32(0xBDCE48D3), C32(0xE3937B78), C32(0x05DB5186),
+	C32(0x613BE326), C32(0xA11FA303), C32(0x90C833D4), C32(0x79CEE316),
+	C32(0x1E1AF00F), C32(0x2829B165), C32(0x23B25F80), C32(0x21E11499)
+};
+
+#define AES_ROUND_NOKEY(x0, x1, x2, x3)   do { \
+		sph_u32 t0 = (x0); \
+		sph_u32 t1 = (x1); \
+		sph_u32 t2 = (x2); \
+		sph_u32 t3 = (x3); \
+		AES_ROUND_NOKEY_BE(t0, t1, t2, t3, x0, x1, x2, x3); \
+	} while (0)
+
+ */
+
+#define KEY_EXPAND_ELT(k0, k1, k2, k3)   do { /* (k0,k1,k2,k3) <- AES_ROUND_NOKEY of the rotated tuple (k1,k2,k3,k0) */ \
+		sph_u32 kt; \
+		AES_ROUND_NOKEY(k1, k2, k3, k0); /* results land in k1, k2, k3, k0 */ \
+		kt = (k0); /* rotate back so the first result word ends up in k0 */ \
+		(k0) = (k1); \
+		(k1) = (k2); \
+		(k2) = (k3); \
+		(k3) = kt; \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_SHAVITE
+
+/* Compress one 64-byte block into sc->h; looped variant that first fills a full rk[] table.
+ * This function assumes that "msg" is aligned for 32-bit access.
+ */
+static void
+c256(sph_shavite_small_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 rk[144];	/* 16 message words + 4 * 32 derived words */
+	size_t u;
+	int r, s;
+
+#if SPH_LITTLE_ENDIAN
+	memcpy(rk, msg, 64);	/* words are little-endian in msg, so a raw copy is enough here */
+#else
+	for (u = 0; u < 16; u += 4) {
+		rk[u + 0] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  0);
+		rk[u + 1] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  4);
+		rk[u + 2] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  8);
+		rk[u + 3] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) + 12);
+	}
+#endif
+	u = 16;	/* next rk[] index to fill */
+	for (r = 0; r < 4; r ++) {	/* each pass adds 16 AES-derived words, then 16 XOR-derived words */
+		for (s = 0; s < 2; s ++) {
+			sph_u32 x0, x1, x2, x3;
+
+			x0 = rk[u - 15];	/* input: rk[u-16..u-13] rotated by one word */
+			x1 = rk[u - 14];
+			x2 = rk[u - 13];
+			x3 = rk[u - 16];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 16) {	/* sc->count0/count1 are XORed in at four fixed positions (16, 56, 84, 124) */
+				rk[ 16] ^= sc->count0;
+				rk[ 17] ^= SPH_T32(~sc->count1);
+			} else if (u == 56) {
+				rk[ 57] ^= sc->count1;
+				rk[ 58] ^= SPH_T32(~sc->count0);
+			}
+			u += 4;
+
+			x0 = rk[u - 15];	/* second group of this iteration, same construction */
+			x1 = rk[u - 14];
+			x2 = rk[u - 13];
+			x3 = rk[u - 16];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 84) {
+				rk[ 86] ^= sc->count1;
+				rk[ 87] ^= SPH_T32(~sc->count0);
+			} else if (u == 124) {
+				rk[124] ^= sc->count0;
+				rk[127] ^= SPH_T32(~sc->count1);
+			}
+			u += 4;
+		}
+		for (s = 0; s < 4; s ++) {	/* 16 words built by XOR of earlier words only */
+			rk[u + 0] = rk[u - 16] ^ rk[u - 3];
+			rk[u + 1] = rk[u - 15] ^ rk[u - 2];
+			rk[u + 2] = rk[u - 14] ^ rk[u - 1];
+			rk[u + 3] = rk[u - 13] ^ rk[u - 0];	/* rk[u - 0] is the word stored three lines above */
+			u += 4;
+		}
+	}
+
+	p0 = sc->h[0x0];	/* working copy of the eight state words */
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	u = 0;	/* rk[] is now consumed in order, 24 words per iteration */
+	for (r = 0; r < 6; r ++) {
+		sph_u32 x0, x1, x2, x3;
+
+		x0 = p4 ^ rk[u ++];	/* first half: p4..p7 through three keyed AES rounds, XORed into p0..p3 */
+		x1 = p5 ^ rk[u ++];
+		x2 = p6 ^ rk[u ++];
+		x3 = p7 ^ rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		x0 ^= rk[u ++];
+		x1 ^= rk[u ++];
+		x2 ^= rk[u ++];
+		x3 ^= rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		x0 ^= rk[u ++];
+		x1 ^= rk[u ++];
+		x2 ^= rk[u ++];
+		x3 ^= rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p0 ^= x0;
+		p1 ^= x1;
+		p2 ^= x2;
+		p3 ^= x3;
+
+		x0 = p0 ^ rk[u ++];	/* second half: the updated p0..p3 feed back into p4..p7 */
+		x1 = p1 ^ rk[u ++];
+		x2 = p2 ^ rk[u ++];
+		x3 = p3 ^ rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		x0 ^= rk[u ++];
+		x1 ^= rk[u ++];
+		x2 ^= rk[u ++];
+		x3 ^= rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		x0 ^= rk[u ++];
+		x1 ^= rk[u ++];
+		x2 ^= rk[u ++];
+		x3 ^= rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p4 ^= x0;
+		p5 ^= x1;
+		p6 ^= x2;
+		p7 ^= x3;
+	}
+	sc->h[0x0] ^= p0;	/* XOR the result into the stored state */
+	sc->h[0x1] ^= p1;
+	sc->h[0x2] ^= p2;
+	sc->h[0x3] ^= p3;
+	sc->h[0x4] ^= p4;
+	sc->h[0x5] ^= p5;
+	sc->h[0x6] ^= p6;
+	sc->h[0x7] ^= p7;
+}
+
+#else
+
+/* Compress one 64-byte block into sc->h; unrolled variant, the 16 live round-key words stay in rk0..rkF.
+ * Round notes: "F(a) -> b" means a, run through three keyed AES_ROUND_NOKEY steps, is XORed into b.
+ * This function assumes that "msg" is aligned for 32-bit access. */
+static void
+c256(sph_shavite_small_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 x0, x1, x2, x3;
+	sph_u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7;
+	sph_u32 rk8, rk9, rkA, rkB, rkC, rkD, rkE, rkF;
+
+	p0 = sc->h[0x0];	/* working copy of the eight state words */
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	/* round 0: keys rk0..rkB are message words 0..11, decoded little-endian; F(p4..p7) -> p0..p3 */
+	rk0 = sph_dec32le_aligned((const unsigned char *)msg +  0);
+	x0 = p4 ^ rk0;
+	rk1 = sph_dec32le_aligned((const unsigned char *)msg +  4);
+	x1 = p5 ^ rk1;
+	rk2 = sph_dec32le_aligned((const unsigned char *)msg +  8);
+	x2 = p6 ^ rk2;
+	rk3 = sph_dec32le_aligned((const unsigned char *)msg + 12);
+	x3 = p7 ^ rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk4 = sph_dec32le_aligned((const unsigned char *)msg + 16);
+	x0 ^= rk4;
+	rk5 = sph_dec32le_aligned((const unsigned char *)msg + 20);
+	x1 ^= rk5;
+	rk6 = sph_dec32le_aligned((const unsigned char *)msg + 24);
+	x2 ^= rk6;
+	rk7 = sph_dec32le_aligned((const unsigned char *)msg + 28);
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk8 = sph_dec32le_aligned((const unsigned char *)msg + 32);
+	x0 ^= rk8;
+	rk9 = sph_dec32le_aligned((const unsigned char *)msg + 36);
+	x1 ^= rk9;
+	rkA = sph_dec32le_aligned((const unsigned char *)msg + 40);
+	x2 ^= rkA;
+	rkB = sph_dec32le_aligned((const unsigned char *)msg + 44);
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 1: message words 12..15 (rkC..rkF), then AES-expanded rk0..rk7; F(p0..p3) -> p4..p7 */
+	rkC = sph_dec32le_aligned((const unsigned char *)msg + 48);
+	x0 = p0 ^ rkC;
+	rkD = sph_dec32le_aligned((const unsigned char *)msg + 52);
+	x1 = p1 ^ rkD;
+	rkE = sph_dec32le_aligned((const unsigned char *)msg + 56);
+	x2 = p2 ^ rkE;
+	rkF = sph_dec32le_aligned((const unsigned char *)msg + 60);
+	x3 = p3 ^ rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk0, rk1, rk2, rk3);
+	rk0 ^= rkC ^ sc->count0;	/* first counter injection: count0 and ~count1 */
+	rk1 ^= rkD ^ SPH_T32(~sc->count1);
+	rk2 ^= rkE;
+	rk3 ^= rkF;
+	x0 ^= rk0;
+	x1 ^= rk1;
+	x2 ^= rk2;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk4, rk5, rk6, rk7);
+	rk4 ^= rk0;
+	rk5 ^= rk1;
+	rk6 ^= rk2;
+	rk7 ^= rk3;
+	x0 ^= rk4;
+	x1 ^= rk5;
+	x2 ^= rk6;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 2: AES-expand rk8..rkF, then XOR-update rk0..rk3; F(p4..p7) -> p0..p3 */
+	KEY_EXPAND_ELT(rk8, rk9, rkA, rkB);
+	rk8 ^= rk4;
+	rk9 ^= rk5;
+	rkA ^= rk6;
+	rkB ^= rk7;
+	x0 = p4 ^ rk8;
+	x1 = p5 ^ rk9;
+	x2 = p6 ^ rkA;
+	x3 = p7 ^ rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rkC, rkD, rkE, rkF);
+	rkC ^= rk8;
+	rkD ^= rk9;
+	rkE ^= rkA;
+	rkF ^= rkB;
+	x0 ^= rkC;
+	x1 ^= rkD;
+	x2 ^= rkE;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk0 ^= rkD;
+	x0 ^= rk0;
+	rk1 ^= rkE;
+	x1 ^= rk1;
+	rk2 ^= rkF;
+	x2 ^= rk2;
+	rk3 ^= rk0;	/* uses the rk0 value updated just above */
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 3: XOR-update rk4..rkF; F(p0..p3) -> p4..p7 */
+	rk4 ^= rk1;
+	x0 = p0 ^ rk4;
+	rk5 ^= rk2;
+	x1 = p1 ^ rk5;
+	rk6 ^= rk3;
+	x2 = p2 ^ rk6;
+	rk7 ^= rk4;
+	x3 = p3 ^ rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk8 ^= rk5;
+	x0 ^= rk8;
+	rk9 ^= rk6;
+	x1 ^= rk9;
+	rkA ^= rk7;
+	x2 ^= rkA;
+	rkB ^= rk8;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rkC ^= rk9;
+	x0 ^= rkC;
+	rkD ^= rkA;
+	x1 ^= rkD;
+	rkE ^= rkB;
+	x2 ^= rkE;
+	rkF ^= rkC;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 4: AES-expand rk0..rkB; F(p4..p7) -> p0..p3 */
+	KEY_EXPAND_ELT(rk0, rk1, rk2, rk3);
+	rk0 ^= rkC;
+	rk1 ^= rkD;
+	rk2 ^= rkE;
+	rk3 ^= rkF;
+	x0 = p4 ^ rk0;
+	x1 = p5 ^ rk1;
+	x2 = p6 ^ rk2;
+	x3 = p7 ^ rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk4, rk5, rk6, rk7);
+	rk4 ^= rk0;
+	rk5 ^= rk1;
+	rk6 ^= rk2;
+	rk7 ^= rk3;
+	x0 ^= rk4;
+	x1 ^= rk5;
+	x2 ^= rk6;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk8, rk9, rkA, rkB);
+	rk8 ^= rk4;
+	rk9 ^= rk5 ^ sc->count1;	/* second counter injection: count1 and ~count0 */
+	rkA ^= rk6 ^ SPH_T32(~sc->count0);
+	rkB ^= rk7;
+	x0 ^= rk8;
+	x1 ^= rk9;
+	x2 ^= rkA;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 5: AES-expand rkC..rkF, then XOR-update rk0..rk7; F(p0..p3) -> p4..p7 */
+	KEY_EXPAND_ELT(rkC, rkD, rkE, rkF);
+	rkC ^= rk8;
+	rkD ^= rk9;
+	rkE ^= rkA;
+	rkF ^= rkB;
+	x0 = p0 ^ rkC;
+	x1 = p1 ^ rkD;
+	x2 = p2 ^ rkE;
+	x3 = p3 ^ rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk0 ^= rkD;
+	x0 ^= rk0;
+	rk1 ^= rkE;
+	x1 ^= rk1;
+	rk2 ^= rkF;
+	x2 ^= rk2;
+	rk3 ^= rk0;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk4 ^= rk1;
+	x0 ^= rk4;
+	rk5 ^= rk2;
+	x1 ^= rk5;
+	rk6 ^= rk3;
+	x2 ^= rk6;
+	rk7 ^= rk4;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 6: XOR-update rk8..rkF, then AES-expand rk0..rk3; F(p4..p7) -> p0..p3 */
+	rk8 ^= rk5;
+	x0 = p4 ^ rk8;
+	rk9 ^= rk6;
+	x1 = p5 ^ rk9;
+	rkA ^= rk7;
+	x2 = p6 ^ rkA;
+	rkB ^= rk8;
+	x3 = p7 ^ rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rkC ^= rk9;
+	x0 ^= rkC;
+	rkD ^= rkA;
+	x1 ^= rkD;
+	rkE ^= rkB;
+	x2 ^= rkE;
+	rkF ^= rkC;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk0, rk1, rk2, rk3);
+	rk0 ^= rkC;
+	rk1 ^= rkD;
+	rk2 ^= rkE;
+	rk3 ^= rkF;
+	x0 ^= rk0;
+	x1 ^= rk1;
+	x2 ^= rk2;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 7: AES-expand rk4..rkF; F(p0..p3) -> p4..p7 */
+	KEY_EXPAND_ELT(rk4, rk5, rk6, rk7);
+	rk4 ^= rk0;
+	rk5 ^= rk1;
+	rk6 ^= rk2 ^ sc->count1;	/* third counter injection: count1 and ~count0 */
+	rk7 ^= rk3 ^ SPH_T32(~sc->count0);
+	x0 = p0 ^ rk4;
+	x1 = p1 ^ rk5;
+	x2 = p2 ^ rk6;
+	x3 = p3 ^ rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk8, rk9, rkA, rkB);
+	rk8 ^= rk4;
+	rk9 ^= rk5;
+	rkA ^= rk6;
+	rkB ^= rk7;
+	x0 ^= rk8;
+	x1 ^= rk9;
+	x2 ^= rkA;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rkC, rkD, rkE, rkF);
+	rkC ^= rk8;
+	rkD ^= rk9;
+	rkE ^= rkA;
+	rkF ^= rkB;
+	x0 ^= rkC;
+	x1 ^= rkD;
+	x2 ^= rkE;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 8: XOR-update rk0..rkB; F(p4..p7) -> p0..p3 */
+	rk0 ^= rkD;
+	x0 = p4 ^ rk0;
+	rk1 ^= rkE;
+	x1 = p5 ^ rk1;
+	rk2 ^= rkF;
+	x2 = p6 ^ rk2;
+	rk3 ^= rk0;
+	x3 = p7 ^ rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk4 ^= rk1;
+	x0 ^= rk4;
+	rk5 ^= rk2;
+	x1 ^= rk5;
+	rk6 ^= rk3;
+	x2 ^= rk6;
+	rk7 ^= rk4;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk8 ^= rk5;
+	x0 ^= rk8;
+	rk9 ^= rk6;
+	x1 ^= rk9;
+	rkA ^= rk7;
+	x2 ^= rkA;
+	rkB ^= rk8;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 9: XOR-update rkC..rkF, then AES-expand rk0..rk7; F(p0..p3) -> p4..p7 */
+	rkC ^= rk9;
+	x0 = p0 ^ rkC;
+	rkD ^= rkA;
+	x1 = p1 ^ rkD;
+	rkE ^= rkB;
+	x2 = p2 ^ rkE;
+	rkF ^= rkC;
+	x3 = p3 ^ rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk0, rk1, rk2, rk3);
+	rk0 ^= rkC;
+	rk1 ^= rkD;
+	rk2 ^= rkE;
+	rk3 ^= rkF;
+	x0 ^= rk0;
+	x1 ^= rk1;
+	x2 ^= rk2;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk4, rk5, rk6, rk7);
+	rk4 ^= rk0;
+	rk5 ^= rk1;
+	rk6 ^= rk2;
+	rk7 ^= rk3;
+	x0 ^= rk4;
+	x1 ^= rk5;
+	x2 ^= rk6;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 10: AES-expand rk8..rkF, then XOR-update rk0..rk3; F(p4..p7) -> p0..p3 */
+	KEY_EXPAND_ELT(rk8, rk9, rkA, rkB);
+	rk8 ^= rk4;
+	rk9 ^= rk5;
+	rkA ^= rk6;
+	rkB ^= rk7;
+	x0 = p4 ^ rk8;
+	x1 = p5 ^ rk9;
+	x2 = p6 ^ rkA;
+	x3 = p7 ^ rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rkC, rkD, rkE, rkF);
+	rkC ^= rk8 ^ sc->count0;	/* fourth counter injection: count0 in rkC, ~count1 in rkF */
+	rkD ^= rk9;
+	rkE ^= rkA;
+	rkF ^= rkB ^ SPH_T32(~sc->count1);
+	x0 ^= rkC;
+	x1 ^= rkD;
+	x2 ^= rkE;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk0 ^= rkD;
+	x0 ^= rk0;
+	rk1 ^= rkE;
+	x1 ^= rk1;
+	rk2 ^= rkF;
+	x2 ^= rk2;
+	rk3 ^= rk0;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 11: XOR-update rk4..rkF; F(p0..p3) -> p4..p7 */
+	rk4 ^= rk1;
+	x0 = p0 ^ rk4;
+	rk5 ^= rk2;
+	x1 = p1 ^ rk5;
+	rk6 ^= rk3;
+	x2 = p2 ^ rk6;
+	rk7 ^= rk4;
+	x3 = p3 ^ rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk8 ^= rk5;
+	x0 ^= rk8;
+	rk9 ^= rk6;
+	x1 ^= rk9;
+	rkA ^= rk7;
+	x2 ^= rkA;
+	rkB ^= rk8;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rkC ^= rk9;
+	x0 ^= rkC;
+	rkD ^= rkA;
+	x1 ^= rkD;
+	rkE ^= rkB;
+	x2 ^= rkE;
+	rkF ^= rkC;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	sc->h[0x0] ^= p0;	/* XOR the result into the stored state */
+	sc->h[0x1] ^= p1;
+	sc->h[0x2] ^= p2;
+	sc->h[0x3] ^= p3;
+	sc->h[0x4] ^= p4;
+	sc->h[0x5] ^= p5;
+	sc->h[0x6] ^= p6;
+	sc->h[0x7] ^= p7;
+}
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SHAVITE
+
+/* Compress one 128-byte block into sc->h; looped variant that first fills a full rk[] table.
+ * This function assumes that "msg" is aligned for 32-bit access.
+ */
+static void
+c512(sph_shavite_big_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
+	sph_u32 rk[448];	/* 32 message words + derived words; 14 rounds * 32 words are consumed below */
+	size_t u;
+	int r, s;
+
+#if SPH_LITTLE_ENDIAN
+	memcpy(rk, msg, 128);	/* words are little-endian in msg, so a raw copy is enough here */
+#else
+	for (u = 0; u < 32; u += 4) {
+		rk[u + 0] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  0);
+		rk[u + 1] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  4);
+		rk[u + 2] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  8);
+		rk[u + 3] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) + 12);
+	}
+#endif
+	u = 32;	/* next rk[] index to fill */
+	for (;;) {	/* alternate 32 AES-derived words and 32 XOR-derived words until rk[] is full */
+		for (s = 0; s < 4; s ++) {
+			sph_u32 x0, x1, x2, x3;
+
+			x0 = rk[u - 31];	/* input: rk[u-32..u-29] rotated by one word */
+			x1 = rk[u - 30];
+			x2 = rk[u - 29];
+			x3 = rk[u - 32];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 32) {	/* sc->count0..count3 are XORed in at four fixed positions (32, 164, 316, 440) */
+				rk[ 32] ^= sc->count0;
+				rk[ 33] ^= sc->count1;
+				rk[ 34] ^= sc->count2;
+				rk[ 35] ^= SPH_T32(~sc->count3);
+			} else if (u == 440) {
+				rk[440] ^= sc->count1;
+				rk[441] ^= sc->count0;
+				rk[442] ^= sc->count3;
+				rk[443] ^= SPH_T32(~sc->count2);
+			}
+			u += 4;
+
+			x0 = rk[u - 31];	/* second group of this iteration, same construction */
+			x1 = rk[u - 30];
+			x2 = rk[u - 29];
+			x3 = rk[u - 32];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 164) {
+				rk[164] ^= sc->count3;
+				rk[165] ^= sc->count2;
+				rk[166] ^= sc->count1;
+				rk[167] ^= SPH_T32(~sc->count0);
+			} else if (u == 316) {
+				rk[316] ^= sc->count2;
+				rk[317] ^= sc->count3;
+				rk[318] ^= sc->count0;
+				rk[319] ^= SPH_T32(~sc->count1);
+			}
+			u += 4;
+		}
+		if (u == 448)	/* table full: the last AES-derived group is not followed by an XOR group */
+			break;
+		for (s = 0; s < 8; s ++) {	/* 32 words built by XOR of earlier words only */
+			rk[u + 0] = rk[u - 32] ^ rk[u - 7];
+			rk[u + 1] = rk[u - 31] ^ rk[u - 6];
+			rk[u + 2] = rk[u - 30] ^ rk[u - 5];
+			rk[u + 3] = rk[u - 29] ^ rk[u - 4];
+			u += 4;
+		}
+	}
+
+	p0 = sc->h[0x0];	/* working copy of the sixteen state words */
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	p8 = sc->h[0x8];
+	p9 = sc->h[0x9];
+	pA = sc->h[0xA];
+	pB = sc->h[0xB];
+	pC = sc->h[0xC];
+	pD = sc->h[0xD];
+	pE = sc->h[0xE];
+	pF = sc->h[0xF];
+	u = 0;	/* rk[] is now consumed in order, 32 words per round */
+	for (r = 0; r < 14; r ++) {
+#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3)   do { /* l ^= (r run through four keyed AES rounds, keys rk[u..u+15]) */ \
+		sph_u32 x0, x1, x2, x3; \
+		x0 = r0 ^ rk[u ++]; \
+		x1 = r1 ^ rk[u ++]; \
+		x2 = r2 ^ rk[u ++]; \
+		x3 = r3 ^ rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		l0 ^= x0; \
+		l1 ^= x1; \
+		l2 ^= x2; \
+		l3 ^= x3; \
+	} while (0)
+
+#define WROT(a, b, c, d)   do { /* rotate four words: (a, b, c, d) <- (d, a, b, c) */ \
+		sph_u32 t = d; \
+		d = c; \
+		c = b; \
+		b = a; \
+		a = t; \
+	} while (0)
+
+		C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);	/* p4..p7 update p0..p3 */
+		C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);	/* pC..pF update p8..pB */
+
+		WROT(p0, p4, p8, pC);	/* rotate the four 4-word groups by one group position */
+		WROT(p1, p5, p9, pD);
+		WROT(p2, p6, pA, pE);
+		WROT(p3, p7, pB, pF);
+
+#undef C512_ELT
+#undef WROT
+	}
+	sc->h[0x0] ^= p0;	/* XOR the result into the stored state */
+	sc->h[0x1] ^= p1;
+	sc->h[0x2] ^= p2;
+	sc->h[0x3] ^= p3;
+	sc->h[0x4] ^= p4;
+	sc->h[0x5] ^= p5;
+	sc->h[0x6] ^= p6;
+	sc->h[0x7] ^= p7;
+	sc->h[0x8] ^= p8;
+	sc->h[0x9] ^= p9;
+	sc->h[0xA] ^= pA;
+	sc->h[0xB] ^= pB;
+	sc->h[0xC] ^= pC;
+	sc->h[0xD] ^= pD;
+	sc->h[0xE] ^= pE;
+	sc->h[0xF] ^= pF;
+}
+
+#else
+
+/*
+ * This function assumes that "msg" is aligned for 32-bit access.
+ */
+static void
+c512(sph_shavite_big_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
+	sph_u32 x0, x1, x2, x3;
+	sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07;
+	sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F;
+	sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
+	sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
+	int r;
+
+	p0 = sc->h[0x0];
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	p8 = sc->h[0x8];
+	p9 = sc->h[0x9];
+	pA = sc->h[0xA];
+	pB = sc->h[0xB];
+	pC = sc->h[0xC];
+	pD = sc->h[0xD];
+	pE = sc->h[0xE];
+	pF = sc->h[0xF];
+	/* round 0 */
+	rk00 = sph_dec32le_aligned((const unsigned char *)msg +   0);
+	x0 = p4 ^ rk00;
+	rk01 = sph_dec32le_aligned((const unsigned char *)msg +   4);
+	x1 = p5 ^ rk01;
+	rk02 = sph_dec32le_aligned((const unsigned char *)msg +   8);
+	x2 = p6 ^ rk02;
+	rk03 = sph_dec32le_aligned((const unsigned char *)msg +  12);
+	x3 = p7 ^ rk03;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk04 = sph_dec32le_aligned((const unsigned char *)msg +  16);
+	x0 ^= rk04;
+	rk05 = sph_dec32le_aligned((const unsigned char *)msg +  20);
+	x1 ^= rk05;
+	rk06 = sph_dec32le_aligned((const unsigned char *)msg +  24);
+	x2 ^= rk06;
+	rk07 = sph_dec32le_aligned((const unsigned char *)msg +  28);
+	x3 ^= rk07;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk08 = sph_dec32le_aligned((const unsigned char *)msg +  32);
+	x0 ^= rk08;
+	rk09 = sph_dec32le_aligned((const unsigned char *)msg +  36);
+	x1 ^= rk09;
+	rk0A = sph_dec32le_aligned((const unsigned char *)msg +  40);
+	x2 ^= rk0A;
+	rk0B = sph_dec32le_aligned((const unsigned char *)msg +  44);
+	x3 ^= rk0B;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk0C = sph_dec32le_aligned((const unsigned char *)msg +  48);
+	x0 ^= rk0C;
+	rk0D = sph_dec32le_aligned((const unsigned char *)msg +  52);
+	x1 ^= rk0D;
+	rk0E = sph_dec32le_aligned((const unsigned char *)msg +  56);
+	x2 ^= rk0E;
+	rk0F = sph_dec32le_aligned((const unsigned char *)msg +  60);
+	x3 ^= rk0F;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	rk10 = sph_dec32le_aligned((const unsigned char *)msg +  64);
+	x0 = pC ^ rk10;
+	rk11 = sph_dec32le_aligned((const unsigned char *)msg +  68);
+	x1 = pD ^ rk11;
+	rk12 = sph_dec32le_aligned((const unsigned char *)msg +  72);
+	x2 = pE ^ rk12;
+	rk13 = sph_dec32le_aligned((const unsigned char *)msg +  76);
+	x3 = pF ^ rk13;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk14 = sph_dec32le_aligned((const unsigned char *)msg +  80);
+	x0 ^= rk14;
+	rk15 = sph_dec32le_aligned((const unsigned char *)msg +  84);
+	x1 ^= rk15;
+	rk16 = sph_dec32le_aligned((const unsigned char *)msg +  88);
+	x2 ^= rk16;
+	rk17 = sph_dec32le_aligned((const unsigned char *)msg +  92);
+	x3 ^= rk17;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk18 = sph_dec32le_aligned((const unsigned char *)msg +  96);
+	x0 ^= rk18;
+	rk19 = sph_dec32le_aligned((const unsigned char *)msg + 100);
+	x1 ^= rk19;
+	rk1A = sph_dec32le_aligned((const unsigned char *)msg + 104);
+	x2 ^= rk1A;
+	rk1B = sph_dec32le_aligned((const unsigned char *)msg + 108);
+	x3 ^= rk1B;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk1C = sph_dec32le_aligned((const unsigned char *)msg + 112);
+	x0 ^= rk1C;
+	rk1D = sph_dec32le_aligned((const unsigned char *)msg + 116);
+	x1 ^= rk1D;
+	rk1E = sph_dec32le_aligned((const unsigned char *)msg + 120);
+	x2 ^= rk1E;
+	rk1F = sph_dec32le_aligned((const unsigned char *)msg + 124);
+	x3 ^= rk1F;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p8 ^= x0;
+	p9 ^= x1;
+	pA ^= x2;
+	pB ^= x3;
+
+	for (r = 0; r < 3; r ++) {
+		/* round 1, 5, 9 */
+		KEY_EXPAND_ELT(rk00, rk01, rk02, rk03);
+		rk00 ^= rk1C;
+		rk01 ^= rk1D;
+		rk02 ^= rk1E;
+		rk03 ^= rk1F;
+		if (r == 0) {
+			rk00 ^= sc->count0;
+			rk01 ^= sc->count1;
+			rk02 ^= sc->count2;
+			rk03 ^= SPH_T32(~sc->count3);
+		}
+		x0 = p0 ^ rk00;
+		x1 = p1 ^ rk01;
+		x2 = p2 ^ rk02;
+		x3 = p3 ^ rk03;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk04, rk05, rk06, rk07);
+		rk04 ^= rk00;
+		rk05 ^= rk01;
+		rk06 ^= rk02;
+		rk07 ^= rk03;
+		if (r == 1) {
+			rk04 ^= sc->count3;
+			rk05 ^= sc->count2;
+			rk06 ^= sc->count1;
+			rk07 ^= SPH_T32(~sc->count0);
+		}
+		x0 ^= rk04;
+		x1 ^= rk05;
+		x2 ^= rk06;
+		x3 ^= rk07;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B);
+		rk08 ^= rk04;
+		rk09 ^= rk05;
+		rk0A ^= rk06;
+		rk0B ^= rk07;
+		x0 ^= rk08;
+		x1 ^= rk09;
+		x2 ^= rk0A;
+		x3 ^= rk0B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F);
+		rk0C ^= rk08;
+		rk0D ^= rk09;
+		rk0E ^= rk0A;
+		rk0F ^= rk0B;
+		x0 ^= rk0C;
+		x1 ^= rk0D;
+		x2 ^= rk0E;
+		x3 ^= rk0F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		pC ^= x0;
+		pD ^= x1;
+		pE ^= x2;
+		pF ^= x3;
+		KEY_EXPAND_ELT(rk10, rk11, rk12, rk13);
+		rk10 ^= rk0C;
+		rk11 ^= rk0D;
+		rk12 ^= rk0E;
+		rk13 ^= rk0F;
+		x0 = p8 ^ rk10;
+		x1 = p9 ^ rk11;
+		x2 = pA ^ rk12;
+		x3 = pB ^ rk13;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk14, rk15, rk16, rk17);
+		rk14 ^= rk10;
+		rk15 ^= rk11;
+		rk16 ^= rk12;
+		rk17 ^= rk13;
+		x0 ^= rk14;
+		x1 ^= rk15;
+		x2 ^= rk16;
+		x3 ^= rk17;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B);
+		rk18 ^= rk14;
+		rk19 ^= rk15;
+		rk1A ^= rk16;
+		rk1B ^= rk17;
+		x0 ^= rk18;
+		x1 ^= rk19;
+		x2 ^= rk1A;
+		x3 ^= rk1B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F);
+		rk1C ^= rk18;
+		rk1D ^= rk19;
+		rk1E ^= rk1A;
+		rk1F ^= rk1B;
+		if (r == 2) {
+			rk1C ^= sc->count2;
+			rk1D ^= sc->count3;
+			rk1E ^= sc->count0;
+			rk1F ^= SPH_T32(~sc->count1);
+		}
+		x0 ^= rk1C;
+		x1 ^= rk1D;
+		x2 ^= rk1E;
+		x3 ^= rk1F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p4 ^= x0;
+		p5 ^= x1;
+		p6 ^= x2;
+		p7 ^= x3;
+		/* round 2, 6, 10 */
+		rk00 ^= rk19;
+		x0 = pC ^ rk00;
+		rk01 ^= rk1A;
+		x1 = pD ^ rk01;
+		rk02 ^= rk1B;
+		x2 = pE ^ rk02;
+		rk03 ^= rk1C;
+		x3 = pF ^ rk03;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk04 ^= rk1D;
+		x0 ^= rk04;
+		rk05 ^= rk1E;
+		x1 ^= rk05;
+		rk06 ^= rk1F;
+		x2 ^= rk06;
+		rk07 ^= rk00;
+		x3 ^= rk07;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk08 ^= rk01;
+		x0 ^= rk08;
+		rk09 ^= rk02;
+		x1 ^= rk09;
+		rk0A ^= rk03;
+		x2 ^= rk0A;
+		rk0B ^= rk04;
+		x3 ^= rk0B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk0C ^= rk05;
+		x0 ^= rk0C;
+		rk0D ^= rk06;
+		x1 ^= rk0D;
+		rk0E ^= rk07;
+		x2 ^= rk0E;
+		rk0F ^= rk08;
+		x3 ^= rk0F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p8 ^= x0;
+		p9 ^= x1;
+		pA ^= x2;
+		pB ^= x3;
+		rk10 ^= rk09;
+		x0 = p4 ^ rk10;
+		rk11 ^= rk0A;
+		x1 = p5 ^ rk11;
+		rk12 ^= rk0B;
+		x2 = p6 ^ rk12;
+		rk13 ^= rk0C;
+		x3 = p7 ^ rk13;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk14 ^= rk0D;
+		x0 ^= rk14;
+		rk15 ^= rk0E;
+		x1 ^= rk15;
+		rk16 ^= rk0F;
+		x2 ^= rk16;
+		rk17 ^= rk10;
+		x3 ^= rk17;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk18 ^= rk11;
+		x0 ^= rk18;
+		rk19 ^= rk12;
+		x1 ^= rk19;
+		rk1A ^= rk13;
+		x2 ^= rk1A;
+		rk1B ^= rk14;
+		x3 ^= rk1B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk1C ^= rk15;
+		x0 ^= rk1C;
+		rk1D ^= rk16;
+		x1 ^= rk1D;
+		rk1E ^= rk17;
+		x2 ^= rk1E;
+		rk1F ^= rk18;
+		x3 ^= rk1F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p0 ^= x0;
+		p1 ^= x1;
+		p2 ^= x2;
+		p3 ^= x3;
+		/* round 3, 7, 11 */
+		KEY_EXPAND_ELT(rk00, rk01, rk02, rk03);
+		rk00 ^= rk1C;
+		rk01 ^= rk1D;
+		rk02 ^= rk1E;
+		rk03 ^= rk1F;
+		x0 = p8 ^ rk00;
+		x1 = p9 ^ rk01;
+		x2 = pA ^ rk02;
+		x3 = pB ^ rk03;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk04, rk05, rk06, rk07);
+		rk04 ^= rk00;
+		rk05 ^= rk01;
+		rk06 ^= rk02;
+		rk07 ^= rk03;
+		x0 ^= rk04;
+		x1 ^= rk05;
+		x2 ^= rk06;
+		x3 ^= rk07;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B);
+		rk08 ^= rk04;
+		rk09 ^= rk05;
+		rk0A ^= rk06;
+		rk0B ^= rk07;
+		x0 ^= rk08;
+		x1 ^= rk09;
+		x2 ^= rk0A;
+		x3 ^= rk0B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F);
+		rk0C ^= rk08;
+		rk0D ^= rk09;
+		rk0E ^= rk0A;
+		rk0F ^= rk0B;
+		x0 ^= rk0C;
+		x1 ^= rk0D;
+		x2 ^= rk0E;
+		x3 ^= rk0F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p4 ^= x0;
+		p5 ^= x1;
+		p6 ^= x2;
+		p7 ^= x3;
+		KEY_EXPAND_ELT(rk10, rk11, rk12, rk13);
+		rk10 ^= rk0C;
+		rk11 ^= rk0D;
+		rk12 ^= rk0E;
+		rk13 ^= rk0F;
+		x0 = p0 ^ rk10;
+		x1 = p1 ^ rk11;
+		x2 = p2 ^ rk12;
+		x3 = p3 ^ rk13;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk14, rk15, rk16, rk17);
+		rk14 ^= rk10;
+		rk15 ^= rk11;
+		rk16 ^= rk12;
+		rk17 ^= rk13;
+		x0 ^= rk14;
+		x1 ^= rk15;
+		x2 ^= rk16;
+		x3 ^= rk17;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B);
+		rk18 ^= rk14;
+		rk19 ^= rk15;
+		rk1A ^= rk16;
+		rk1B ^= rk17;
+		x0 ^= rk18;
+		x1 ^= rk19;
+		x2 ^= rk1A;
+		x3 ^= rk1B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F);
+		rk1C ^= rk18;
+		rk1D ^= rk19;
+		rk1E ^= rk1A;
+		rk1F ^= rk1B;
+		x0 ^= rk1C;
+		x1 ^= rk1D;
+		x2 ^= rk1E;
+		x3 ^= rk1F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		pC ^= x0;
+		pD ^= x1;
+		pE ^= x2;
+		pF ^= x3;
+		/* round 4, 8, 12 */
+		rk00 ^= rk19;
+		x0 = p4 ^ rk00;
+		rk01 ^= rk1A;
+		x1 = p5 ^ rk01;
+		rk02 ^= rk1B;
+		x2 = p6 ^ rk02;
+		rk03 ^= rk1C;
+		x3 = p7 ^ rk03;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk04 ^= rk1D;
+		x0 ^= rk04;
+		rk05 ^= rk1E;
+		x1 ^= rk05;
+		rk06 ^= rk1F;
+		x2 ^= rk06;
+		rk07 ^= rk00;
+		x3 ^= rk07;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk08 ^= rk01;
+		x0 ^= rk08;
+		rk09 ^= rk02;
+		x1 ^= rk09;
+		rk0A ^= rk03;
+		x2 ^= rk0A;
+		rk0B ^= rk04;
+		x3 ^= rk0B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk0C ^= rk05;
+		x0 ^= rk0C;
+		rk0D ^= rk06;
+		x1 ^= rk0D;
+		rk0E ^= rk07;
+		x2 ^= rk0E;
+		rk0F ^= rk08;
+		x3 ^= rk0F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p0 ^= x0;
+		p1 ^= x1;
+		p2 ^= x2;
+		p3 ^= x3;
+		rk10 ^= rk09;
+		x0 = pC ^ rk10;
+		rk11 ^= rk0A;
+		x1 = pD ^ rk11;
+		rk12 ^= rk0B;
+		x2 = pE ^ rk12;
+		rk13 ^= rk0C;
+		x3 = pF ^ rk13;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk14 ^= rk0D;
+		x0 ^= rk14;
+		rk15 ^= rk0E;
+		x1 ^= rk15;
+		rk16 ^= rk0F;
+		x2 ^= rk16;
+		rk17 ^= rk10;
+		x3 ^= rk17;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk18 ^= rk11;
+		x0 ^= rk18;
+		rk19 ^= rk12;
+		x1 ^= rk19;
+		rk1A ^= rk13;
+		x2 ^= rk1A;
+		rk1B ^= rk14;
+		x3 ^= rk1B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk1C ^= rk15;
+		x0 ^= rk1C;
+		rk1D ^= rk16;
+		x1 ^= rk1D;
+		rk1E ^= rk17;
+		x2 ^= rk1E;
+		rk1F ^= rk18;
+		x3 ^= rk1F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p8 ^= x0;
+		p9 ^= x1;
+		pA ^= x2;
+		pB ^= x3;
+	}
+	/* round 13 */
+	KEY_EXPAND_ELT(rk00, rk01, rk02, rk03);
+	rk00 ^= rk1C;
+	rk01 ^= rk1D;
+	rk02 ^= rk1E;
+	rk03 ^= rk1F;
+	x0 = p0 ^ rk00;
+	x1 = p1 ^ rk01;
+	x2 = p2 ^ rk02;
+	x3 = p3 ^ rk03;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk04, rk05, rk06, rk07);
+	rk04 ^= rk00;
+	rk05 ^= rk01;
+	rk06 ^= rk02;
+	rk07 ^= rk03;
+	x0 ^= rk04;
+	x1 ^= rk05;
+	x2 ^= rk06;
+	x3 ^= rk07;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B);
+	rk08 ^= rk04;
+	rk09 ^= rk05;
+	rk0A ^= rk06;
+	rk0B ^= rk07;
+	x0 ^= rk08;
+	x1 ^= rk09;
+	x2 ^= rk0A;
+	x3 ^= rk0B;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F);
+	rk0C ^= rk08;
+	rk0D ^= rk09;
+	rk0E ^= rk0A;
+	rk0F ^= rk0B;
+	x0 ^= rk0C;
+	x1 ^= rk0D;
+	x2 ^= rk0E;
+	x3 ^= rk0F;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	pC ^= x0;
+	pD ^= x1;
+	pE ^= x2;
+	pF ^= x3;
+	KEY_EXPAND_ELT(rk10, rk11, rk12, rk13);
+	rk10 ^= rk0C;
+	rk11 ^= rk0D;
+	rk12 ^= rk0E;
+	rk13 ^= rk0F;
+	x0 = p8 ^ rk10;
+	x1 = p9 ^ rk11;
+	x2 = pA ^ rk12;
+	x3 = pB ^ rk13;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk14, rk15, rk16, rk17);
+	rk14 ^= rk10;
+	rk15 ^= rk11;
+	rk16 ^= rk12;
+	rk17 ^= rk13;
+	x0 ^= rk14;
+	x1 ^= rk15;
+	x2 ^= rk16;
+	x3 ^= rk17;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B);
+	rk18 ^= rk14 ^ sc->count1;
+	rk19 ^= rk15 ^ sc->count0;
+	rk1A ^= rk16 ^ sc->count3;
+	rk1B ^= rk17 ^ SPH_T32(~sc->count2);
+	x0 ^= rk18;
+	x1 ^= rk19;
+	x2 ^= rk1A;
+	x3 ^= rk1B;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F);
+	rk1C ^= rk18;
+	rk1D ^= rk19;
+	rk1E ^= rk1A;
+	rk1F ^= rk1B;
+	x0 ^= rk1C;
+	x1 ^= rk1D;
+	x2 ^= rk1E;
+	x3 ^= rk1F;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	sc->h[0x0] ^= p8;
+	sc->h[0x1] ^= p9;
+	sc->h[0x2] ^= pA;
+	sc->h[0x3] ^= pB;
+	sc->h[0x4] ^= pC;
+	sc->h[0x5] ^= pD;
+	sc->h[0x6] ^= pE;
+	sc->h[0x7] ^= pF;
+	sc->h[0x8] ^= p0;
+	sc->h[0x9] ^= p1;
+	sc->h[0xA] ^= p2;
+	sc->h[0xB] ^= p3;
+	sc->h[0xC] ^= p4;
+	sc->h[0xD] ^= p5;
+	sc->h[0xE] ^= p6;
+	sc->h[0xF] ^= p7;
+}
+
+#endif
+
+static void
+shavite_small_init(sph_shavite_small_context *sc, const sph_u32 *iv)
+{
+	/* Fresh state: nothing buffered, zero bit count, h[] loaded from iv. */
+	sc->ptr = 0;
+	sc->count0 = sc->count1 = 0;
+	memcpy(sc->h, iv, sizeof sc->h);
+}
+
+static void
+shavite_small_core(sph_shavite_small_context *sc, const void *data, size_t len)
+{
+	/* Buffer the input; whenever sc->buf fills up, count 512 more */
+	/* message bits and compress the buffer with c256(). */
+	const unsigned char *src = data;
+	size_t fill = sc->ptr;
+
+	while (len > 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (len < room) ? len : room;
+
+		memcpy(sc->buf + fill, src, take);
+		src += take;
+		len -= take;
+		fill += take;
+		if (fill != sizeof sc->buf)
+			continue;
+		/* count1 takes the carry when count0 wraps around to zero. */
+		sc->count0 = SPH_T32(sc->count0 + 512);
+		if (sc->count0 == 0)
+			sc->count1 = SPH_T32(sc->count1 + 1);
+		c256(sc, sc->buf);
+		fill = 0;
+	}
+	sc->ptr = fill;
+}
+
+static void
+shavite_small_close(sph_shavite_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	unsigned char *buf;
+	size_t ptr, u;
+	unsigned z;
+	sph_u32 count0, count1;
+	/* Pad the pending input, compress it, write out_size_w32 words to dst. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	count0 = (sc->count0 += (ptr << 3) + n);	/* add pending ptr*8 + n bits */
+	count1 = sc->count1;
+	z = 0x80 >> n;			/* bit just below the top n bits */
+	z = ((ub & -z) | z) & 0xFF;	/* top n bits of ub, then a single 1 bit */
+	if (ptr == 0 && n == 0) {	/* nothing pending: padding-only block */
+		buf[0] = 0x80;
+		memset(buf + 1, 0, 53);
+		sc->count0 = sc->count1 = 0;	/* NOTE(review): presumably so c256 sees a zero counter - confirm */
+	} else if (ptr < 54) {		/* marker fits before the trailer at 54..63 */
+		buf[ptr ++] = z;
+		memset(buf + ptr, 0, 54 - ptr);
+	} else {			/* no room for the trailer: use one more block */
+		buf[ptr ++] = z;
+		memset(buf + ptr, 0, 64 - ptr);
+		c256(sc, buf);
+		memset(buf, 0, 54);
+		sc->count0 = sc->count1 = 0;	/* length survives in the count0/count1 locals */
+	}
+	sph_enc32le(buf + 54, count0);	/* trailer: bit length, low word first */
+	sph_enc32le(buf + 58, count1);
+	buf[62] = out_size_w32 << 5;	/* low byte of out_size_w32 * 32 */
+	buf[63] = out_size_w32 >> 3;	/* next byte of out_size_w32 * 32 */
+	c256(sc, buf);
+	for (u = 0; u < out_size_w32; u ++)	/* emit h[0 .. out_size_w32 - 1], 4 bytes each */
+		sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]);
+}
+
+static void
+shavite_big_init(sph_shavite_big_context *sc, const sph_u32 *iv)
+{
+	/*
+	 * Fresh state: nothing buffered, counter words all zero, h[] from iv.
+	 */
+	sc->ptr = 0;
+	sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
+	memcpy(sc->h, iv, sizeof sc->h);
+}
+
+static void
+shavite_big_core(sph_shavite_big_context *sc, const void *data, size_t len)
+{
+	/*
+	 * Buffer the input; whenever sc->buf fills up, count 1024 more
+	 * message bits and compress the buffer with c512().
+	 */
+	const unsigned char *src = data;
+	size_t fill = sc->ptr;
+
+	while (len > 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (len < room) ? len : room;
+
+		memcpy(sc->buf + fill, src, take);
+		src += take;
+		len -= take;
+		fill += take;
+		if (fill != sizeof sc->buf)
+			continue;
+		/*
+		 * count0 is the lowest counter word; each word that wraps
+		 * around to zero passes a carry on to the next one.
+		 */
+		sc->count0 = SPH_T32(sc->count0 + 1024);
+		if (sc->count0 == 0
+			&& (sc->count1 = SPH_T32(sc->count1 + 1)) == 0
+			&& (sc->count2 = SPH_T32(sc->count2 + 1)) == 0)
+			sc->count3 = SPH_T32(sc->count3 + 1);
+		c512(sc, sc->buf);
+		fill = 0;
+	}
+
+	sc->ptr = fill;
+}
+
+static void
+shavite_big_close(sph_shavite_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	unsigned char *buf;
+	size_t ptr, u;
+	unsigned z;
+	sph_u32 count0, count1, count2, count3;
+	/* Pad the pending input, compress it, write out_size_w32 words to dst. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	count0 = (sc->count0 += (ptr << 3) + n);	/* add pending ptr*8 + n bits */
+	count1 = sc->count1;
+	count2 = sc->count2;
+	count3 = sc->count3;
+	z = 0x80 >> n;			/* bit just below the top n bits */
+	z = ((ub & -z) | z) & 0xFF;	/* top n bits of ub, then a single 1 bit */
+	if (ptr == 0 && n == 0) {	/* nothing pending: padding-only block */
+		buf[0] = 0x80;
+		memset(buf + 1, 0, 109);
+		sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;	/* c512 mixes these into its round keys */
+	} else if (ptr < 110) {		/* marker fits before the trailer at 110..127 */
+		buf[ptr ++] = z;
+		memset(buf + ptr, 0, 110 - ptr);
+	} else {			/* no room for the trailer: use one more block */
+		buf[ptr ++] = z;
+		memset(buf + ptr, 0, 128 - ptr);
+		c512(sc, buf);
+		memset(buf, 0, 110);
+		sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;	/* length survives in the locals */
+	}
+	sph_enc32le(buf + 110, count0);	/* trailer: bit length, low word first */
+	sph_enc32le(buf + 114, count1);
+	sph_enc32le(buf + 118, count2);
+	sph_enc32le(buf + 122, count3);
+	buf[126] = out_size_w32 << 5;	/* low byte of out_size_w32 * 32 */
+	buf[127] = out_size_w32 >> 3;	/* next byte of out_size_w32 * 32 */
+	c512(sc, buf);
+	for (u = 0; u < out_size_w32; u ++)	/* emit h[0 .. out_size_w32 - 1], 4 bytes each */
+		sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]);
+}
+
+/* Start a new computation: reset cc to the IV224 state; see sph_shavite.h */
+void
+sph_shavite224_init(void *cc)
+{
+	shavite_small_init(cc, IV224);
+}
+
+/* Absorb len bytes from data into the context cc; see sph_shavite.h */
+void
+sph_shavite224(void *cc, const void *data, size_t len)
+{
+	shavite_small_core(cc, data, len);
+}
+
+/* Finish, write 7 words (28 bytes) to dst, reset cc; see sph_shavite.h */
+void
+sph_shavite224_close(void *cc, void *dst)
+{
+	shavite_small_close(cc, 0, 0, dst, 7);
+	shavite_small_init(cc, IV224);
+}
+
+/* As close, but first append the top n bits of ub; see sph_shavite.h */
+void
+sph_shavite224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shavite_small_close(cc, ub, n, dst, 7);
+	shavite_small_init(cc, IV224);
+}
+
+/* Start a new computation: reset cc to the IV256 state; see sph_shavite.h */
+void
+sph_shavite256_init(void *cc)
+{
+	shavite_small_init(cc, IV256);
+}
+
+/* Absorb len bytes from data into the context cc; see sph_shavite.h */
+void
+sph_shavite256(void *cc, const void *data, size_t len)
+{
+	shavite_small_core(cc, data, len);
+}
+
+/* Finish, write 8 words (32 bytes) to dst, reset cc; see sph_shavite.h */
+void
+sph_shavite256_close(void *cc, void *dst)
+{
+	shavite_small_close(cc, 0, 0, dst, 8);
+	shavite_small_init(cc, IV256);
+}
+
+/* As close, but first append the top n bits of ub; see sph_shavite.h */
+void
+sph_shavite256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shavite_small_close(cc, ub, n, dst, 8);
+	shavite_small_init(cc, IV256);
+}
+
+/* Start a new computation: reset cc to the IV384 state; see sph_shavite.h */
+void
+sph_shavite384_init(void *cc)
+{
+	shavite_big_init(cc, IV384);
+}
+
+/* Absorb len bytes from data into the context cc; see sph_shavite.h */
+void
+sph_shavite384(void *cc, const void *data, size_t len)
+{
+	shavite_big_core(cc, data, len);
+}
+
+/* Finish, write 12 words (48 bytes) to dst, reset cc; see sph_shavite.h */
+void
+sph_shavite384_close(void *cc, void *dst)
+{
+	shavite_big_close(cc, 0, 0, dst, 12);
+	shavite_big_init(cc, IV384);
+}
+
+/* As close, but first append the top n bits of ub; see sph_shavite.h */
+void
+sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shavite_big_close(cc, ub, n, dst, 12);
+	shavite_big_init(cc, IV384);
+}
+
+/* Start a new computation: reset cc to the IV512 state; see sph_shavite.h */
+void
+sph_shavite512_init(void *cc)
+{
+	shavite_big_init(cc, IV512);
+}
+
+/* Absorb len bytes from data into the context cc; see sph_shavite.h */
+void
+sph_shavite512(void *cc, const void *data, size_t len)
+{
+	shavite_big_core(cc, data, len);
+}
+
+/* Finish, write 16 words (64 bytes) to dst, reset cc; see sph_shavite.h */
+void
+sph_shavite512_close(void *cc, void *dst)
+{
+	shavite_big_close(cc, 0, 0, dst, 16);
+	shavite_big_init(cc, IV512);
+}
+
+/* As close, but first append the top n bits of ub; see sph_shavite.h */
+void
+sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shavite_big_close(cc, ub, n, dst, 16);
+	shavite_big_init(cc, IV512);
+}
diff --git a/sph/simd.c b/sph/simd.c
new file mode 100644
index 00000000..cef985dd
--- /dev/null
+++ b/sph/simd.c
@@ -0,0 +1,1792 @@
+/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * SIMD implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_simd.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
+#define SPH_SMALL_FOOTPRINT_SIMD   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+typedef sph_u32 u32;		/* short local aliases for the sph_ names */
+typedef sph_s32 s32;
+#define C32     SPH_C32
+#define T32     SPH_T32
+#define ROL32   SPH_ROTL32
+/* XCAT pastes two tokens after macro-expanding them (hence two levels). */
+#define XCAT(x, y)    XCAT_(x, y)
+#define XCAT_(x, y)   x ## y
+
+/*
+ * alpha_tab[i] = 41^i mod 257, for exponents i from 0 to 255, inclusive.
+ */
+static const s32 alpha_tab[] = {
+	  1,  41, 139,  45,  46,  87, 226,  14,  60, 147, 116, 130,
+	190,  80, 196,  69,   2,  82,  21,  90,  92, 174, 195,  28,
+	120,  37, 232,   3, 123, 160, 135, 138,   4, 164,  42, 180,
+	184,  91, 133,  56, 240,  74, 207,   6, 246,  63,  13,  19,
+	  8,  71,  84, 103, 111, 182,   9, 112, 223, 148, 157,  12,
+	235, 126,  26,  38,  16, 142, 168, 206, 222, 107,  18, 224,
+	189,  39,  57,  24, 213, 252,  52,  76,  32,  27,  79, 155,
+	187, 214,  36, 191, 121,  78, 114,  48, 169, 247, 104, 152,
+	 64,  54, 158,  53, 117, 171,  72, 125, 242, 156, 228,  96,
+	 81, 237, 208,  47, 128, 108,  59, 106, 234,  85, 144, 250,
+	227,  55, 199, 192, 162, 217, 159,  94, 256, 216, 118, 212,
+	211, 170,  31, 243, 197, 110, 141, 127,  67, 177,  61, 188,
+	255, 175, 236, 167, 165,  83,  62, 229, 137, 220,  25, 254,
+	134,  97, 122, 119, 253,  93, 215,  77,  73, 166, 124, 201,
+	 17, 183,  50, 251,  11, 194, 244, 238, 249, 186, 173, 154,
+	146,  75, 248, 145,  34, 109, 100, 245,  22, 131, 231, 219,
+	241, 115,  89,  51,  35, 150, 239,  33,  68, 218, 200, 233,
+	 44,   5, 205, 181, 225, 230, 178, 102,  70,  43, 221,  66,
+	136, 179, 143, 209,  88,  10, 153, 105, 193, 203,  99, 204,
+	140,  86, 185, 132,  15, 101,  29, 161, 176,  20,  49, 210,
+	129, 149, 198, 151,  23, 172, 113,   7,  30, 202,  58,  65,
+	 95,  40,  98, 163
+};
+
+/*
+ * Reductions mod 257 (2^8 = -1, 2^16 = 1); >> must be arithmetic. Ranges:
+ *   REDS1: from -32768..98302 to -383..383
+ *   REDS2: from -2^31..2^31-1 to -32768..98302
+ */
+#define REDS1(x)    (((x) & 0xFF) - ((x) >> 8))
+#define REDS2(x)    (((x) & 0xFFFF) + ((x) >> 16))
+
+/*
+ * Butterfly pass on q[rb..rb+2*hk-1], twiddles alpha_tab[j*as]; "id" is the
+ * label used to skip the trivial j=0 multiply. If all q[] are in -N..N on
+ * entry (N >= 98302), the new values of q[] are in the -2N..2N range.
+ * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
+ */
+#define FFT_LOOP(rb, hk, as, id)   do { \
+		size_t u, v; \
+		s32 m = q[(rb)]; \
+		s32 n = q[(rb) + (hk)]; \
+		q[(rb)] = m + n; \
+		q[(rb) + (hk)] = m - n; \
+		u = v = 0; \
+		goto id; \
+		for (; u < (hk); u += 4, v += 4 * (as)) { \
+			s32 t; \
+			m = q[(rb) + u + 0]; \
+			n = q[(rb) + u + 0 + (hk)]; \
+			t = REDS2(n * alpha_tab[v + 0 * (as)]); \
+			q[(rb) + u + 0] = m + t; \
+			q[(rb) + u + 0 + (hk)] = m - t; \
+		id: \
+			m = q[(rb) + u + 1]; \
+			n = q[(rb) + u + 1 + (hk)]; \
+			t = REDS2(n * alpha_tab[v + 1 * (as)]); \
+			q[(rb) + u + 1] = m + t; \
+			q[(rb) + u + 1 + (hk)] = m - t; \
+			m = q[(rb) + u + 2]; \
+			n = q[(rb) + u + 2 + (hk)]; \
+			t = REDS2(n * alpha_tab[v + 2 * (as)]); \
+			q[(rb) + u + 2] = m + t; \
+			q[(rb) + u + 2 + (hk)] = m - t; \
+			m = q[(rb) + u + 3]; \
+			n = q[(rb) + u + 3 + (hk)]; \
+			t = REDS2(n * alpha_tab[v + 3 * (as)]); \
+			q[(rb) + u + 3] = m + t; \
+			q[(rb) + u + 3 + (hk)] = m - t; \
+		} \
+	} while (0)
+
+/*
+ * 8-point FFT (root 4) of 4 inputs x[xb+i*xs], zero-padded. Output ranges:
+ *   d0:   min=    0   max= 1020
+ *   d1:   min=  -67   max= 4587
+ *   d2:   min=-4335   max= 4335
+ *   d3:   min=-4147   max=  507
+ *   d4:   min= -510   max=  510
+ *   d5:   min= -252   max= 4402
+ *   d6:   min=-4335   max= 4335
+ *   d7:   min=-4332   max=  322
+ */
+#define FFT8(xb, xs, d)   do { \
+		s32 x0 = x[(xb)]; \
+		s32 x1 = x[(xb) + (xs)]; \
+		s32 x2 = x[(xb) + 2 * (xs)]; \
+		s32 x3 = x[(xb) + 3 * (xs)]; \
+		s32 a0 = x0 + x2; \
+		s32 a1 = x0 + (x2 << 4); \
+		s32 a2 = x0 - x2; \
+		s32 a3 = x0 - (x2 << 4); \
+		s32 b0 = x1 + x3; \
+		s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
+		s32 b2 = (x1 << 4) - (x3 << 4); \
+		s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
+		d ## 0 = a0 + b0; \
+		d ## 1 = a1 + b1; \
+		d ## 2 = a2 + b2; \
+		d ## 3 = a3 + b3; \
+		d ## 4 = a0 - b0; \
+		d ## 5 = a1 - b1; \
+		d ## 6 = a2 - b2; \
+		d ## 7 = a3 - b3; \
+	} while (0)
+
+/*
+ * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
+ * to some shifting. Two FFT8 over the even- and odd-indexed inputs are
+ * merged into q[rb] .. q[rb + 15].
+ * Output: within -591471..591723
+ */
+#define FFT16(xb, xs, rb)   do { \
+		s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
+		s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
+		FFT8(xb, (xs) << 1, d1_); \
+		FFT8((xb) + (xs), (xs) << 1, d2_); \
+		q[(rb) +  0] = d1_0 + d2_0; \
+		q[(rb) +  1] = d1_1 + (d2_1 << 1); \
+		q[(rb) +  2] = d1_2 + (d2_2 << 2); \
+		q[(rb) +  3] = d1_3 + (d2_3 << 3); \
+		q[(rb) +  4] = d1_4 + (d2_4 << 4); \
+		q[(rb) +  5] = d1_5 + (d2_5 << 5); \
+		q[(rb) +  6] = d1_6 + (d2_6 << 6); \
+		q[(rb) +  7] = d1_7 + (d2_7 << 7); \
+		q[(rb) +  8] = d1_0 - d2_0; \
+		q[(rb) +  9] = d1_1 - (d2_1 << 1); \
+		q[(rb) + 10] = d1_2 - (d2_2 << 2); \
+		q[(rb) + 11] = d1_3 - (d2_3 << 3); \
+		q[(rb) + 12] = d1_4 - (d2_4 << 4); \
+		q[(rb) + 13] = d1_5 - (d2_5 << 5); \
+		q[(rb) + 14] = d1_6 - (d2_6 << 6); \
+		q[(rb) + 15] = d1_7 - (d2_7 << 7); \
+	} while (0)
+
+/*
+ * Two FFT16 (even/odd inputs) + FFT_LOOP. Output range: |q| <= 1183446
+ */
+#define FFT32(xb, xs, rb, id)   do { /* id: label name given to FFT_LOOP */ \
+		FFT16(xb, (xs) << 1, rb); \
+		FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
+		FFT_LOOP(rb, 16, 8, id); \
+	} while (0)
+
+/*
+ * Two FFT32 (even/odd inputs) + FFT_LOOP. Output range: |q| <= 2366892
+ */
+#define FFT64(xb, xs, rb, id)   do { /* labels used: id, id##a, id##b */ \
+		FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
+		FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
+		FFT_LOOP(rb, 32, 4, id); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_SIMD
+
+static void
+fft32(unsigned char *x, size_t xs, s32 *q)
+{
+	/*
+	 * FFT32 expands to the same sequence: FFT16 on the even and on the
+	 * odd inputs, then one FFT_LOOP pass (label label_) merging
+	 * q[0..15] with q[16..31].
+	 */
+	FFT32(0, xs, 0, label_);
+}
+
+#define FFT128(xb, xs, rb, id)   do { /* four fft32() calls, three passes */ \
+		fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +  0]); \
+		fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
+		FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
+		fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
+		fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
+		FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
+		FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
+	} while (0)
+
+#else
+
+/*
+ * Two FFT64 (even/odd inputs) + FFT_LOOP. Output range: |q| <= 4733784
+ */
+#define FFT128(xb, xs, rb, id)   do { \
+		FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
+		FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
+		FFT_LOOP(rb, 64, 2, id); \
+	} while (0)
+
+#endif
+
+/*
+ * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
+ * function which does not fit in the 32 kB L1 cache of a typical x86
+ * Intel. We therefore add a function call layer at the FFT64 level.
+ */
+
+static void
+fft64(unsigned char *x, size_t xs, s32 *q)
+{
+	/*
+	 * FFT64 expands to the same sequence: FFT32 on the even and on the
+	 * odd inputs (labels label_a, label_b), then one FFT_LOOP pass
+	 * (label_) merging q[0..31] with q[32..63].
+	 */
+	FFT64(0, xs, 0, label_);
+}
+
+/*
+ * Four fft64() calls + three FFT_LOOP passes. Output range: |q| <= 9467568
+ */
+#define FFT256(xb, xs, rb, id)   do { \
+		fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +   0]); \
+		fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) +  64]); \
+		FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
+		fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
+		fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
+		FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
+		FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
+	} while (0)
+
+/*
+ * yoff_s_n[i] = alpha^(127*i) mod 257, for i = 0..127, where alpha = 139
+ */
+static const unsigned short yoff_s_n[] = {
+	  1,  98,  95,  58,  30, 113,  23, 198, 129,  49, 176,  29,
+	 15, 185, 140,  99, 193, 153,  88, 143, 136, 221,  70, 178,
+	225, 205,  44, 200,  68, 239,  35,  89, 241, 231,  22, 100,
+	 34, 248, 146, 173, 249, 244,  11,  50,  17, 124,  73, 215,
+	253, 122, 134,  25, 137,  62, 165, 236, 255,  61,  67, 141,
+	197,  31, 211, 118, 256, 159, 162, 199, 227, 144, 234,  59,
+	128, 208,  81, 228, 242,  72, 117, 158,  64, 104, 169, 114,
+	121,  36, 187,  79,  32,  52, 213,  57, 189,  18, 222, 168,
+	 16,  26, 235, 157, 223,   9, 111,  84,   8,  13, 246, 207,
+	240, 133, 184,  42,   4, 135, 123, 232, 120, 195,  92,  21,
+	  2, 196, 190, 116,  60, 226,  46, 139
+};
+
+/*
+ * yoff_s_f[i] = alpha^(127*i) + alpha^(125*i) mod 257, where alpha = 139
+ */
+static const unsigned short yoff_s_f[] = {
+	  2, 156, 118, 107,  45, 212, 111, 162,  97, 249, 211,   3,
+	 49, 101, 151, 223, 189, 178, 253, 204,  76,  82, 232,  65,
+	 96, 176, 161,  47, 189,  61, 248, 107,   0, 131, 133, 113,
+	 17,  33,  12, 111, 251, 103,  57, 148,  47,  65, 249, 143,
+	189,   8, 204, 230, 205, 151, 187, 227, 247, 111, 140,   6,
+	 77,  10,  21, 149, 255, 101, 139, 150, 212,  45, 146,  95,
+	160,   8,  46, 254, 208, 156, 106,  34,  68,  79,   4,  53,
+	181, 175,  25, 192, 161,  81,  96, 210,  68, 196,   9, 150,
+	  0, 126, 124, 144, 240, 224, 245, 146,   6, 154, 200, 109,
+	210, 192,   8, 114,  68, 249,  53,  27,  52, 106,  70,  30,
+	 10, 146, 117, 251, 180, 247, 236, 108
+};
+
+/*
+ * yoff_b_n[i] = beta^(255*i) mod 257, for i = 0..255, where beta = 41
+ */
+static const unsigned short yoff_b_n[] = {
+	  1, 163,  98,  40,  95,  65,  58, 202,  30,   7, 113, 172,
+	 23, 151, 198, 149, 129, 210,  49,  20, 176, 161,  29, 101,
+	 15, 132, 185,  86, 140, 204,  99, 203, 193, 105, 153,  10,
+	 88, 209, 143, 179, 136,  66, 221,  43,  70, 102, 178, 230,
+	225, 181, 205,   5,  44, 233, 200, 218,  68,  33, 239, 150,
+	 35,  51,  89, 115, 241, 219, 231, 131,  22, 245, 100, 109,
+	 34, 145, 248,  75, 146, 154, 173, 186, 249, 238, 244, 194,
+	 11, 251,  50, 183,  17, 201, 124, 166,  73,  77, 215,  93,
+	253, 119, 122,  97, 134, 254,  25, 220, 137, 229,  62,  83,
+	165, 167, 236, 175, 255, 188,  61, 177,  67, 127, 141, 110,
+	197, 243,  31, 170, 211, 212, 118, 216, 256,  94, 159, 217,
+	162, 192, 199,  55, 227, 250, 144,  85, 234, 106,  59, 108,
+	128,  47, 208, 237,  81,  96, 228, 156, 242, 125,  72, 171,
+	117,  53, 158,  54,  64, 152, 104, 247, 169,  48, 114,  78,
+	121, 191,  36, 214, 187, 155,  79,  27,  32,  76,  52, 252,
+	213,  24,  57,  39, 189, 224,  18, 107, 222, 206, 168, 142,
+	 16,  38,  26, 126, 235,  12, 157, 148, 223, 112,   9, 182,
+	111, 103,  84,  71,   8,  19,  13,  63, 246,   6, 207,  74,
+	240,  56, 133,  91, 184, 180,  42, 164,   4, 138, 135, 160,
+	123,   3, 232,  37, 120,  28, 195, 174,  92,  90,  21,  82,
+	  2,  69, 196,  80, 190, 130, 116, 147,  60,  14, 226,  87,
+	 46,  45, 139,  41
+};
+
+/*
+ * yoff_b_f[i] = beta^(255*i) + beta^(253*i) mod 257, where beta = 41
+ */
+static const unsigned short yoff_b_f[] = {
+	  2, 203, 156,  47, 118, 214, 107, 106,  45,  93, 212,  20,
+	111,  73, 162, 251,  97, 215, 249,  53, 211,  19,   3,  89,
+	 49, 207, 101,  67, 151, 130, 223,  23, 189, 202, 178, 239,
+	253, 127, 204,  49,  76, 236,  82, 137, 232, 157,  65,  79,
+	 96, 161, 176, 130, 161,  30,  47,   9, 189, 247,  61, 226,
+	248,  90, 107,  64,   0,  88, 131, 243, 133,  59, 113, 115,
+	 17, 236,  33, 213,  12, 191, 111,  19, 251,  61, 103, 208,
+	 57,  35, 148, 248,  47, 116,  65, 119, 249, 178, 143,  40,
+	189, 129,   8, 163, 204, 227, 230, 196, 205, 122, 151,  45,
+	187,  19, 227,  72, 247, 125, 111, 121, 140, 220,   6, 107,
+	 77,  69,  10, 101,  21,  65, 149, 171, 255,  54, 101, 210,
+	139,  43, 150, 151, 212, 164,  45, 237, 146, 184,  95,   6,
+	160,  42,   8, 204,  46, 238, 254, 168, 208,  50, 156, 190,
+	106, 127,  34, 234,  68,  55,  79,  18,   4, 130,  53, 208,
+	181,  21, 175, 120,  25, 100, 192, 178, 161,  96,  81, 127,
+	 96, 227, 210, 248,  68,  10, 196,  31,   9, 167, 150, 193,
+	  0, 169, 126,  14, 124, 198, 144, 142, 240,  21, 224,  44,
+	245,  66, 146, 238,   6, 196, 154,  49, 200, 222, 109,   9,
+	210, 141, 192, 138,   8,  79, 114, 217,  68, 128, 249,  94,
+	 53,  30,  27,  61,  52, 135, 106, 212,  70, 238,  30, 185,
+	 10, 132, 146, 136, 117,  37, 251, 150, 180, 188, 247, 156,
+	236, 192, 108,  86
+};
+
+#define INNER(l, h, mm)   (((u32)((l) * (mm)) & 0xFFFFU) /* l*mm, low 16 bits */ \
+                          + ((u32)((h) * (mm)) << 16))	/* h*mm from bit 16 up */
+
+#define W_SMALL(sb, o1, o2, mm) /* four INNER words from q[8*sb + ...] */ \
+	(INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
+	 INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
+	 INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
+	 INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)
+/* NOTE(review): W_SMALL leaves its "(" open; presumably its user closes it. */
+#define WS_0_0   W_SMALL( 4,    0,    1, 185)	/* WS_0_*, WS_1_*: pair q[j] with q[j+1] */
+#define WS_0_1   W_SMALL( 6,    0,    1, 185)
+#define WS_0_2   W_SMALL( 0,    0,    1, 185)
+#define WS_0_3   W_SMALL( 2,    0,    1, 185)
+#define WS_0_4   W_SMALL( 7,    0,    1, 185)
+#define WS_0_5   W_SMALL( 5,    0,    1, 185)
+#define WS_0_6   W_SMALL( 3,    0,    1, 185)
+#define WS_0_7   W_SMALL( 1,    0,    1, 185)
+#define WS_1_0   W_SMALL(15,    0,    1, 185)
+#define WS_1_1   W_SMALL(11,    0,    1, 185)
+#define WS_1_2   W_SMALL(12,    0,    1, 185)
+#define WS_1_3   W_SMALL( 8,    0,    1, 185)
+#define WS_1_4   W_SMALL( 9,    0,    1, 185)
+#define WS_1_5   W_SMALL(13,    0,    1, 185)
+#define WS_1_6   W_SMALL(10,    0,    1, 185)
+#define WS_1_7   W_SMALL(14,    0,    1, 185)
+#define WS_2_0   W_SMALL(17, -128,  -64, 233)	/* WS_2_*, WS_3_*: pair q[j] with q[j+64] */
+#define WS_2_1   W_SMALL(18, -128,  -64, 233)
+#define WS_2_2   W_SMALL(23, -128,  -64, 233)
+#define WS_2_3   W_SMALL(20, -128,  -64, 233)
+#define WS_2_4   W_SMALL(22, -128,  -64, 233)
+#define WS_2_5   W_SMALL(21, -128,  -64, 233)
+#define WS_2_6   W_SMALL(16, -128,  -64, 233)
+#define WS_2_7   W_SMALL(19, -128,  -64, 233)
+#define WS_3_0   W_SMALL(30, -191, -127, 233)
+#define WS_3_1   W_SMALL(24, -191, -127, 233)
+#define WS_3_2   W_SMALL(25, -191, -127, 233)
+#define WS_3_3   W_SMALL(31, -191, -127, 233)
+#define WS_3_4   W_SMALL(27, -191, -127, 233)
+#define WS_3_5   W_SMALL(29, -191, -127, 233)
+#define WS_3_6   W_SMALL(28, -191, -127, 233)
+#define WS_3_7   W_SMALL(26, -191, -127, 233)
+
+#define W_BIG(sb, o1, o2, mm) /* eight INNER words from q[16*sb + ...] */ \
+	(INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)
+/* NOTE(review): W_BIG leaves its "(" open; presumably its user closes it. */
+#define WB_0_0   W_BIG( 4,    0,    1, 185)	/* WB_0_*, WB_1_*: pair q[j] with q[j+1] */
+#define WB_0_1   W_BIG( 6,    0,    1, 185)
+#define WB_0_2   W_BIG( 0,    0,    1, 185)
+#define WB_0_3   W_BIG( 2,    0,    1, 185)
+#define WB_0_4   W_BIG( 7,    0,    1, 185)
+#define WB_0_5   W_BIG( 5,    0,    1, 185)
+#define WB_0_6   W_BIG( 3,    0,    1, 185)
+#define WB_0_7   W_BIG( 1,    0,    1, 185)
+#define WB_1_0   W_BIG(15,    0,    1, 185)
+#define WB_1_1   W_BIG(11,    0,    1, 185)
+#define WB_1_2   W_BIG(12,    0,    1, 185)
+#define WB_1_3   W_BIG( 8,    0,    1, 185)
+#define WB_1_4   W_BIG( 9,    0,    1, 185)
+#define WB_1_5   W_BIG(13,    0,    1, 185)
+#define WB_1_6   W_BIG(10,    0,    1, 185)
+#define WB_1_7   W_BIG(14,    0,    1, 185)
+#define WB_2_0   W_BIG(17, -256, -128, 233)	/* WB_2_*, WB_3_*: pair q[j] with q[j+128] */
+#define WB_2_1   W_BIG(18, -256, -128, 233)
+#define WB_2_2   W_BIG(23, -256, -128, 233)
+#define WB_2_3   W_BIG(20, -256, -128, 233)
+#define WB_2_4   W_BIG(22, -256, -128, 233)
+#define WB_2_5   W_BIG(21, -256, -128, 233)
+#define WB_2_6   W_BIG(16, -256, -128, 233)
+#define WB_2_7   W_BIG(19, -256, -128, 233)
+#define WB_3_0   W_BIG(30, -383, -255, 233)
+#define WB_3_1   W_BIG(24, -383, -255, 233)
+#define WB_3_2   W_BIG(25, -383, -255, 233)
+#define WB_3_3   W_BIG(31, -383, -255, 233)
+#define WB_3_4   W_BIG(27, -383, -255, 233)
+#define WB_3_5   W_BIG(29, -383, -255, 233)
+#define WB_3_6   W_BIG(28, -383, -255, 233)
+#define WB_3_7   W_BIG(26, -383, -255, 233)
+
+#define IF(x, y, z)    ((((y) ^ (z)) & (x)) ^ (z))	/* per bit: x ? y : z */
+#define MAJ(x, y, z)   (((x) & (y)) | (((x) | (y)) & (z)))	/* per bit: majority of x, y, z */
+
+#define PP4_0_0   1	/* PP4_0_i = i ^ 1 */
+#define PP4_0_1   0
+#define PP4_0_2   3
+#define PP4_0_3   2
+#define PP4_1_0   2	/* PP4_1_i = i ^ 2 */
+#define PP4_1_1   3
+#define PP4_1_2   0
+#define PP4_1_3   1
+#define PP4_2_0   3	/* PP4_2_i = i ^ 3 */
+#define PP4_2_1   2
+#define PP4_2_2   1
+#define PP4_2_3   0
+
+#define PP8_0_0   1	/* PP8_0_i = i ^ 1 */
+#define PP8_0_1   0
+#define PP8_0_2   3
+#define PP8_0_3   2
+#define PP8_0_4   5
+#define PP8_0_5   4
+#define PP8_0_6   7
+#define PP8_0_7   6
+/* PP8_1_i = i ^ 6 */
+#define PP8_1_0   6
+#define PP8_1_1   7
+#define PP8_1_2   4
+#define PP8_1_3   5
+#define PP8_1_4   2
+#define PP8_1_5   3
+#define PP8_1_6   0
+#define PP8_1_7   1
+/* PP8_2_i = i ^ 2 */
+#define PP8_2_0   2
+#define PP8_2_1   3
+#define PP8_2_2   0
+#define PP8_2_3   1
+#define PP8_2_4   6
+#define PP8_2_5   7
+#define PP8_2_6   4
+#define PP8_2_7   5
+/* PP8_3_i = i ^ 3 */
+#define PP8_3_0   3
+#define PP8_3_1   2
+#define PP8_3_2   1
+#define PP8_3_3   0
+#define PP8_3_4   7
+#define PP8_3_5   6
+#define PP8_3_6   5
+#define PP8_3_7   4
+/* PP8_4_i = i ^ 5 */
+#define PP8_4_0   5
+#define PP8_4_1   4
+#define PP8_4_2   7
+#define PP8_4_3   6
+#define PP8_4_4   1
+#define PP8_4_5   0
+#define PP8_4_6   3
+#define PP8_4_7   2
+/* PP8_5_i = i ^ 7 */
+#define PP8_5_0   7
+#define PP8_5_1   6
+#define PP8_5_2   5
+#define PP8_5_3   4
+#define PP8_5_4   3
+#define PP8_5_5   2
+#define PP8_5_6   1
+#define PP8_5_7   0
+/* PP8_6_i = i ^ 4 */
+#define PP8_6_0   4
+#define PP8_6_1   5
+#define PP8_6_2   6
+#define PP8_6_3   7
+#define PP8_6_4   0
+#define PP8_6_5   1
+#define PP8_6_6   2
+#define PP8_6_7   3
+
+#if SPH_SIMD_NOCOPY
+
+#define DECL_STATE_SMALL	/* SPH_SIMD_NOCOPY: these six macros */
+#define READ_STATE_SMALL(sc)	/* all expand to nothing */
+#define WRITE_STATE_SMALL(sc)
+#define DECL_STATE_BIG
+#define READ_STATE_BIG(sc)
+#define WRITE_STATE_BIG(sc)
+
+#else
+
+#define DECL_STATE_SMALL   /* declares the 16 local working words A0..D3 */ \
+	u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;
+
+#define READ_STATE_SMALL(sc)   do { /* load (sc)->state[0..15] into the locals A0..A3, B0..B3, C0..C3, D0..D3 */ \
+		A0 = (sc)->state[ 0]; \
+		A1 = (sc)->state[ 1]; \
+		A2 = (sc)->state[ 2]; \
+		A3 = (sc)->state[ 3]; \
+		B0 = (sc)->state[ 4]; \
+		B1 = (sc)->state[ 5]; \
+		B2 = (sc)->state[ 6]; \
+		B3 = (sc)->state[ 7]; \
+		C0 = (sc)->state[ 8]; \
+		C1 = (sc)->state[ 9]; \
+		C2 = (sc)->state[10]; \
+		C3 = (sc)->state[11]; \
+		D0 = (sc)->state[12]; \
+		D1 = (sc)->state[13]; \
+		D2 = (sc)->state[14]; \
+		D3 = (sc)->state[15]; \
+	} while (0)
+
+#define WRITE_STATE_SMALL(sc)   do { /* store the locals A0..D3 back into (sc)->state[0..15], same order as READ_STATE_SMALL */ \
+		(sc)->state[ 0] = A0; \
+		(sc)->state[ 1] = A1; \
+		(sc)->state[ 2] = A2; \
+		(sc)->state[ 3] = A3; \
+		(sc)->state[ 4] = B0; \
+		(sc)->state[ 5] = B1; \
+		(sc)->state[ 6] = B2; \
+		(sc)->state[ 7] = B3; \
+		(sc)->state[ 8] = C0; \
+		(sc)->state[ 9] = C1; \
+		(sc)->state[10] = C2; \
+		(sc)->state[11] = C3; \
+		(sc)->state[12] = D0; \
+		(sc)->state[13] = D1; \
+		(sc)->state[14] = D2; \
+		(sc)->state[15] = D3; \
+	} while (0)
+
+#define DECL_STATE_BIG   /* declares the 32 local working words A0..A7, B0..B7, C0..C7, D0..D7 */ \
+	u32 A0, A1, A2, A3, A4, A5, A6, A7; \
+	u32 B0, B1, B2, B3, B4, B5, B6, B7; \
+	u32 C0, C1, C2, C3, C4, C5, C6, C7; \
+	u32 D0, D1, D2, D3, D4, D5, D6, D7;
+
+#define READ_STATE_BIG(sc)   do { /* load (sc)->state[0..31] into the locals A0..A7, B0..B7, C0..C7, D0..D7 */ \
+		A0 = (sc)->state[ 0]; \
+		A1 = (sc)->state[ 1]; \
+		A2 = (sc)->state[ 2]; \
+		A3 = (sc)->state[ 3]; \
+		A4 = (sc)->state[ 4]; \
+		A5 = (sc)->state[ 5]; \
+		A6 = (sc)->state[ 6]; \
+		A7 = (sc)->state[ 7]; \
+		B0 = (sc)->state[ 8]; \
+		B1 = (sc)->state[ 9]; \
+		B2 = (sc)->state[10]; \
+		B3 = (sc)->state[11]; \
+		B4 = (sc)->state[12]; \
+		B5 = (sc)->state[13]; \
+		B6 = (sc)->state[14]; \
+		B7 = (sc)->state[15]; \
+		C0 = (sc)->state[16]; \
+		C1 = (sc)->state[17]; \
+		C2 = (sc)->state[18]; \
+		C3 = (sc)->state[19]; \
+		C4 = (sc)->state[20]; \
+		C5 = (sc)->state[21]; \
+		C6 = (sc)->state[22]; \
+		C7 = (sc)->state[23]; \
+		D0 = (sc)->state[24]; \
+		D1 = (sc)->state[25]; \
+		D2 = (sc)->state[26]; \
+		D3 = (sc)->state[27]; \
+		D4 = (sc)->state[28]; \
+		D5 = (sc)->state[29]; \
+		D6 = (sc)->state[30]; \
+		D7 = (sc)->state[31]; \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store the locals A0..D7 back into (sc)->state[0..31], same order as READ_STATE_BIG */ \
+		(sc)->state[ 0] = A0; \
+		(sc)->state[ 1] = A1; \
+		(sc)->state[ 2] = A2; \
+		(sc)->state[ 3] = A3; \
+		(sc)->state[ 4] = A4; \
+		(sc)->state[ 5] = A5; \
+		(sc)->state[ 6] = A6; \
+		(sc)->state[ 7] = A7; \
+		(sc)->state[ 8] = B0; \
+		(sc)->state[ 9] = B1; \
+		(sc)->state[10] = B2; \
+		(sc)->state[11] = B3; \
+		(sc)->state[12] = B4; \
+		(sc)->state[13] = B5; \
+		(sc)->state[14] = B6; \
+		(sc)->state[15] = B7; \
+		(sc)->state[16] = C0; \
+		(sc)->state[17] = C1; \
+		(sc)->state[18] = C2; \
+		(sc)->state[19] = C3; \
+		(sc)->state[20] = C4; \
+		(sc)->state[21] = C5; \
+		(sc)->state[22] = C6; \
+		(sc)->state[23] = C7; \
+		(sc)->state[24] = D0; \
+		(sc)->state[25] = D1; \
+		(sc)->state[26] = D2; \
+		(sc)->state[27] = D3; \
+		(sc)->state[28] = D4; \
+		(sc)->state[29] = D5; \
+		(sc)->state[30] = D6; \
+		(sc)->state[31] = D7; \
+	} while (0)
+
+#endif
+
+#define STEP_ELT(n, w, fun, s, ppb)   do { /* one lane n of a step; relies on the tA<k> locals declared by STEP_SMALL / STEP_BIG */ \
+		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
+		A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); /* ppb pasted with n names a PP4_/PP8_ entry whose value k selects tA<k>; XCAT is defined outside this view */ \
+		D ## n = C ## n; \
+		C ## n = B ## n; \
+		B ## n = tA ## n; /* B receives the rotated pre-step A of the same lane */ \
+	} while (0)
+
+#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { /* one step over the 4 lanes: w0..w3 are the per-lane words, r/s the two rotation counts, pp4b a PP4_k_ prefix */ \
+		u32 tA0 = ROL32(A0, r); /* all rotations are taken first, so every lane sees the pre-step A values */ \
+		u32 tA1 = ROL32(A1, r); \
+		u32 tA2 = ROL32(A2, r); \
+		u32 tA3 = ROL32(A3, r); \
+		STEP_ELT(0, w0, fun, s, pp4b); \
+		STEP_ELT(1, w1, fun, s, pp4b); \
+		STEP_ELT(2, w2, fun, s, pp4b); \
+		STEP_ELT(3, w3, fun, s, pp4b); \
+	} while (0)
+
+#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { /* one step over the 8 lanes: w0..w7 are the per-lane words, r/s the two rotation counts, pp8b a PP8_k_ prefix */ \
+		u32 tA0 = ROL32(A0, r); /* all rotations are taken first, so every lane sees the pre-step A values */ \
+		u32 tA1 = ROL32(A1, r); \
+		u32 tA2 = ROL32(A2, r); \
+		u32 tA3 = ROL32(A3, r); \
+		u32 tA4 = ROL32(A4, r); \
+		u32 tA5 = ROL32(A5, r); \
+		u32 tA6 = ROL32(A6, r); \
+		u32 tA7 = ROL32(A7, r); \
+		STEP_ELT(0, w0, fun, s, pp8b); \
+		STEP_ELT(1, w1, fun, s, pp8b); \
+		STEP_ELT(2, w2, fun, s, pp8b); \
+		STEP_ELT(3, w3, fun, s, pp8b); \
+		STEP_ELT(4, w4, fun, s, pp8b); \
+		STEP_ELT(5, w5, fun, s, pp8b); \
+		STEP_ELT(6, w6, fun, s, pp8b); \
+		STEP_ELT(7, w7, fun, s, pp8b); \
+	} while (0)
+
+#define M3_0_0   0_   /* M3_k_i is (k + i) mod 3 with a trailing '_', ready to be pasted after PP4_; this group is i = 0 */
+#define M3_1_0   1_
+#define M3_2_0   2_
+#define M3_3_0   0_
+#define M3_4_0   1_
+#define M3_5_0   2_
+#define M3_6_0   0_
+#define M3_7_0   1_
+/* i = 1 */
+#define M3_0_1   1_
+#define M3_1_1   2_
+#define M3_2_1   0_
+#define M3_3_1   1_
+#define M3_4_1   2_
+#define M3_5_1   0_
+#define M3_6_1   1_
+#define M3_7_1   2_
+/* i = 2 */
+#define M3_0_2   2_
+#define M3_1_2   0_
+#define M3_2_2   1_
+#define M3_3_2   2_
+#define M3_4_2   0_
+#define M3_5_2   1_
+#define M3_6_2   2_
+#define M3_7_2   0_
+
+#define STEP_SMALL_(w, fun, r, s, pp4b)   STEP_SMALL w, fun, r, s, pp4b)   /* NOTE(review): no '(' after STEP_SMALL, so w presumably expands to "(w0, w1, w2, w3" via the WS_ macros defined outside this view; confirm */
+/* One round = 8 steps: four with IF, then four with MAJ. Rotation pairs cycle (p0,p1) (p1,p2) (p2,p3) (p3,p0); step k uses permutation PP4_ ## M3_k_isp. ri must carry its trailing '_' (e.g. 0_). */
+#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3)   do { \
+		STEP_SMALL_(WS_ ## ri ## 0, \
+			IF,  p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 1, \
+			IF,  p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 2, \
+			IF,  p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 3, \
+			IF,  p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 4, \
+			MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 5, \
+			MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 6, \
+			MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 7, \
+			MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
+	} while (0)
+
+#define M7_0_0   0_   /* M7_k_i is (k + i) mod 7 with a trailing '_', ready to be pasted after PP8_; this group is i = 0 */
+#define M7_1_0   1_
+#define M7_2_0   2_
+#define M7_3_0   3_
+#define M7_4_0   4_
+#define M7_5_0   5_
+#define M7_6_0   6_
+#define M7_7_0   0_
+/* i = 1 */
+#define M7_0_1   1_
+#define M7_1_1   2_
+#define M7_2_1   3_
+#define M7_3_1   4_
+#define M7_4_1   5_
+#define M7_5_1   6_
+#define M7_6_1   0_
+#define M7_7_1   1_
+/* i = 2 */
+#define M7_0_2   2_
+#define M7_1_2   3_
+#define M7_2_2   4_
+#define M7_3_2   5_
+#define M7_4_2   6_
+#define M7_5_2   0_
+#define M7_6_2   1_
+#define M7_7_2   2_
+/* i = 3 */
+#define M7_0_3   3_
+#define M7_1_3   4_
+#define M7_2_3   5_
+#define M7_3_3   6_
+#define M7_4_3   0_
+#define M7_5_3   1_
+#define M7_6_3   2_
+#define M7_7_3   3_
+
+#define STEP_BIG_(w, fun, r, s, pp8b)   STEP_BIG w, fun, r, s, pp8b)   /* NOTE(review): no '(' after STEP_BIG, so w presumably expands to "(w0, ..., w7" via W_BIG, defined outside this view; confirm */
+/* One round = 8 steps: four with IF, then four with MAJ. Rotation pairs cycle (p0,p1) (p1,p2) (p2,p3) (p3,p0); step k uses permutation PP8_ ## M7_k_isp. ri must carry its trailing '_' (e.g. 0_). */
+#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3)   do { \
+		STEP_BIG_(WB_ ## ri ## 0, \
+			IF,  p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 1, \
+			IF,  p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 2, \
+			IF,  p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 3, \
+			IF,  p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 4, \
+			MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 5, \
+			MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 6, \
+			MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 7, \
+			MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_SIMD
+
+#define A0   state[ 0]   /* small-footprint build: A0..D3 alias elements of whatever array named 'state' is in scope at the point of use */
+#define A1   state[ 1]
+#define A2   state[ 2]
+#define A3   state[ 3]
+#define B0   state[ 4]
+#define B1   state[ 5]
+#define B2   state[ 6]
+#define B3   state[ 7]
+#define C0   state[ 8]
+#define C1   state[ 9]
+#define C2   state[10]
+#define C3   state[11]
+#define D0   state[12]
+#define D1   state[13]
+#define D2   state[14]
+#define D3   state[15]
+
+#define STEP2_ELT(n, w, fun, s, ppb)   do { /* array-based counterpart of STEP_ELT; expects a local array tA[] from STEP2_SMALL / STEP2_BIG */ \
+		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
+		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); /* here ppb is a run-time XOR mask rather than a macro-name prefix */ \
+		D ## n = C ## n; \
+		C ## n = B ## n; \
+		B ## n = tA[n]; \
+	} while (0)
+
+#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { /* one step over the 4 lanes; pp4b is the XOR mask applied to the lane index */ \
+		u32 tA[4]; \
+		tA[0] = ROL32(A0, r); /* rotate every A lane before any lane is updated */ \
+		tA[1] = ROL32(A1, r); \
+		tA[2] = ROL32(A2, r); \
+		tA[3] = ROL32(A3, r); \
+		STEP2_ELT(0, w0, fun, s, pp4b); \
+		STEP2_ELT(1, w1, fun, s, pp4b); \
+		STEP2_ELT(2, w2, fun, s, pp4b); \
+		STEP2_ELT(3, w3, fun, s, pp4b); \
+	} while (0)
+
+static void
+one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
+{
+	static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };	/* pp4k[j] = (j % 3) + 1, the XOR mask of PP4_(j % 3)_; indexed up to isp + 7 */
+	/* Eight steps on state[0..15] (through the A0..D3 aliases), 4 words of w[0..31] per step: IF for the first four, MAJ for the last four. */
+	STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 3], IF,  p0, p1, pp4k[isp + 0]);
+	STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF,  p1, p2, pp4k[isp + 1]);
+	STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF,  p2, p3, pp4k[isp + 2]);
+	STEP2_SMALL(w[12], w[13], w[14], w[15], IF,  p3, p0, pp4k[isp + 3]);
+	STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]);
+	STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]);
+	STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]);
+	STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]);
+}
+
+static void
+compress_small(sph_simd_small_context *sc, int last)	/* small-footprint variant: compress the 64-byte block in sc->buf into sc->state; last != 0 selects the yoff_s_f offsets */
+{
+	unsigned char *x;
+	s32 q[128];
+	int i;
+	u32 w[32];
+	u32 state[16];
+	size_t u;
+	/* wsp[8 * r + j]: base index into q[] for word group j of round r (every entry is k << 3). */
+	static const size_t wsp[32] = {
+		 4 << 3,  6 << 3,  0 << 3,  2 << 3,
+		 7 << 3,  5 << 3,  3 << 3,  1 << 3,
+		15 << 3, 11 << 3, 12 << 3,  8 << 3,
+		 9 << 3, 13 << 3, 10 << 3, 14 << 3,
+		17 << 3, 18 << 3, 23 << 3, 20 << 3,
+		22 << 3, 21 << 3, 16 << 3, 19 << 3,
+		30 << 3, 24 << 3, 25 << 3, 31 << 3,
+		27 << 3, 29 << 3, 28 << 3, 26 << 3
+	};
+	/* x and q are not passed explicitly: FFT128 presumably reads the block at x and fills q[] by name (macro defined outside this view). */
+	x = sc->buf;
+	FFT128(0, 1, 0, ll);
+	if (last) {
+		for (i = 0; i < 128; i ++) {
+			s32 tq;
+			/* add the final-block offset, reduce with REDS2/REDS1, then map results above 128 to tq - 257 */
+			tq = q[i] + yoff_s_f[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);
+		}
+	} else {
+		for (i = 0; i < 128; i ++) {
+			s32 tq;
+			/* same reduction with the non-final offsets */
+			tq = q[i] + yoff_s_n[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);
+		}
+	}
+	/* Working state = chaining value XOR the block's 16 words (decoded with sph_dec32le_aligned). */
+	for (i = 0; i < 16; i += 4) {
+		state[i + 0] = sc->state[i + 0]
+			^ sph_dec32le_aligned(x + 4 * (i + 0));
+		state[i + 1] = sc->state[i + 1]
+			^ sph_dec32le_aligned(x + 4 * (i + 1));
+		state[i + 2] = sc->state[i + 2]
+			^ sph_dec32le_aligned(x + 4 * (i + 2));
+		state[i + 3] = sc->state[i + 3]
+			^ sph_dec32le_aligned(x + 4 * (i + 3));
+	}
+	/* WSREAD fills w[0..31] for one round: 8 groups of 4 words, each built by INNER from q[] at base wsp[sb + group] plus offsets o1 / o2. */
+#define WSREAD(sb, o1, o2, mm)   do { \
+		for (u = 0; u < 32; u += 4) { \
+			size_t v = wsp[(u >> 2) + (sb)]; \
+			w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
+				q[v + 2 * 0 + (o2)], mm); \
+			w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
+				q[v + 2 * 1 + (o2)], mm); \
+			w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
+				q[v + 2 * 2 + (o2)], mm); \
+			w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
+				q[v + 2 * 3 + (o2)], mm); \
+		} \
+	} while (0)
+	/* Four rounds: regenerate w[] from q[], then run the round's eight steps on state[]. */
+	WSREAD( 0,    0,    1, 185);
+	one_round_small(state, w, 0,  3, 23, 17, 27);
+	WSREAD( 8,    0,    1, 185);
+	one_round_small(state, w, 2, 28, 19, 22,  7);
+	WSREAD(16, -128,  -64, 233);
+	one_round_small(state, w, 1, 29,  9, 15,  5);
+	WSREAD(24, -191, -127, 233);
+	one_round_small(state, w, 0,  4, 13, 10, 25);
+
+#undef WSREAD
+	/* Four extra steps whose message words are the previous chaining value, still untouched in sc->state. */
+	STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
+		IF,  4, 13, PP4_2_);
+	STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
+		IF, 13, 10, PP4_0_);
+	STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
+		IF, 10, 25, PP4_1_);
+	STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
+		IF, 25,  4, PP4_2_);
+	/* Publish the new chaining value. */
+	memcpy(sc->state, state, sizeof state);
+}
+
+#undef A0   /* drop the 16 small-state aliases so the SIMD-384/512 section can redefine A0..D3 over a 32-word state */
+#undef A1
+#undef A2
+#undef A3
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+
+#else
+
+#if SPH_SIMD_NOCOPY   /* NOCOPY: A0..D3 are sc->state[] itself, so compress_small updates the context in place */
+#define A0   (sc->state[ 0])
+#define A1   (sc->state[ 1])
+#define A2   (sc->state[ 2])
+#define A3   (sc->state[ 3])
+#define B0   (sc->state[ 4])
+#define B1   (sc->state[ 5])
+#define B2   (sc->state[ 6])
+#define B3   (sc->state[ 7])
+#define C0   (sc->state[ 8])
+#define C1   (sc->state[ 9])
+#define C2   (sc->state[10])
+#define C3   (sc->state[11])
+#define D0   (sc->state[12])
+#define D1   (sc->state[13])
+#define D2   (sc->state[14])
+#define D3   (sc->state[15])
+#endif
+
+static void
+compress_small(sph_simd_small_context *sc, int last)	/* compress the 64-byte block in sc->buf into sc->state; last != 0 selects the yoff_s_f offsets */
+{
+	unsigned char *x;
+	s32 q[128];
+	int i;
+	DECL_STATE_SMALL
+#if SPH_SIMD_NOCOPY
+	sph_u32 saved[16];
+#endif
+	/* NOCOPY works directly on sc->state, so keep the incoming chaining value for the four closing steps. */
+#if SPH_SIMD_NOCOPY
+	memcpy(saved, sc->state, sizeof saved);
+#endif
+	x = sc->buf;
+	FFT128(0, 1, 0, ll);	/* presumably reads the block at x and fills q[] by name (macro defined outside this view) */
+	if (last) {
+		for (i = 0; i < 128; i ++) {
+			s32 tq;
+			/* add the final-block offset, reduce with REDS2/REDS1, then map results above 128 to tq - 257 */
+			tq = q[i] + yoff_s_f[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);
+		}
+	} else {
+		for (i = 0; i < 128; i ++) {
+			s32 tq;
+			/* same reduction with the non-final offsets */
+			tq = q[i] + yoff_s_n[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);
+		}
+	}
+	READ_STATE_SMALL(sc);
+	A0 ^= sph_dec32le_aligned(x +  0);	/* XOR the block's 16 words into the working state */
+	A1 ^= sph_dec32le_aligned(x +  4);
+	A2 ^= sph_dec32le_aligned(x +  8);
+	A3 ^= sph_dec32le_aligned(x + 12);
+	B0 ^= sph_dec32le_aligned(x + 16);
+	B1 ^= sph_dec32le_aligned(x + 20);
+	B2 ^= sph_dec32le_aligned(x + 24);
+	B3 ^= sph_dec32le_aligned(x + 28);
+	C0 ^= sph_dec32le_aligned(x + 32);
+	C1 ^= sph_dec32le_aligned(x + 36);
+	C2 ^= sph_dec32le_aligned(x + 40);
+	C3 ^= sph_dec32le_aligned(x + 44);
+	D0 ^= sph_dec32le_aligned(x + 48);
+	D1 ^= sph_dec32le_aligned(x + 52);
+	D2 ^= sph_dec32le_aligned(x + 56);
+	D3 ^= sph_dec32le_aligned(x + 60);
+	ONE_ROUND_SMALL(0_, 0,  3, 23, 17, 27);	/* four rounds of eight steps each */
+	ONE_ROUND_SMALL(1_, 2, 28, 19, 22,  7);
+	ONE_ROUND_SMALL(2_, 1, 29,  9, 15,  5);
+	ONE_ROUND_SMALL(3_, 0,  4, 13, 10, 25);
+#if SPH_SIMD_NOCOPY
+	STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],	/* closing steps: message words are the saved previous chaining value */
+		IF,  4, 13, PP4_2_);
+	STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
+		IF, 13, 10, PP4_0_);
+	STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
+		IF, 10, 25, PP4_1_);
+	STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
+		IF, 25,  4, PP4_2_);
+#else
+	STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],	/* closing steps: sc->state still holds the previous chaining value here */
+		IF,  4, 13, PP4_2_);
+	STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
+		IF, 13, 10, PP4_0_);
+	STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
+		IF, 10, 25, PP4_1_);
+	STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
+		IF, 25,  4, PP4_2_);
+	WRITE_STATE_SMALL(sc);
+#endif
+}
+
+#if SPH_SIMD_NOCOPY   /* remove the sc->state[] aliases defined for the NOCOPY compress_small */
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+#endif
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SIMD
+
+#define A0   state[ 0]   /* small-footprint build, 32-word state: A0..D7 alias elements of whatever array named 'state' is in scope at the point of use */
+#define A1   state[ 1]
+#define A2   state[ 2]
+#define A3   state[ 3]
+#define A4   state[ 4]
+#define A5   state[ 5]
+#define A6   state[ 6]
+#define A7   state[ 7]
+#define B0   state[ 8]
+#define B1   state[ 9]
+#define B2   state[10]
+#define B3   state[11]
+#define B4   state[12]
+#define B5   state[13]
+#define B6   state[14]
+#define B7   state[15]
+#define C0   state[16]
+#define C1   state[17]
+#define C2   state[18]
+#define C3   state[19]
+#define C4   state[20]
+#define C5   state[21]
+#define C6   state[22]
+#define C7   state[23]
+#define D0   state[24]
+#define D1   state[25]
+#define D2   state[26]
+#define D3   state[27]
+#define D4   state[28]
+#define D5   state[29]
+#define D6   state[30]
+#define D7   state[31]
+
+/*
+ * Not needed -- already defined for SIMD-224 / SIMD-256
+ *
+#define STEP2_ELT(n, w, fun, s, ppb)   do { \
+		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
+		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
+		D ## n = C ## n; \
+		C ## n = B ## n; \
+		B ## n = tA[n]; \
+	} while (0)
+ */
+
+#define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { /* one step over the 8 lanes; pp8b is the XOR mask applied to the lane index */ \
+		u32 tA[8]; \
+		tA[0] = ROL32(A0, r); /* rotate every A lane before any lane is updated */ \
+		tA[1] = ROL32(A1, r); \
+		tA[2] = ROL32(A2, r); \
+		tA[3] = ROL32(A3, r); \
+		tA[4] = ROL32(A4, r); \
+		tA[5] = ROL32(A5, r); \
+		tA[6] = ROL32(A6, r); \
+		tA[7] = ROL32(A7, r); \
+		STEP2_ELT(0, w0, fun, s, pp8b); \
+		STEP2_ELT(1, w1, fun, s, pp8b); \
+		STEP2_ELT(2, w2, fun, s, pp8b); \
+		STEP2_ELT(3, w3, fun, s, pp8b); \
+		STEP2_ELT(4, w4, fun, s, pp8b); \
+		STEP2_ELT(5, w5, fun, s, pp8b); \
+		STEP2_ELT(6, w6, fun, s, pp8b); \
+		STEP2_ELT(7, w7, fun, s, pp8b); \
+	} while (0)
+
+static void
+one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
+{
+	static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };	/* pp8k[j] is the XOR mask of PP8_(j % 7)_; indexed up to isp + 7 */
+	/* Eight steps on state[0..31] (through the A0..D7 aliases), 8 words of w[0..63] per step: IF for the first four, MAJ for the last four. */
+	STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7],
+		IF,  p0, p1, pp8k[isp + 0]);
+	STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15],
+		IF,  p1, p2, pp8k[isp + 1]);
+	STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
+		IF,  p2, p3, pp8k[isp + 2]);
+	STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
+		IF,  p3, p0, pp8k[isp + 3]);
+	STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
+		MAJ, p0, p1, pp8k[isp + 4]);
+	STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
+		MAJ, p1, p2, pp8k[isp + 5]);
+	STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
+		MAJ, p2, p3, pp8k[isp + 6]);
+	STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
+		MAJ, p3, p0, pp8k[isp + 7]);
+}
+
+static void
+compress_big(sph_simd_big_context *sc, int last)	/* small-footprint variant: compress the 128-byte block in sc->buf into sc->state; last != 0 selects the yoff_b_f offsets */
+{
+	unsigned char *x;
+	s32 q[256];
+	int i;
+	u32 w[64];
+	u32 state[32];
+	size_t u;
+	/* wbp[8 * r + j]: base index into q[] for word group j of round r (every entry is k << 4). */
+	static const size_t wbp[32] = {
+		 4 << 4,  6 << 4,  0 << 4,  2 << 4,
+		 7 << 4,  5 << 4,  3 << 4,  1 << 4,
+		15 << 4, 11 << 4, 12 << 4,  8 << 4,
+		 9 << 4, 13 << 4, 10 << 4, 14 << 4,
+		17 << 4, 18 << 4, 23 << 4, 20 << 4,
+		22 << 4, 21 << 4, 16 << 4, 19 << 4,
+		30 << 4, 24 << 4, 25 << 4, 31 << 4,
+		27 << 4, 29 << 4, 28 << 4, 26 << 4
+	};
+	/* x and q are not passed explicitly: FFT256 presumably reads the block at x and fills q[] by name (macro defined outside this view). */
+	x = sc->buf;
+	FFT256(0, 1, 0, ll);
+	if (last) {
+		for (i = 0; i < 256; i ++) {
+			s32 tq;
+			/* add the final-block offset, reduce with REDS2/REDS1, then map results above 128 to tq - 257 */
+			tq = q[i] + yoff_b_f[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);
+		}
+	} else {
+		for (i = 0; i < 256; i ++) {
+			s32 tq;
+			/* same reduction with the non-final offsets */
+			tq = q[i] + yoff_b_n[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);
+		}
+	}
+	/* Working state = chaining value XOR the block's 32 words (decoded with sph_dec32le_aligned). */
+	for (i = 0; i < 32; i += 8) {
+		state[i + 0] = sc->state[i + 0]
+			^ sph_dec32le_aligned(x + 4 * (i + 0));
+		state[i + 1] = sc->state[i + 1]
+			^ sph_dec32le_aligned(x + 4 * (i + 1));
+		state[i + 2] = sc->state[i + 2]
+			^ sph_dec32le_aligned(x + 4 * (i + 2));
+		state[i + 3] = sc->state[i + 3]
+			^ sph_dec32le_aligned(x + 4 * (i + 3));
+		state[i + 4] = sc->state[i + 4]
+			^ sph_dec32le_aligned(x + 4 * (i + 4));
+		state[i + 5] = sc->state[i + 5]
+			^ sph_dec32le_aligned(x + 4 * (i + 5));
+		state[i + 6] = sc->state[i + 6]
+			^ sph_dec32le_aligned(x + 4 * (i + 6));
+		state[i + 7] = sc->state[i + 7]
+			^ sph_dec32le_aligned(x + 4 * (i + 7));
+	}
+	/* WBREAD fills w[0..63] for one round: 8 groups of 8 words, each built by INNER from q[] at base wbp[sb + group] plus offsets o1 / o2. */
+#define WBREAD(sb, o1, o2, mm)   do { \
+		for (u = 0; u < 64; u += 8) { \
+			size_t v = wbp[(u >> 3) + (sb)]; \
+			w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
+				q[v + 2 * 0 + (o2)], mm); \
+			w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
+				q[v + 2 * 1 + (o2)], mm); \
+			w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
+				q[v + 2 * 2 + (o2)], mm); \
+			w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
+				q[v + 2 * 3 + (o2)], mm); \
+			w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
+				q[v + 2 * 4 + (o2)], mm); \
+			w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
+				q[v + 2 * 5 + (o2)], mm); \
+			w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
+				q[v + 2 * 6 + (o2)], mm); \
+			w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
+				q[v + 2 * 7 + (o2)], mm); \
+		} \
+	} while (0)
+	/* Four rounds: regenerate w[] from q[], then run the round's eight steps on state[]. */
+	WBREAD( 0,    0,    1, 185);
+	one_round_big(state, w, 0,  3, 23, 17, 27);
+	WBREAD( 8,    0,    1, 185);
+	one_round_big(state, w, 1, 28, 19, 22,  7);
+	WBREAD(16, -256, -128, 233);
+	one_round_big(state, w, 2, 29,  9, 15,  5);
+	WBREAD(24, -383, -255, 233);
+	one_round_big(state, w, 3,  4, 13, 10, 25);
+
+#undef WBREAD
+	/* Four extra steps whose message words are the previous chaining value, still untouched in sc->state. */
+	STEP_BIG(
+		sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
+		sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
+		IF,  4, 13, PP8_4_);
+	STEP_BIG(
+		sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
+		sc->state[12], sc->state[13], sc->state[14], sc->state[15],
+		IF, 13, 10, PP8_5_);
+	STEP_BIG(
+		sc->state[16], sc->state[17], sc->state[18], sc->state[19],
+		sc->state[20], sc->state[21], sc->state[22], sc->state[23],
+		IF, 10, 25, PP8_6_);
+	STEP_BIG(
+		sc->state[24], sc->state[25], sc->state[26], sc->state[27],
+		sc->state[28], sc->state[29], sc->state[30], sc->state[31],
+		IF, 25,  4, PP8_0_);
+	/* Publish the new chaining value. */
+	memcpy(sc->state, state, sizeof state);
+}
+
+#undef A0   /* drop the 32 small-footprint state aliases now that compress_big is defined */
+#undef A1
+#undef A2
+#undef A3
+#undef A4
+#undef A5
+#undef A6
+#undef A7
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef B4
+#undef B5
+#undef B6
+#undef B7
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+#undef D4
+#undef D5
+#undef D6
+#undef D7
+
+#else
+
+#if SPH_SIMD_NOCOPY   /* NOCOPY: A0..D7 are sc->state[] itself, so compress_big updates the context in place */
+#define A0   (sc->state[ 0])
+#define A1   (sc->state[ 1])
+#define A2   (sc->state[ 2])
+#define A3   (sc->state[ 3])
+#define A4   (sc->state[ 4])
+#define A5   (sc->state[ 5])
+#define A6   (sc->state[ 6])
+#define A7   (sc->state[ 7])
+#define B0   (sc->state[ 8])
+#define B1   (sc->state[ 9])
+#define B2   (sc->state[10])
+#define B3   (sc->state[11])
+#define B4   (sc->state[12])
+#define B5   (sc->state[13])
+#define B6   (sc->state[14])
+#define B7   (sc->state[15])
+#define C0   (sc->state[16])
+#define C1   (sc->state[17])
+#define C2   (sc->state[18])
+#define C3   (sc->state[19])
+#define C4   (sc->state[20])
+#define C5   (sc->state[21])
+#define C6   (sc->state[22])
+#define C7   (sc->state[23])
+#define D0   (sc->state[24])
+#define D1   (sc->state[25])
+#define D2   (sc->state[26])
+#define D3   (sc->state[27])
+#define D4   (sc->state[28])
+#define D5   (sc->state[29])
+#define D6   (sc->state[30])
+#define D7   (sc->state[31])
+#endif
+
+static void
+compress_big(sph_simd_big_context *sc, int last)	/* compress the 128-byte block in sc->buf into sc->state; last != 0 selects the yoff_b_f offsets */
+{
+	unsigned char *x;
+	s32 q[256];
+	int i;
+	DECL_STATE_BIG
+#if SPH_SIMD_NOCOPY
+	sph_u32 saved[32];
+#endif
+	/* NOCOPY works directly on sc->state, so keep the incoming chaining value for the four closing steps. */
+#if SPH_SIMD_NOCOPY
+	memcpy(saved, sc->state, sizeof saved);
+#endif
+	/* x and q are not passed explicitly: FFT256 presumably reads the block at x and fills q[] by name (macro defined outside this view). */
+	x = sc->buf;
+	FFT256(0, 1, 0, ll);
+	if (last) {
+		for (i = 0; i < 256; i ++) {
+			s32 tq;
+			/* add the final-block offset, reduce with REDS2/REDS1, then map results above 128 to tq - 257 */
+			tq = q[i] + yoff_b_f[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);
+		}
+	} else {
+		for (i = 0; i < 256; i ++) {
+			s32 tq;
+			/* same reduction with the non-final offsets */
+			tq = q[i] + yoff_b_n[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);
+		}
+	}
+	READ_STATE_BIG(sc);
+	A0 ^= sph_dec32le_aligned(x +   0);	/* XOR the block's 32 words into the working state */
+	A1 ^= sph_dec32le_aligned(x +   4);
+	A2 ^= sph_dec32le_aligned(x +   8);
+	A3 ^= sph_dec32le_aligned(x +  12);
+	A4 ^= sph_dec32le_aligned(x +  16);
+	A5 ^= sph_dec32le_aligned(x +  20);
+	A6 ^= sph_dec32le_aligned(x +  24);
+	A7 ^= sph_dec32le_aligned(x +  28);
+	B0 ^= sph_dec32le_aligned(x +  32);
+	B1 ^= sph_dec32le_aligned(x +  36);
+	B2 ^= sph_dec32le_aligned(x +  40);
+	B3 ^= sph_dec32le_aligned(x +  44);
+	B4 ^= sph_dec32le_aligned(x +  48);
+	B5 ^= sph_dec32le_aligned(x +  52);
+	B6 ^= sph_dec32le_aligned(x +  56);
+	B7 ^= sph_dec32le_aligned(x +  60);
+	C0 ^= sph_dec32le_aligned(x +  64);
+	C1 ^= sph_dec32le_aligned(x +  68);
+	C2 ^= sph_dec32le_aligned(x +  72);
+	C3 ^= sph_dec32le_aligned(x +  76);
+	C4 ^= sph_dec32le_aligned(x +  80);
+	C5 ^= sph_dec32le_aligned(x +  84);
+	C6 ^= sph_dec32le_aligned(x +  88);
+	C7 ^= sph_dec32le_aligned(x +  92);
+	D0 ^= sph_dec32le_aligned(x +  96);
+	D1 ^= sph_dec32le_aligned(x + 100);
+	D2 ^= sph_dec32le_aligned(x + 104);
+	D3 ^= sph_dec32le_aligned(x + 108);
+	D4 ^= sph_dec32le_aligned(x + 112);
+	D5 ^= sph_dec32le_aligned(x + 116);
+	D6 ^= sph_dec32le_aligned(x + 120);
+	D7 ^= sph_dec32le_aligned(x + 124);
+	/* Four rounds of eight steps each. */
+	ONE_ROUND_BIG(0_, 0,  3, 23, 17, 27);
+	ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
+	ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
+	ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
+#if SPH_SIMD_NOCOPY
+	STEP_BIG(	/* closing steps: message words are the saved previous chaining value */
+		saved[ 0], saved[ 1], saved[ 2], saved[ 3],
+		saved[ 4], saved[ 5], saved[ 6], saved[ 7],
+		IF,  4, 13, PP8_4_);
+	STEP_BIG(
+		saved[ 8], saved[ 9], saved[10], saved[11],
+		saved[12], saved[13], saved[14], saved[15],
+		IF, 13, 10, PP8_5_);
+	STEP_BIG(
+		saved[16], saved[17], saved[18], saved[19],
+		saved[20], saved[21], saved[22], saved[23],
+		IF, 10, 25, PP8_6_);
+	STEP_BIG(
+		saved[24], saved[25], saved[26], saved[27],
+		saved[28], saved[29], saved[30], saved[31],
+		IF, 25,  4, PP8_0_);
+#else
+	STEP_BIG(	/* closing steps: sc->state still holds the previous chaining value here */
+		sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
+		sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
+		IF,  4, 13, PP8_4_);
+	STEP_BIG(
+		sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
+		sc->state[12], sc->state[13], sc->state[14], sc->state[15],
+		IF, 13, 10, PP8_5_);
+	STEP_BIG(
+		sc->state[16], sc->state[17], sc->state[18], sc->state[19],
+		sc->state[20], sc->state[21], sc->state[22], sc->state[23],
+		IF, 10, 25, PP8_6_);
+	STEP_BIG(
+		sc->state[24], sc->state[25], sc->state[26], sc->state[27],
+		sc->state[28], sc->state[29], sc->state[30], sc->state[31],
+		IF, 25,  4, PP8_0_);
+	WRITE_STATE_BIG(sc);
+#endif
+}
+
+#if SPH_SIMD_NOCOPY   /* remove the sc->state[] aliases defined for the NOCOPY compress_big */
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef A4
+#undef A5
+#undef A6
+#undef A7
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef B4
+#undef B5
+#undef B6
+#undef B7
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+#undef D4
+#undef D5
+#undef D6
+#undef D7
+#endif
+
+#endif
+
+static const u32 IV224[] = {	/* 16-word initial state that sph_simd224_init() hands to init_small() */
+	C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
+	C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
+	C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
+	C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
+};
+
+static const u32 IV256[] = {	/* 16-word initial state that sph_simd256_init() hands to init_small() */
+	C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
+	C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
+	C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
+	C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
+};
+
+static const u32 IV384[] = {	/* 32-word initial state that sph_simd384_init() hands to init_big() */
+	C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
+	C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
+	C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
+	C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
+	C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
+	C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
+	C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
+	C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
+};
+
+static const u32 IV512[] = {	/* 32-word initial state that sph_simd512_init() hands to init_big() */
+	C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
+	C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
+	C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
+	C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
+	C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
+	C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
+	C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
+	C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
+};
+
+/* Reset the small context at cc: clear the buffer position and the block counter, then load iv into the state. */
+static void init_small(void *cc, const u32 *iv)
+{
+	sph_simd_small_context *ctx = cc;
+
+	ctx->ptr = 0;
+	ctx->count_high = 0;
+	ctx->count_low = 0;
+	memcpy(ctx->state, iv, sizeof ctx->state);
+}
+
+/* Reset the big context at cc: clear the buffer position and the block counter, then load iv into the state. */
+static void init_big(void *cc, const u32 *iv)
+{
+	sph_simd_big_context *ctx = cc;
+
+	ctx->ptr = 0;
+	ctx->count_high = 0;
+	ctx->count_low = 0;
+	memcpy(ctx->state, iv, sizeof ctx->state);
+}
+
+/* Absorb len bytes from data into the context at cc, compressing every time the block buffer fills up. */
+static void update_small(void *cc, const void *data, size_t len)
+{
+	sph_simd_small_context *ctx = cc;
+	const unsigned char *src = data;
+
+	while (len > 0) {
+		/* Take as much input as still fits in the block buffer. */
+		size_t room = (sizeof ctx->buf) - ctx->ptr;
+		size_t take = (len < room) ? len : room;
+		memcpy(ctx->buf + ctx->ptr, src, take);
+		src += take;
+		len -= take;
+		ctx->ptr += take;
+		if (ctx->ptr != sizeof ctx->buf)
+			continue;
+		/* Buffer full: compress it and advance the two-word block counter. */
+		compress_small(ctx, 0);
+		ctx->ptr = 0;
+		ctx->count_low = T32(ctx->count_low + 1);
+		if (ctx->count_low == 0)
+			ctx->count_high ++;
+	}
+}
+
+/* Absorb len bytes from data into the context at cc, compressing every time the block buffer fills up. */
+static void update_big(void *cc, const void *data, size_t len)
+{
+	sph_simd_big_context *ctx = cc;
+	const unsigned char *src = data;
+
+	while (len > 0) {
+		/* Take as much input as still fits in the block buffer. */
+		size_t room = (sizeof ctx->buf) - ctx->ptr;
+		size_t take = (len < room) ? len : room;
+		memcpy(ctx->buf + ctx->ptr, src, take);
+		src += take;
+		len -= take;
+		ctx->ptr += take;
+		if (ctx->ptr != sizeof ctx->buf)
+			continue;
+		/* Buffer full: compress it and advance the two-word block counter. */
+		compress_big(ctx, 0);
+		ctx->ptr = 0;
+		ctx->count_low = T32(ctx->count_low + 1);
+		if (ctx->count_low == 0)
+			ctx->count_high ++;
+	}
+}
+
+/* Write the message bit length ((high:low) blocks * 512 + ptr * 8 + n) to dst[0..7]: low word first, then high word, via sph_enc32le. */
+static void encode_count_small(unsigned char *dst,
+	u32 low, u32 high, size_t ptr, unsigned n)
+{
+	high = T32(high << 9) + (low >> 23);	/* carry = top 9 bits of low, taken BEFORE low is shifted (taking it after corrupts the length from 2^14 blocks on) */
+	low = T32(low << 9);
+	low += (ptr << 3) + n;
+	sph_enc32le(dst, low);
+	sph_enc32le(dst + 4, high);
+}
+
+/* Write the message bit length ((high:low) blocks * 1024 + ptr * 8 + n) to dst[0..7]: low word first, then high word, via sph_enc32le. */
+static void encode_count_big(unsigned char *dst,
+	u32 low, u32 high, size_t ptr, unsigned n)
+{
+	high = T32(high << 10) + (low >> 22);	/* carry = top 10 bits of low, taken BEFORE low is shifted (taking it after corrupts the length from 2^12 blocks on) */
+	low = T32(low << 10);
+	low += (ptr << 3) + n;
+	sph_enc32le(dst, low);
+	sph_enc32le(dst + 4, high);
+}
+
+/* Finish a small-state hash: flush pending input (plus the top n bits of ub), compress the length block, write dst_len 32-bit words to dst. */
+static void
+finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
+{
+	sph_simd_small_context *ctx = cc;
+	unsigned char *out = dst;
+	size_t word;
+
+	/* Pending bytes or extra bits form a last data block, zero-filled after the input. */
+	if (ctx->ptr > 0 || n > 0) {
+		memset(ctx->buf + ctx->ptr, 0, (sizeof ctx->buf) - ctx->ptr);
+		ctx->buf[ctx->ptr] = ub & (0xFF << (8 - n));
+		compress_small(ctx, 0);
+	}
+	/* Closing block: all zero except the encoded message length, compressed with last = 1. */
+	memset(ctx->buf, 0, sizeof ctx->buf);
+	encode_count_small(ctx->buf, ctx->count_low, ctx->count_high, ctx->ptr, n);
+	compress_small(ctx, 1);
+	for (word = 0; word < dst_len; word ++)
+		sph_enc32le(out + (word << 2), ctx->state[word]);
+}
+
+/* Finish a big-state hash: flush pending input (plus the top n bits of ub), compress the length block, write dst_len 32-bit words to dst. */
+static void
+finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
+{
+	sph_simd_big_context *ctx = cc;
+	unsigned char *out = dst;
+	size_t word;
+
+	/* Pending bytes or extra bits form a last data block, zero-filled after the input. */
+	if (ctx->ptr > 0 || n > 0) {
+		memset(ctx->buf + ctx->ptr, 0, (sizeof ctx->buf) - ctx->ptr);
+		ctx->buf[ctx->ptr] = ub & (0xFF << (8 - n));
+		compress_big(ctx, 0);
+	}
+	/* Closing block: all zero except the encoded message length, compressed with last = 1. */
+	memset(ctx->buf, 0, sizeof ctx->buf);
+	encode_count_big(ctx->buf, ctx->count_low, ctx->count_high, ctx->ptr, n);
+	compress_big(ctx, 1);
+	for (word = 0; word < dst_len; word ++)
+		sph_enc32le(out + (word << 2), ctx->state[word]);
+}
+
+void
+sph_simd224_init(void *cc)	/* (re)initialise the context at cc with IV224 */
+{
+	init_small(cc, IV224);
+}
+
+void
+sph_simd224(void *cc, const void *data, size_t len)	/* absorb len bytes of data into the context at cc */
+{
+	update_small(cc, data, len);
+}
+
+void
+sph_simd224_close(void *cc, void *dst)	/* finish with no extra bits; see sph_simd224_addbits_and_close */
+{
+	sph_simd224_addbits_and_close(cc, 0, 0, dst);
+}
+
+void
+sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)	/* append the top n bits of ub, write 7 words (28 bytes) to dst */
+{
+	finalize_small(cc, ub, n, dst, 7);
+	sph_simd224_init(cc);	/* leave the context ready for a new message */
+}
+
+void
+sph_simd256_init(void *cc)	/* (re)initialise the context at cc with IV256 */
+{
+	init_small(cc, IV256);
+}
+
+void
+sph_simd256(void *cc, const void *data, size_t len)	/* absorb len bytes of data into the context at cc */
+{
+	update_small(cc, data, len);
+}
+
+void
+sph_simd256_close(void *cc, void *dst)	/* finish with no extra bits; see sph_simd256_addbits_and_close */
+{
+	sph_simd256_addbits_and_close(cc, 0, 0, dst);
+}
+
+void
+sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)	/* append the top n bits of ub, write 8 words (32 bytes) to dst */
+{
+	finalize_small(cc, ub, n, dst, 8);
+	sph_simd256_init(cc);	/* leave the context ready for a new message */
+}
+
+void
+sph_simd384_init(void *cc)	/* (re)initialise the context at cc with IV384 */
+{
+	init_big(cc, IV384);
+}
+
+void
+sph_simd384(void *cc, const void *data, size_t len)	/* absorb len bytes of data into the context at cc */
+{
+	update_big(cc, data, len);
+}
+
+void
+sph_simd384_close(void *cc, void *dst)	/* finish with no extra bits; see sph_simd384_addbits_and_close */
+{
+	sph_simd384_addbits_and_close(cc, 0, 0, dst);
+}
+
+void
+sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)	/* append the top n bits of ub, write 12 words (48 bytes) to dst */
+{
+	finalize_big(cc, ub, n, dst, 12);
+	sph_simd384_init(cc);	/* leave the context ready for a new message */
+}
+
+void
+sph_simd512_init(void *cc)	/* (re)initialise the context at cc with IV512 */
+{
+	init_big(cc, IV512);
+}
+
+void
+sph_simd512(void *cc, const void *data, size_t len)	/* absorb len bytes of data into the context at cc */
+{
+	update_big(cc, data, len);
+}
+
+void
+sph_simd512_close(void *cc, void *dst)	/* finish with no extra bits; see sph_simd512_addbits_and_close */
+{
+	sph_simd512_addbits_and_close(cc, 0, 0, dst);
+}
+
+void
+sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)	/* append the top n bits of ub, write 16 words (64 bytes) to dst */
+{
+	finalize_big(cc, ub, n, dst, 16);
+	sph_simd512_init(cc);	/* leave the context ready for a new message */
+}
diff --git a/sph/skein.c b/sph/skein.c
new file mode 100644
index 00000000..2fcfae53
--- /dev/null
+++ b/sph/skein.c
@@ -0,0 +1,1244 @@
+/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */
+/*
+ * Skein implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_skein.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN
+#define SPH_SMALL_FOOTPRINT_SKEIN   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#if SPH_64
+
+#if 0
+/* obsolete */
+/*
+ * M5_ ## s ## _ ## i  evaluates to s+i mod 5 (0 <= s <= 18, 0 <= i <= 3).
+ */
+
+#define M5_0_0    0
+#define M5_0_1    1
+#define M5_0_2    2
+#define M5_0_3    3
+
+#define M5_1_0    1
+#define M5_1_1    2
+#define M5_1_2    3
+#define M5_1_3    4
+
+#define M5_2_0    2
+#define M5_2_1    3
+#define M5_2_2    4
+#define M5_2_3    0
+
+#define M5_3_0    3
+#define M5_3_1    4
+#define M5_3_2    0
+#define M5_3_3    1
+
+#define M5_4_0    4
+#define M5_4_1    0
+#define M5_4_2    1
+#define M5_4_3    2
+
+#define M5_5_0    0
+#define M5_5_1    1
+#define M5_5_2    2
+#define M5_5_3    3
+
+#define M5_6_0    1
+#define M5_6_1    2
+#define M5_6_2    3
+#define M5_6_3    4
+
+#define M5_7_0    2
+#define M5_7_1    3
+#define M5_7_2    4
+#define M5_7_3    0
+
+#define M5_8_0    3
+#define M5_8_1    4
+#define M5_8_2    0
+#define M5_8_3    1
+
+#define M5_9_0    4
+#define M5_9_1    0
+#define M5_9_2    1
+#define M5_9_3    2
+
+#define M5_10_0   0
+#define M5_10_1   1
+#define M5_10_2   2
+#define M5_10_3   3
+
+#define M5_11_0   1
+#define M5_11_1   2
+#define M5_11_2   3
+#define M5_11_3   4
+
+#define M5_12_0   2
+#define M5_12_1   3
+#define M5_12_2   4
+#define M5_12_3   0
+
+#define M5_13_0   3
+#define M5_13_1   4
+#define M5_13_2   0
+#define M5_13_3   1
+
+#define M5_14_0   4
+#define M5_14_1   0
+#define M5_14_2   1
+#define M5_14_3   2
+
+#define M5_15_0   0
+#define M5_15_1   1
+#define M5_15_2   2
+#define M5_15_3   3
+
+#define M5_16_0   1
+#define M5_16_1   2
+#define M5_16_2   3
+#define M5_16_3   4
+
+#define M5_17_0   2
+#define M5_17_1   3
+#define M5_17_2   4
+#define M5_17_3   0
+
+#define M5_18_0   3
+#define M5_18_1   4
+#define M5_18_2   0
+#define M5_18_3   1
+#endif
+
+/*
+ * M9_ ## s ## _ ## i  evaluates to (s+i) mod 9 (0 <= s <= 18, 0 <= i <= 7).
+ */
+
+#define M9_0_0    0
+#define M9_0_1    1
+#define M9_0_2    2
+#define M9_0_3    3
+#define M9_0_4    4
+#define M9_0_5    5
+#define M9_0_6    6
+#define M9_0_7    7
+
+#define M9_1_0    1
+#define M9_1_1    2
+#define M9_1_2    3
+#define M9_1_3    4
+#define M9_1_4    5
+#define M9_1_5    6
+#define M9_1_6    7
+#define M9_1_7    8
+
+#define M9_2_0    2
+#define M9_2_1    3
+#define M9_2_2    4
+#define M9_2_3    5
+#define M9_2_4    6
+#define M9_2_5    7
+#define M9_2_6    8
+#define M9_2_7    0
+
+#define M9_3_0    3
+#define M9_3_1    4
+#define M9_3_2    5
+#define M9_3_3    6
+#define M9_3_4    7
+#define M9_3_5    8
+#define M9_3_6    0
+#define M9_3_7    1
+
+#define M9_4_0    4
+#define M9_4_1    5
+#define M9_4_2    6
+#define M9_4_3    7
+#define M9_4_4    8
+#define M9_4_5    0
+#define M9_4_6    1
+#define M9_4_7    2
+
+#define M9_5_0    5
+#define M9_5_1    6
+#define M9_5_2    7
+#define M9_5_3    8
+#define M9_5_4    0
+#define M9_5_5    1
+#define M9_5_6    2
+#define M9_5_7    3
+
+#define M9_6_0    6
+#define M9_6_1    7
+#define M9_6_2    8
+#define M9_6_3    0
+#define M9_6_4    1
+#define M9_6_5    2
+#define M9_6_6    3
+#define M9_6_7    4
+
+#define M9_7_0    7
+#define M9_7_1    8
+#define M9_7_2    0
+#define M9_7_3    1
+#define M9_7_4    2
+#define M9_7_5    3
+#define M9_7_6    4
+#define M9_7_7    5
+
+#define M9_8_0    8
+#define M9_8_1    0
+#define M9_8_2    1
+#define M9_8_3    2
+#define M9_8_4    3
+#define M9_8_5    4
+#define M9_8_6    5
+#define M9_8_7    6
+
+#define M9_9_0    0
+#define M9_9_1    1
+#define M9_9_2    2
+#define M9_9_3    3
+#define M9_9_4    4
+#define M9_9_5    5
+#define M9_9_6    6
+#define M9_9_7    7
+
+#define M9_10_0   1
+#define M9_10_1   2
+#define M9_10_2   3
+#define M9_10_3   4
+#define M9_10_4   5
+#define M9_10_5   6
+#define M9_10_6   7
+#define M9_10_7   8
+
+#define M9_11_0   2
+#define M9_11_1   3
+#define M9_11_2   4
+#define M9_11_3   5
+#define M9_11_4   6
+#define M9_11_5   7
+#define M9_11_6   8
+#define M9_11_7   0
+
+#define M9_12_0   3
+#define M9_12_1   4
+#define M9_12_2   5
+#define M9_12_3   6
+#define M9_12_4   7
+#define M9_12_5   8
+#define M9_12_6   0
+#define M9_12_7   1
+
+#define M9_13_0   4
+#define M9_13_1   5
+#define M9_13_2   6
+#define M9_13_3   7
+#define M9_13_4   8
+#define M9_13_5   0
+#define M9_13_6   1
+#define M9_13_7   2
+
+#define M9_14_0   5
+#define M9_14_1   6
+#define M9_14_2   7
+#define M9_14_3   8
+#define M9_14_4   0
+#define M9_14_5   1
+#define M9_14_6   2
+#define M9_14_7   3
+
+#define M9_15_0   6
+#define M9_15_1   7
+#define M9_15_2   8
+#define M9_15_3   0
+#define M9_15_4   1
+#define M9_15_5   2
+#define M9_15_6   3
+#define M9_15_7   4
+
+#define M9_16_0   7
+#define M9_16_1   8
+#define M9_16_2   0
+#define M9_16_3   1
+#define M9_16_4   2
+#define M9_16_5   3
+#define M9_16_6   4
+#define M9_16_7   5
+
+#define M9_17_0   8
+#define M9_17_1   0
+#define M9_17_2   1
+#define M9_17_3   2
+#define M9_17_4   3
+#define M9_17_5   4
+#define M9_17_6   5
+#define M9_17_7   6
+
+#define M9_18_0   0
+#define M9_18_1   1
+#define M9_18_2   2
+#define M9_18_3   3
+#define M9_18_4   4
+#define M9_18_5   5
+#define M9_18_6   6
+#define M9_18_7   7
+
+/*
+ * M3_ ## s ## _ ## i  evaluates to (s+i) mod 3 (0 <= s <= 18, 0 <= i <= 1).
+ */
+
+#define M3_0_0    0
+#define M3_0_1    1
+#define M3_1_0    1
+#define M3_1_1    2
+#define M3_2_0    2
+#define M3_2_1    0
+#define M3_3_0    0
+#define M3_3_1    1
+#define M3_4_0    1
+#define M3_4_1    2
+#define M3_5_0    2
+#define M3_5_1    0
+#define M3_6_0    0
+#define M3_6_1    1
+#define M3_7_0    1
+#define M3_7_1    2
+#define M3_8_0    2
+#define M3_8_1    0
+#define M3_9_0    0
+#define M3_9_1    1
+#define M3_10_0   1
+#define M3_10_1   2
+#define M3_11_0   2
+#define M3_11_1   0
+#define M3_12_0   0
+#define M3_12_1   1
+#define M3_13_0   1
+#define M3_13_1   2
+#define M3_14_0   2
+#define M3_14_1   0
+#define M3_15_0   0
+#define M3_15_1   1
+#define M3_16_0   1
+#define M3_16_1   2
+#define M3_17_0   2
+#define M3_17_1   0
+#define M3_18_0   0
+#define M3_18_1   1
+
+#define XCAT(x, y)     XCAT_(x, y)	/* extra level so x and y are macro-expanded before being pasted */
+#define XCAT_(x, y)    x ## y	/* raw token paste of x and y */
+
+#if 0
+/* obsolete */
+#define SKSI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M5_, s), _), i))
+#define SKST(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
+#endif
+
+#define SKBI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))	/* names k<(s+i) mod 9>, e.g. SKBI(h, 1, 7) -> h8 */
+#define SKBT(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))	/* names t<(s+v) mod 3>, e.g. SKBT(t, 2, 1) -> t0 */
+
+#if 0
+/* obsolete */
+#define TFSMALL_KINIT(k0, k1, k2, k3, k4, t0, t1, t2)   do { \
+		k4 = (k0 ^ k1) ^ (k2 ^ k3) ^ SPH_C64(0x1BD11BDAA9FC1A22); \
+		t2 = t0 ^ t1; \
+	} while (0)
+#endif
+
+#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2)   do { /* derive the ninth key word k8 and the third tweak word t2 */ \
+		k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) /* XOR of all eight key words ... */ \
+			^ SPH_C64(0x1BD11BDAA9FC1A22); /* ... and a fixed constant (presumably the Threefish key-schedule constant; check the Skein spec) */ \
+		t2 = t0 ^ t1; /* t2 is the XOR of the two supplied tweak words */ \
+	} while (0)
+
+#if 0
+/* obsolete */
+#define TFSMALL_ADDKEY(w0, w1, w2, w3, k, t, s)   do { \
+		w0 = SPH_T64(w0 + SKSI(k, s, 0)); \
+		w1 = SPH_T64(w1 + SKSI(k, s, 1) + SKST(t, s, 0)); \
+		w2 = SPH_T64(w2 + SKSI(k, s, 2) + SKST(t, s, 1)); \
+		w3 = SPH_T64(w3 + SKSI(k, s, 3) + (sph_u64)s); \
+	} while (0)
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+
+#define TFBIG_ADDKEY(s, tt0, tt1)   do { /* add subkey number s to p0..p7; every parameter is parenthesized */ \
+		p0 = SPH_T64(p0 + h[(s) + 0]); /* key words are read from h[s..s+7]: UBI_BIG replicates */ \
+		p1 = SPH_T64(p1 + h[(s) + 1]); /* h[0..8] up to h[26], so no "mod 9" is needed here */ \
+		p2 = SPH_T64(p2 + h[(s) + 2]); \
+		p3 = SPH_T64(p3 + h[(s) + 3]); \
+		p4 = SPH_T64(p4 + h[(s) + 4]); \
+		p5 = SPH_T64(p5 + h[(s) + 5] + (tt0)); /* first tweak word for this subkey */ \
+		p6 = SPH_T64(p6 + h[(s) + 6] + (tt1)); /* second tweak word for this subkey */ \
+		p7 = SPH_T64(p7 + h[(s) + 7] + (sph_u64)(s)); /* subkey index; the cast now covers an argument like "s + 1" as a whole */ \
+	} while (0)
+
+#else
+
+#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s)   do { /* add subkey s to w0..w7; s is token-pasted, so it must be a literal 0..18 */ \
+		w0 = SPH_T64(w0 + SKBI(k, s, 0)); /* SKBI(k, s, j) names the variable k<(s+j) mod 9> */ \
+		w1 = SPH_T64(w1 + SKBI(k, s, 1)); \
+		w2 = SPH_T64(w2 + SKBI(k, s, 2)); \
+		w3 = SPH_T64(w3 + SKBI(k, s, 3)); \
+		w4 = SPH_T64(w4 + SKBI(k, s, 4)); \
+		w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); /* plus tweak word t<s mod 3> */ \
+		w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); /* plus tweak word t<(s+1) mod 3> */ \
+		w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)s); /* plus the subkey index itself */ \
+	} while (0)
+
+#endif
+
+#if 0
+/* obsolete */
+#define TFSMALL_MIX(x0, x1, rc)   do { \
+		x0 = SPH_T64(x0 + x1); \
+		x1 = SPH_ROTL64(x1, rc) ^ x0; \
+	} while (0)
+#endif
+
+#define TFBIG_MIX(x0, x1, rc)   do { /* add-rotate-xor step on the word pair (x0, x1) */ \
+		x0 = SPH_T64(x0 + x1); /* x0 += x1 (SPH_T64 presumably truncates to 64 bits) */ \
+		x1 = SPH_ROTL64(x1, rc) ^ x0; /* x1 = (x1 rotated left by rc) XOR the new x0 */ \
+	} while (0)
+
+#if 0
+/* obsolete */
+#define TFSMALL_MIX4(w0, w1, w2, w3, rc0, rc1)  do { \
+		TFSMALL_MIX(w0, w1, rc0); \
+		TFSMALL_MIX(w2, w3, rc1); \
+	} while (0)
+#endif
+
+#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { /* one round: TFBIG_MIX on four word pairs */ \
+		TFBIG_MIX(w0, w1, rc0); /* pair (w0, w1) with rotation rc0 */ \
+		TFBIG_MIX(w2, w3, rc1); /* pair (w2, w3) with rotation rc1 */ \
+		TFBIG_MIX(w4, w5, rc2); /* pair (w4, w5) with rotation rc2 */ \
+		TFBIG_MIX(w6, w7, rc3); /* pair (w6, w7) with rotation rc3 */ \
+	} while (0)
+
+#if 0
+/* obsolete */
+#define TFSMALL_4e(s)   do { \
+		TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \
+		TFSMALL_MIX4(p0, p1, p2, p3, 14, 16); \
+		TFSMALL_MIX4(p0, p3, p2, p1, 52, 57); \
+		TFSMALL_MIX4(p0, p1, p2, p3, 23, 40); \
+		TFSMALL_MIX4(p0, p3, p2, p1,  5, 37); \
+	} while (0)
+
+#define TFSMALL_4o(s)   do { \
+		TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \
+		TFSMALL_MIX4(p0, p1, p2, p3, 25, 33); \
+		TFSMALL_MIX4(p0, p3, p2, p1, 46, 12); \
+		TFSMALL_MIX4(p0, p1, p2, p3, 58, 22); \
+		TFSMALL_MIX4(p0, p3, p2, p1, 32, 32); \
+	} while (0)
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+
+#define TFBIG_4e(s)   do { /* even group: add subkey s, then four MIX8 rounds */ \
+		TFBIG_ADDKEY(s, t0, t1); /* even subkeys take tweak words t0 and t1 (UBI_BIG rotates them) */ \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); /* rotation constants: presumably from the Threefish-512 table; check the Skein spec */ \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); /* the word permutation is expressed by reordering the arguments */ \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
+	} while (0)
+
+#define TFBIG_4o(s)   do { /* odd group: add subkey s, then four MIX8 rounds */ \
+		TFBIG_ADDKEY(s, t1, t2); /* odd subkeys take tweak words t1 and t2 (UBI_BIG rotates them) */ \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); /* rotation constants: presumably from the Threefish-512 table; check the Skein spec */ \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); /* the word permutation is expressed by reordering the arguments */ \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
+	} while (0)
+
+#else
+
+#define TFBIG_4e(s)   do { /* even group: add subkey s (a literal), then four MIX8 rounds */ \
+		TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); /* key/tweak variables h<..>, t<..> are selected by token pasting */ \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); /* rotation constants: presumably from the Threefish-512 table; check the Skein spec */ \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); /* the word permutation is expressed by reordering the arguments */ \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
+	} while (0)
+
+#define TFBIG_4o(s)   do { /* odd group: add subkey s (a literal), then four MIX8 rounds */ \
+		TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); /* key/tweak variables h<..>, t<..> are selected by token pasting */ \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); /* rotation constants: presumably from the Threefish-512 table; check the Skein spec */ \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); /* the word permutation is expressed by reordering the arguments */ \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
+	} while (0)
+
+#endif
+
+#if 0
+/* obsolete */
+#define UBI_SMALL(etype, extra)  do { \
+		sph_u64 h4, t0, t1, t2; \
+		sph_u64 m0 = sph_dec64le(buf +  0); \
+		sph_u64 m1 = sph_dec64le(buf +  8); \
+		sph_u64 m2 = sph_dec64le(buf + 16); \
+		sph_u64 m3 = sph_dec64le(buf + 24); \
+		sph_u64 p0 = m0; \
+		sph_u64 p1 = m1; \
+		sph_u64 p2 = m2; \
+		sph_u64 p3 = m3; \
+		t0 = SPH_T64(bcount << 5) + (sph_u64)(extra); \
+		t1 = (bcount >> 59) + ((sph_u64)(etype) << 55); \
+		TFSMALL_KINIT(h0, h1, h2, h3, h4, t0, t1, t2); \
+		TFSMALL_4e(0); \
+		TFSMALL_4o(1); \
+		TFSMALL_4e(2); \
+		TFSMALL_4o(3); \
+		TFSMALL_4e(4); \
+		TFSMALL_4o(5); \
+		TFSMALL_4e(6); \
+		TFSMALL_4o(7); \
+		TFSMALL_4e(8); \
+		TFSMALL_4o(9); \
+		TFSMALL_4e(10); \
+		TFSMALL_4o(11); \
+		TFSMALL_4e(12); \
+		TFSMALL_4o(13); \
+		TFSMALL_4e(14); \
+		TFSMALL_4o(15); \
+		TFSMALL_4e(16); \
+		TFSMALL_4o(17); \
+		TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, 18); \
+		h0 = m0 ^ p0; \
+		h1 = m1 ^ p1; \
+		h2 = m2 ^ p2; \
+		h3 = m3 ^ p3; \
+	} while (0)
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+
+#define UBI_BIG(etype, extra)  do { /* process the 64-byte block at buf and update h[0..7]; uses buf, bcount and h[] from the caller's scope */ \
+		sph_u64 t0, t1, t2; \
+		unsigned u; \
+		sph_u64 m0 = sph_dec64le_aligned(buf +  0); /* m0..m7: the block read as eight 64-bit words (little-endian, going by the helper's name) */ \
+		sph_u64 m1 = sph_dec64le_aligned(buf +  8); \
+		sph_u64 m2 = sph_dec64le_aligned(buf + 16); \
+		sph_u64 m3 = sph_dec64le_aligned(buf + 24); \
+		sph_u64 m4 = sph_dec64le_aligned(buf + 32); \
+		sph_u64 m5 = sph_dec64le_aligned(buf + 40); \
+		sph_u64 m6 = sph_dec64le_aligned(buf + 48); \
+		sph_u64 m7 = sph_dec64le_aligned(buf + 56); \
+		sph_u64 p0 = m0; /* p0..p7: working state, starts as a copy of the block */ \
+		sph_u64 p1 = m1; \
+		sph_u64 p2 = m2; \
+		sph_u64 p3 = m3; \
+		sph_u64 p4 = m4; \
+		sph_u64 p5 = m5; \
+		sph_u64 p6 = m6; \
+		sph_u64 p7 = m7; \
+		t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); /* low tweak word: bcount * 64 + extra */ \
+		t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); /* high tweak word: bits shifted out of bcount << 6, plus etype placed from bit 55 up */ \
+		TFBIG_KINIT(h[0], h[1], h[2], h[3], h[4], h[5], \
+			h[6], h[7], h[8], t0, t1, t2); /* fills h[8] and t2 */ \
+		for (u = 0; u <= 15; u += 3) { /* extend the key so that h[j] == h[j % 9] for j up to 26 */ \
+			h[u +  9] = h[u + 0]; \
+			h[u + 10] = h[u + 1]; \
+			h[u + 11] = h[u + 2]; \
+		} \
+		for (u = 0; u < 9; u ++) { /* 9 iterations x 2 groups x 4 MIX8 rounds = 72 rounds */ \
+			sph_u64 s = u << 1; /* even subkey index handled by this iteration */ \
+			sph_u64 tmp; \
+			TFBIG_4e(s); \
+			TFBIG_4o(s + 1); \
+			tmp = t2; /* rotate (t0, t1, t2) -> (t2, t0, t1) so that the next two */ \
+			t2 = t1; /* subkeys again find their tweak words in t0/t1 and t1/t2 */ \
+			t1 = t0; \
+			t0 = tmp; \
+		} \
+		TFBIG_ADDKEY(18, t0, t1); /* final subkey (index 18); the tweak words are back in their original order */ \
+		h[0] = m0 ^ p0; /* new chaining words: input block XOR transformed state */ \
+		h[1] = m1 ^ p1; \
+		h[2] = m2 ^ p2; \
+		h[3] = m3 ^ p3; \
+		h[4] = m4 ^ p4; \
+		h[5] = m5 ^ p5; \
+		h[6] = m6 ^ p6; \
+		h[7] = m7 ^ p7; \
+	} while (0)
+
+#else
+
+#define UBI_BIG(etype, extra)  do { /* process the 64-byte block at buf and update h0..h7; uses buf, bcount and h0..h7 from the caller's scope */ \
+		sph_u64 h8, t0, t1, t2; \
+		sph_u64 m0 = sph_dec64le_aligned(buf +  0); /* m0..m7: the block read as eight 64-bit words (little-endian, going by the helper's name) */ \
+		sph_u64 m1 = sph_dec64le_aligned(buf +  8); \
+		sph_u64 m2 = sph_dec64le_aligned(buf + 16); \
+		sph_u64 m3 = sph_dec64le_aligned(buf + 24); \
+		sph_u64 m4 = sph_dec64le_aligned(buf + 32); \
+		sph_u64 m5 = sph_dec64le_aligned(buf + 40); \
+		sph_u64 m6 = sph_dec64le_aligned(buf + 48); \
+		sph_u64 m7 = sph_dec64le_aligned(buf + 56); \
+		sph_u64 p0 = m0; /* p0..p7: working state, starts as a copy of the block */ \
+		sph_u64 p1 = m1; \
+		sph_u64 p2 = m2; \
+		sph_u64 p3 = m3; \
+		sph_u64 p4 = m4; \
+		sph_u64 p5 = m5; \
+		sph_u64 p6 = m6; \
+		sph_u64 p7 = m7; \
+		t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); /* low tweak word: bcount * 64 + extra */ \
+		t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); /* high tweak word: bits shifted out of bcount << 6, plus etype placed from bit 55 up */ \
+		TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); /* fills h8 and t2 */ \
+		TFBIG_4e(0); /* fully unrolled: the subkey index must be a literal because it is token-pasted */ \
+		TFBIG_4o(1); \
+		TFBIG_4e(2); \
+		TFBIG_4o(3); \
+		TFBIG_4e(4); \
+		TFBIG_4o(5); \
+		TFBIG_4e(6); \
+		TFBIG_4o(7); \
+		TFBIG_4e(8); \
+		TFBIG_4o(9); \
+		TFBIG_4e(10); \
+		TFBIG_4o(11); \
+		TFBIG_4e(12); \
+		TFBIG_4o(13); \
+		TFBIG_4e(14); \
+		TFBIG_4o(15); \
+		TFBIG_4e(16); \
+		TFBIG_4o(17); /* 18 groups x 4 MIX8 rounds = 72 rounds */ \
+		TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); /* final subkey (index 18) */ \
+		h0 = m0 ^ p0; /* new chaining words: input block XOR transformed state */ \
+		h1 = m1 ^ p1; \
+		h2 = m2 ^ p2; \
+		h3 = m3 ^ p3; \
+		h4 = m4 ^ p4; \
+		h5 = m5 ^ p5; \
+		h6 = m6 ^ p6; \
+		h7 = m7 ^ p7; \
+	} while (0)
+
+#endif
+
+#if 0
+/* obsolete */
+#define DECL_STATE_SMALL \
+	sph_u64 h0, h1, h2, h3; \
+	sph_u64 bcount;
+
+#define READ_STATE_SMALL(sc)   do { \
+		h0 = (sc)->h0; \
+		h1 = (sc)->h1; \
+		h2 = (sc)->h2; \
+		h3 = (sc)->h3; \
+		bcount = sc->bcount; \
+	} while (0)
+
+#define WRITE_STATE_SMALL(sc)   do { \
+		(sc)->h0 = h0; \
+		(sc)->h1 = h1; \
+		(sc)->h2 = h2; \
+		(sc)->h3 = h3; \
+		sc->bcount = bcount; \
+	} while (0)
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+
+#define DECL_STATE_BIG /* local working copy of the hash state (small-footprint layout) */ \
+	sph_u64 h[27]; /* h[0..7] chaining words, h[8] extra key word, h[9..26] replicas filled in by UBI_BIG */ \
+	sph_u64 bcount;
+
+#define READ_STATE_BIG(sc)   do { /* load the context's chaining words and block counter into the DECL_STATE_BIG locals */ \
+		h[0] = (sc)->h0; \
+		h[1] = (sc)->h1; \
+		h[2] = (sc)->h2; \
+		h[3] = (sc)->h3; \
+		h[4] = (sc)->h4; \
+		h[5] = (sc)->h5; \
+		h[6] = (sc)->h6; \
+		h[7] = (sc)->h7; \
+		bcount = (sc)->bcount; /* parameter parenthesized, consistent with the h0..h7 accesses */ \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store the local chaining words and block counter back into the context */ \
+		(sc)->h0 = h[0]; \
+		(sc)->h1 = h[1]; \
+		(sc)->h2 = h[2]; \
+		(sc)->h3 = h[3]; \
+		(sc)->h4 = h[4]; \
+		(sc)->h5 = h[5]; \
+		(sc)->h6 = h[6]; \
+		(sc)->h7 = h[7]; \
+		(sc)->bcount = bcount; /* parameter parenthesized, consistent with the h0..h7 accesses */ \
+	} while (0)
+
+#else
+
+#define DECL_STATE_BIG /* local working copy of the hash state (unrolled layout) */ \
+	sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; /* chaining words; UBI_BIG declares h8 itself */ \
+	sph_u64 bcount;
+
+#define READ_STATE_BIG(sc)   do { /* load the context's chaining words and block counter into the DECL_STATE_BIG locals */ \
+		h0 = (sc)->h0; \
+		h1 = (sc)->h1; \
+		h2 = (sc)->h2; \
+		h3 = (sc)->h3; \
+		h4 = (sc)->h4; \
+		h5 = (sc)->h5; \
+		h6 = (sc)->h6; \
+		h7 = (sc)->h7; \
+		bcount = (sc)->bcount; /* parameter parenthesized, consistent with the h0..h7 accesses */ \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store the local chaining words and block counter back into the context */ \
+		(sc)->h0 = h0; \
+		(sc)->h1 = h1; \
+		(sc)->h2 = h2; \
+		(sc)->h3 = h3; \
+		(sc)->h4 = h4; \
+		(sc)->h5 = h5; \
+		(sc)->h6 = h6; \
+		(sc)->h7 = h7; \
+		(sc)->bcount = bcount; /* parameter parenthesized, consistent with the h0..h7 accesses */ \
+	} while (0)
+
+#endif
+
+#if 0
+/* obsolete */
+static void
+skein_small_init(sph_skein_small_context *sc, const sph_u64 *iv)
+{
+	sc->h0 = iv[0];
+	sc->h1 = iv[1];
+	sc->h2 = iv[2];
+	sc->h3 = iv[3];
+	sc->bcount = 0;
+	sc->ptr = 0;
+}
+#endif
+
+static void	/* put the context sc into its initial state for the variant whose IV is iv */
+skein_big_init(sph_skein_big_context *sc, const sph_u64 *iv)
+{
+	sc->h0 = iv[0];	/* iv must provide at least eight words: iv[0..7] -> h0..h7 */
+	sc->h1 = iv[1];
+	sc->h2 = iv[2];
+	sc->h3 = iv[3];
+	sc->h4 = iv[4];
+	sc->h5 = iv[5];
+	sc->h6 = iv[6];
+	sc->h7 = iv[7];
+	sc->bcount = 0;	/* no block compressed yet */
+	sc->ptr = 0;	/* nothing buffered in sc->buf yet */
+}
+
+#if 0
+/* obsolete */
+static void
+skein_small_core(sph_skein_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr, clen;
+	unsigned first;
+	DECL_STATE_SMALL
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	clen = (sizeof sc->buf) - ptr;
+	if (len <= clen) {
+		memcpy(buf + ptr, data, len);
+		sc->ptr = ptr + len;
+		return;
+	}
+	if (clen != 0) {
+		memcpy(buf + ptr, data, clen);
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+	}
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+
+	READ_STATE_SMALL(sc);
+	first = (bcount == 0) << 7;
+	for (;;) {
+		bcount ++;
+		UBI_SMALL(96 + first, 0);
+		if (len <= sizeof sc->buf)
+			break;
+		first = 0;
+		memcpy(buf, data, sizeof sc->buf);
+		data = (const unsigned char *)data + sizeof sc->buf;
+		len -= sizeof sc->buf;
+	}
+	WRITE_STATE_SMALL(sc);
+	sc->ptr = len;
+	memcpy(buf, data, len);
+
+#else
+
+	/*
+	 * Unrolling the loop yields a slight performance boost, while
+	 * keeping the code size around 24 kB on 32-bit x86.
+	 */
+	READ_STATE_SMALL(sc);
+	first = (bcount == 0) << 7;
+	for (;;) {
+		bcount ++;
+		UBI_SMALL(96 + first, 0);
+		if (len <= sizeof sc->buf)
+			break;
+		buf = (unsigned char *)data;
+		bcount ++;
+		UBI_SMALL(96, 0);
+		if (len <= 2 * sizeof sc->buf) {
+			data = buf + sizeof sc->buf;
+			len -= sizeof sc->buf;
+			break;
+		}
+		buf += sizeof sc->buf;
+		data = buf + sizeof sc->buf;
+		first = 0;
+		len -= 2 * sizeof sc->buf;
+	}
+	WRITE_STATE_SMALL(sc);
+	sc->ptr = len;
+	memcpy(sc->buf, data, len);
+
+#endif
+}
+#endif
+
+static void
+skein_big_core(sph_skein_big_context *sc, const void *data, size_t len)
+{
+	/*
+	 * Absorb len bytes from data into the context sc. Input is
+	 * buffered in sc->buf and a block is compressed only when the
+	 * buffer is full AND more input arrives: the Skein "final bit"
+	 * in the tweak must be set on the last block, which may be a
+	 * full block of message bits (Skein needs no padding bits for
+	 * a message that is a multiple of 512 bits), and this function
+	 * cannot know whether it is seeing the end of the message.
+	 * Hence up to a full block (64 bytes) may stay buffered.
+	 */
+	unsigned char *buf;
+	size_t ptr;
+	unsigned first;
+	DECL_STATE_BIG
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len <= (sizeof sc->buf) - ptr) {	/* everything fits: buffer it, compress nothing */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE_BIG(sc);
+	first = (bcount == 0) << 7;	/* 128 while no block has been compressed yet, else 0 */
+	do {
+		size_t clen;
+
+		if (ptr == sizeof sc->buf) {	/* buffer full and input still pending: compress it */
+			bcount ++;	/* counted before UBI_BIG, which uses bcount * 64 in the tweak */
+			UBI_BIG(96 + first, 0);	/* etype 96, plus 128 on the first block only */
+			first = 0;
+			ptr = 0;
+		}
+		clen = (sizeof sc->buf) - ptr;	/* free room in the buffer ... */
+		if (clen > len)
+			clen = len;	/* ... capped by the remaining input */
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+	} while (len > 0);
+	WRITE_STATE_BIG(sc);
+	sc->ptr = ptr;	/* between 1 and 64 bytes remain buffered at this point */
+}
+
+#if 0
+/* obsolete */
+static void
+skein_small_close(sph_skein_small_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	unsigned et;
+	int i;
+	DECL_STATE_SMALL
+
+	if (n != 0) {
+		unsigned z;
+		unsigned char x;
+
+		z = 0x80 >> n;
+		x = ((ub & -z) | z) & 0xFF;
+		skein_small_core(sc, &x, 1);
+	}
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	READ_STATE_SMALL(sc);
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	et = 352 + ((bcount == 0) << 7) + (n != 0);
+	for (i = 0; i < 2; i ++) {
+		UBI_SMALL(et, ptr);
+		if (i == 0) {
+			memset(buf, 0, sizeof sc->buf);
+			bcount = 0;
+			et = 510;
+			ptr = 8;
+		}
+	}
+
+	sph_enc64le_aligned(buf +  0, h0);
+	sph_enc64le_aligned(buf +  8, h1);
+	sph_enc64le_aligned(buf + 16, h2);
+	sph_enc64le_aligned(buf + 24, h3);
+	memcpy(dst, buf, out_len);
+}
+#endif
+
+static void
+skein_big_close(sph_skein_big_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	unsigned et;
+	int i;
+#if SPH_SMALL_FOOTPRINT_SKEIN
+	size_t u;
+#endif
+	DECL_STATE_BIG
+
+	/*
+	 * Finalize: absorb the n extra bits of ub (if n != 0), then write out_len bytes to dst.
+	 */
+	if (n != 0) {
+		unsigned z;
+		unsigned char x;
+
+		z = 0x80 >> n;	/* the bit just below the n most significant bits */
+		x = ((ub & -z) | z) & 0xFF;	/* keep the top n bits of ub, set the next bit, clear the rest */
+		skein_big_core(sc, &x, 1);
+	}
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+
+	/*
+	 * At that point, if ptr == 0, then the message was empty;
+	 * otherwise, there is between 1 and 64 bytes (inclusive) which
+	 * are yet to be processed. Either way, we complete the buffer
+	 * to a full block with zeros (the Skein specification mandates
+	 * that an empty message is padded so that there is at least
+	 * one block to process).
+	 *
+	 * Once this block has been processed, we do it again, with
+	 * a block full of zeros, for the output (that block contains
+	 * the encoding of "0", over 8 bytes, then padded with zeros).
+	 */
+	READ_STATE_BIG(sc);
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	et = 352 + ((bcount == 0) << 7) + (n != 0);	/* 352 = 256 + 96; +128 if this is also the first block; +1 if extra bits were added */
+	for (i = 0; i < 2; i ++) {
+		UBI_BIG(et, ptr);	/* pass 0: last message block (tweak offset bcount * 64 + ptr); pass 1: output block */
+		if (i == 0) {
+			memset(buf, 0, sizeof sc->buf);	/* the output block is all zeros */
+			bcount = 0;
+			et = 510;	/* 510 = 256 + 128 + 126; presumably final + first + output type, check the Skein spec */
+			ptr = 8;	/* tweak offset for the 8-byte counter in the output block */
+		}
+	}
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+
+	/*
+	 * We use a temporary buffer because we must support the case
+	 * where output size is not a multiple of 64 (namely, a 224-bit
+	 * output).
+	 */
+	for (u = 0; u < out_len; u += 8)
+		sph_enc64le_aligned(buf + u, h[u >> 3]);	/* whole 8-byte words go into buf first */
+	memcpy(dst, buf, out_len);	/* then exactly out_len bytes are copied out */
+
+#else
+
+	sph_enc64le_aligned(buf +  0, h0);	/* encode all eight words into buf, then truncate on copy */
+	sph_enc64le_aligned(buf +  8, h1);
+	sph_enc64le_aligned(buf + 16, h2);
+	sph_enc64le_aligned(buf + 24, h3);
+	sph_enc64le_aligned(buf + 32, h4);
+	sph_enc64le_aligned(buf + 40, h5);
+	sph_enc64le_aligned(buf + 48, h6);
+	sph_enc64le_aligned(buf + 56, h7);
+	memcpy(dst, buf, out_len);	/* only the first out_len bytes reach dst */
+
+#endif
+}
+
+#if 0
+/* obsolete */
+static const sph_u64 IV224[] = {
+	SPH_C64(0xC6098A8C9AE5EA0B), SPH_C64(0x876D568608C5191C),
+	SPH_C64(0x99CB88D7D7F53884), SPH_C64(0x384BDDB1AEDDB5DE)
+};
+
+static const sph_u64 IV256[] = {
+	SPH_C64(0xFC9DA860D048B449), SPH_C64(0x2FCA66479FA7D833),
+	SPH_C64(0xB33BC3896656840F), SPH_C64(0x6A54E920FDE8DA69)
+};
+#endif
+
+static const sph_u64 IV224[] = {	/* eight initial words h0..h7, loaded by sph_skein224_init */
+	SPH_C64(0xCCD0616248677224), SPH_C64(0xCBA65CF3A92339EF),
+	SPH_C64(0x8CCD69D652FF4B64), SPH_C64(0x398AED7B3AB890B4),
+	SPH_C64(0x0F59D1B1457D2BD0), SPH_C64(0x6776FE6575D4EB3D),
+	SPH_C64(0x99FBC70E997413E9), SPH_C64(0x9E2CFCCFE1C41EF7)
+};
+
+static const sph_u64 IV256[] = {	/* eight initial words h0..h7, loaded by sph_skein256_init */
+	SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
+	SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
+	SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
+	SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
+};
+
+static const sph_u64 IV384[] = {	/* eight initial words h0..h7, loaded by sph_skein384_init */
+	SPH_C64(0xA3F6C6BF3A75EF5F), SPH_C64(0xB0FEF9CCFD84FAA4),
+	SPH_C64(0x9D77DD663D770CFE), SPH_C64(0xD798CBF3B468FDDA),
+	SPH_C64(0x1BC4A6668A0E4465), SPH_C64(0x7ED7D434E5807407),
+	SPH_C64(0x548FC1ACD4EC44D6), SPH_C64(0x266E17546AA18FF8)
+};
+
+static const sph_u64 IV512[] = {	/* eight initial words h0..h7, loaded by sph_skein512_init */
+	SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
+	SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
+	SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
+	SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
+};
+
+#if 0
+/* obsolete */
+/* see sph_skein.h */
+void
+sph_skein224_init(void *cc)
+{
+	skein_small_init(cc, IV224);
+}
+
+/* see sph_skein.h */
+void
+sph_skein224(void *cc, const void *data, size_t len)
+{
+	skein_small_core(cc, data, len);
+}
+
+/* see sph_skein.h */
+void
+sph_skein224_close(void *cc, void *dst)
+{
+	sph_skein224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_skein.h */
+void
+sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_small_close(cc, ub, n, dst, 28);
+	sph_skein224_init(cc);
+}
+
+/* see sph_skein.h */
+void
+sph_skein256_init(void *cc)
+{
+	skein_small_init(cc, IV256);
+}
+
+/* see sph_skein.h */
+void
+sph_skein256(void *cc, const void *data, size_t len)
+{
+	skein_small_core(cc, data, len);
+}
+
+/* see sph_skein.h */
+void
+sph_skein256_close(void *cc, void *dst)
+{
+	sph_skein256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_skein.h */
+void
+sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_small_close(cc, ub, n, dst, 32);
+	sph_skein256_init(cc);
+}
+#endif
+
+/* Skein-224 init: loads IV224 into cc and zeroes its block counter and buffer index; see sph_skein.h */
+void
+sph_skein224_init(void *cc)
+{
+	skein_big_init(cc, IV224);
+}
+
+/* Skein-224 update: absorbs len bytes from data through skein_big_core; see sph_skein.h */
+void
+sph_skein224(void *cc, const void *data, size_t len)
+{
+	skein_big_core(cc, data, len);
+}
+
+/* Skein-224 close with no extra bits (ub = 0, n = 0); see sph_skein.h */
+void
+sph_skein224_close(void *cc, void *dst)
+{
+	sph_skein224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Skein-224 close: absorbs the top n bits of ub, writes 28 bytes to dst, then re-inits cc; see sph_skein.h */
+void
+sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_big_close(cc, ub, n, dst, 28);
+	sph_skein224_init(cc);
+}
+
+/* Skein-256 init: loads IV256 into cc and zeroes its block counter and buffer index; see sph_skein.h */
+void
+sph_skein256_init(void *cc)
+{
+	skein_big_init(cc, IV256);
+}
+
+/* Skein-256 update: absorbs len bytes from data through skein_big_core; see sph_skein.h */
+void
+sph_skein256(void *cc, const void *data, size_t len)
+{
+	skein_big_core(cc, data, len);
+}
+
+/* Skein-256 close with no extra bits (ub = 0, n = 0); see sph_skein.h */
+void
+sph_skein256_close(void *cc, void *dst)
+{
+	sph_skein256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Skein-256 close: absorbs the top n bits of ub, writes 32 bytes to dst, then re-inits cc; see sph_skein.h */
+void
+sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_big_close(cc, ub, n, dst, 32);
+	sph_skein256_init(cc);
+}
+
+/* Skein-384 init: loads IV384 into cc and zeroes its block counter and buffer index; see sph_skein.h */
+void
+sph_skein384_init(void *cc)
+{
+	skein_big_init(cc, IV384);
+}
+
+/* Skein-384 update: absorbs len bytes from data through skein_big_core; see sph_skein.h */
+void
+sph_skein384(void *cc, const void *data, size_t len)
+{
+	skein_big_core(cc, data, len);
+}
+
+/* Skein-384 close with no extra bits (ub = 0, n = 0); see sph_skein.h */
+void
+sph_skein384_close(void *cc, void *dst)
+{
+	sph_skein384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Skein-384 close: absorbs the top n bits of ub, writes 48 bytes to dst, then re-inits cc; see sph_skein.h */
+void
+sph_skein384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_big_close(cc, ub, n, dst, 48);
+	sph_skein384_init(cc);
+}
+
+/* Skein-512 init: loads IV512 into cc and zeroes its block counter and buffer index; see sph_skein.h */
+void
+sph_skein512_init(void *cc)
+{
+	skein_big_init(cc, IV512);
+}
+
+/* Skein-512 update: absorbs len bytes from data through skein_big_core; see sph_skein.h */
+void
+sph_skein512(void *cc, const void *data, size_t len)
+{
+	skein_big_core(cc, data, len);
+}
+
+/* Skein-512 close with no extra bits (ub = 0, n = 0); see sph_skein.h */
+void
+sph_skein512_close(void *cc, void *dst)
+{
+	sph_skein512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Skein-512 close: absorbs the top n bits of ub, writes 64 bytes to dst, then re-inits cc; see sph_skein.h */
+void
+sph_skein512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_big_close(cc, ub, n, dst, 64);
+	sph_skein512_init(cc);
+}
+
+#endif
diff --git a/sph/sph_blake.h b/sph/sph_blake.h
new file mode 100644
index 00000000..c3829cad
--- /dev/null
+++ b/sph/sph_blake.h
@@ -0,0 +1,319 @@
+/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
+/**
+ * BLAKE interface. BLAKE is a family of functions which differ by their
+ * output size; this implementation defines BLAKE for output sizes 224,
+ * 256, 384 and 512 bits. This implementation conforms to the "third
+ * round" specification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_blake.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_BLAKE_H__
+#define SPH_BLAKE_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for BLAKE-224.
+ */
+#define SPH_SIZE_blake224   224
+
+/**
+ * Output size (in bits) for BLAKE-256.
+ */
+#define SPH_SIZE_blake256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BLAKE-384.
+ */
+#define SPH_SIZE_blake384   384
+
+/**
+ * Output size (in bits) for BLAKE-512.
+ */
+#define SPH_SIZE_blake512   512
+
+#endif
+
+/**
+ * This structure is a context for BLAKE-224 and BLAKE-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;    /* presumably bytes pending in buf; confirm in blake.c */
+	sph_u32 H[8];    /* presumably the chaining value; confirm in blake.c */
+	sph_u32 S[4];    /* presumably the salt words; confirm in blake.c */
+	sph_u32 T0, T1;    /* presumably the counter words; confirm in blake.c */
+#endif
+} sph_blake_small_context;
+
+/**
+ * This structure is a context for BLAKE-224 computations. It is
+ * identical to the common <code>sph_blake_small_context</code>.
+ */
+typedef sph_blake_small_context sph_blake224_context;
+
+/**
+ * This structure is a context for BLAKE-256 computations. It is
+ * identical to the common <code>sph_blake_small_context</code>.
+ */
+typedef sph_blake_small_context sph_blake256_context;
+
+#if SPH_64
+
+/**
+ * This structure is a context for BLAKE-384 and BLAKE-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;    /* presumably bytes pending in buf; confirm in blake.c */
+	sph_u64 H[8];    /* presumably the chaining value; confirm in blake.c */
+	sph_u64 S[4];    /* presumably the salt words; confirm in blake.c */
+	sph_u64 T0, T1;    /* presumably the counter words; confirm in blake.c */
+#endif
+} sph_blake_big_context;
+
+/**
+ * This structure is a context for BLAKE-384 computations. It is
+ * identical to the common <code>sph_blake_big_context</code>.
+ */
+typedef sph_blake_big_context sph_blake384_context;
+
+/**
+ * This structure is a context for BLAKE-512 computations. It is
+ * identical to the common <code>sph_blake_big_context</code>.
+ */
+typedef sph_blake_big_context sph_blake512_context;
+
+#endif
+
+/**
+ * Initialize a BLAKE-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-224 context (pointer to a
+ *             <code>sph_blake224_context</code>)
+ */
+void sph_blake224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param dst   the destination buffer
+ */
+void sph_blake224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BLAKE-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-256 context (pointer to a
+ *             <code>sph_blake256_context</code>)
+ */
+void sph_blake256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param dst   the destination buffer
+ */
+void sph_blake256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#if SPH_64
+
+/**
+ * Initialize a BLAKE-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-384 context (pointer to a
+ *             <code>sph_blake384_context</code>)
+ */
+void sph_blake384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param dst   the destination buffer
+ */
+void sph_blake384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BLAKE-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-512 context (pointer to a
+ *             <code>sph_blake512_context</code>)
+ */
+void sph_blake512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param dst   the destination buffer
+ */
+void sph_blake512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#endif
diff --git a/sph/sph_bmw.h b/sph/sph_bmw.h
new file mode 100644
index 00000000..484a2a74
--- /dev/null
+++ b/sph/sph_bmw.h
@@ -0,0 +1,320 @@
+/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * BMW interface. BMW (aka "Blue Midnight Wish") is a family of
+ * functions which differ by their output size; this implementation
+ * defines BMW for output sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_bmw.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_BMW_H__
+#define SPH_BMW_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for BMW-224.
+ */
+#define SPH_SIZE_bmw224   224
+
+/**
+ * Output size (in bits) for BMW-256.
+ */
+#define SPH_SIZE_bmw256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BMW-384.
+ */
+#define SPH_SIZE_bmw384   384
+
+/**
+ * Output size (in bits) for BMW-512.
+ */
+#define SPH_SIZE_bmw512   512
+
+#endif
+
+/**
+ * This structure is a context for BMW-224 and BMW-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BMW computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BMW
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;    /* presumably bytes pending in buf; confirm in bmw.c */
+	sph_u32 H[16];    /* presumably the chaining value; confirm in bmw.c */
+#if SPH_64
+	sph_u64 bit_count;    /* bit count kept in one sph_u64 when SPH_64 is set */
+#else
+	sph_u32 bit_count_high, bit_count_low;    /* else split over two sph_u32 */
+#endif
+#endif
+} sph_bmw_small_context;
+
+/**
+ * This structure is a context for BMW-224 computations. It is
+ * identical to the common <code>sph_bmw_small_context</code>.
+ */
+typedef sph_bmw_small_context sph_bmw224_context;
+
+/**
+ * This structure is a context for BMW-256 computations. It is
+ * identical to the common <code>sph_bmw_small_context</code>.
+ */
+typedef sph_bmw_small_context sph_bmw256_context;
+
+#if SPH_64
+
+/**
+ * This structure is a context for BMW-384 and BMW-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BMW computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BMW
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;    /* presumably bytes pending in buf; confirm in bmw.c */
+	sph_u64 H[16];    /* presumably the chaining value; confirm in bmw.c */
+	sph_u64 bit_count;    /* bit count; this struct only exists when SPH_64 is set */
+#endif
+} sph_bmw_big_context;
+
+/**
+ * This structure is a context for BMW-384 computations. It is
+ * identical to the common <code>sph_bmw_big_context</code>.
+ */
+typedef sph_bmw_big_context sph_bmw384_context;
+
+/**
+ * This structure is a context for BMW-512 computations. It is
+ * identical to the common <code>sph_bmw_big_context</code>.
+ */
+typedef sph_bmw_big_context sph_bmw512_context;
+
+#endif
+
+/**
+ * Initialize a BMW-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-224 context (pointer to a
+ *             <code>sph_bmw224_context</code>)
+ */
+void sph_bmw224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BMW-224 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BMW-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BMW-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-256 context (pointer to a
+ *             <code>sph_bmw256_context</code>)
+ */
+void sph_bmw256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BMW-256 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BMW-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#if SPH_64
+
+/**
+ * Initialize a BMW-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-384 context (pointer to a
+ *             <code>sph_bmw384_context</code>)
+ */
+void sph_bmw384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BMW-384 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BMW-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BMW-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-512 context (pointer to a
+ *             <code>sph_bmw512_context</code>)
+ */
+void sph_bmw512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BMW-512 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BMW-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#endif
diff --git a/sph/sph_cubehash.h b/sph/sph_cubehash.h
new file mode 100644
index 00000000..c6636748
--- /dev/null
+++ b/sph/sph_cubehash.h
@@ -0,0 +1,285 @@
+/* $Id: sph_cubehash.h 180 2010-05-08 02:29:25Z tp $ */
+/**
+ * CubeHash interface. CubeHash is a family of functions which differ by
+ * their output size; this implementation defines CubeHash for output
+ * sizes 224, 256, 384 and 512 bits, with the "standard parameters"
+ * (CubeHash16/32 with the CubeHash specification notations).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_cubehash.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_CUBEHASH_H__
+#define SPH_CUBEHASH_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for CubeHash-224.
+ */
+#define SPH_SIZE_cubehash224   224
+
+/**
+ * Output size (in bits) for CubeHash-256.
+ */
+#define SPH_SIZE_cubehash256   256
+
+/**
+ * Output size (in bits) for CubeHash-384.
+ */
+#define SPH_SIZE_cubehash384   384
+
+/**
+ * Output size (in bits) for CubeHash-512.
+ */
+#define SPH_SIZE_cubehash512   512
+
+/**
+ * This structure is a context for CubeHash computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a CubeHash computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running CubeHash computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment */
+	size_t ptr;    /* presumably bytes pending in buf; confirm in cubehash.c */
+	sph_u32 state[32];    /* 32 words; presumably the CubeHash state; confirm in cubehash.c */
+#endif
+} sph_cubehash_context;
+
+/**
+ * Type for a CubeHash-224 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash224_context;
+
+/**
+ * Type for a CubeHash-256 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash256_context;
+
+/**
+ * Type for a CubeHash-384 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash384_context;
+
+/**
+ * Type for a CubeHash-512 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash512_context;
+
+/**
+ * Initialize a CubeHash-224 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-224 context (pointer to a
+ *             <code>sph_cubehash224_context</code>)
+ */
+void sph_cubehash224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-224 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-256 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-256 context (pointer to a
+ *             <code>sph_cubehash256_context</code>)
+ */
+void sph_cubehash256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-256 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-384 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-384 context (pointer to a
+ *             <code>sph_cubehash384_context</code>)
+ */
+void sph_cubehash384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-384 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-512 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-512 context (pointer to a
+ *             <code>sph_cubehash512_context</code>)
+ */
+void sph_cubehash512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-512 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
diff --git a/sph/sph_echo.h b/sph/sph_echo.h
new file mode 100644
index 00000000..ff2ba2ec
--- /dev/null
+++ b/sph/sph_echo.h
@@ -0,0 +1,312 @@
+/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * ECHO interface. ECHO is a family of functions which differ by
+ * their output size; this implementation defines ECHO for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_echo.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_ECHO_H__
+#define SPH_ECHO_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for ECHO-224.
+ */
+#define SPH_SIZE_echo224   224
+
+/**
+ * Output size (in bits) for ECHO-256.
+ */
+#define SPH_SIZE_echo256   256
+
+/**
+ * Output size (in bits) for ECHO-384.
+ */
+#define SPH_SIZE_echo384   384
+
+/**
+ * Output size (in bits) for ECHO-512.
+ */
+#define SPH_SIZE_echo512   512
+
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-224
+ * and ECHO-256.
+ *
+ * The contents of this structure are private. A running ECHO computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[192];    /* first field, for alignment */
+	size_t ptr;    /* presumably the fill level of buf -- confirm in echo.c */
+	union {    /* Vs and Vb share the same storage */
+		sph_u32 Vs[4][4];
+#if SPH_64
+		sph_u64 Vb[4][2];    /* only declared when SPH_64 is non-zero */
+#endif /* SPH_64 */
+	} u;
+	sph_u32 C0, C1, C2, C3;    /* NOTE(review): looks like a four-word counter -- confirm in echo.c */
+#endif /* !DOXYGEN_IGNORE */
+} sph_echo_small_context;
+
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-384
+ * and ECHO-512.
+ *
+ * The contents of this structure are private. A running ECHO computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;    /* presumably the fill level of buf -- confirm in echo.c */
+	union {    /* Vs and Vb share the same storage */
+		sph_u32 Vs[8][4];
+#if SPH_64
+		sph_u64 Vb[8][2];    /* only declared when SPH_64 is non-zero */
+#endif /* SPH_64 */
+	} u;
+	sph_u32 C0, C1, C2, C3;    /* NOTE(review): looks like a four-word counter -- confirm in echo.c */
+#endif /* !DOXYGEN_IGNORE */
+} sph_echo_big_context;
+
+/**
+ * Type for an ECHO-224 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo224_context;
+
+/**
+ * Type for an ECHO-256 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo256_context;
+
+/**
+ * Type for an ECHO-384 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo384_context;
+
+/**
+ * Type for an ECHO-512 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo512_context;
+
+/**
+ * Initialize an ECHO-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-224 context (pointer to a
+ *             <code>sph_echo224_context</code>)
+ */
+void sph_echo224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param dst   the destination buffer
+ */
+void sph_echo224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-256 context (pointer to a
+ *             <code>sph_echo256_context</code>)
+ */
+void sph_echo256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param dst   the destination buffer
+ */
+void sph_echo256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-384 context (pointer to a
+ *             <code>sph_echo384_context</code>)
+ */
+void sph_echo384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param dst   the destination buffer
+ */
+void sph_echo384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-512 context (pointer to a
+ *             <code>sph_echo512_context</code>)
+ */
+void sph_echo512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param dst   the destination buffer
+ */
+void sph_echo512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
diff --git a/sph/sph_groestl.h b/sph/sph_groestl.h
new file mode 100644
index 00000000..28832f91
--- /dev/null
+++ b/sph/sph_groestl.h
@@ -0,0 +1,321 @@
+/* $Id: sph_groestl.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Groestl interface. This code implements Groestl with the recommended
+ * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_groestl.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_GROESTL_H__
+#define SPH_GROESTL_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for Groestl-224.
+ */
+#define SPH_SIZE_groestl224   224
+
+/**
+ * Output size (in bits) for Groestl-256.
+ */
+#define SPH_SIZE_groestl256   256
+
+/**
+ * Output size (in bits) for Groestl-384.
+ */
+#define SPH_SIZE_groestl384   384
+
+/**
+ * Output size (in bits) for Groestl-512.
+ */
+#define SPH_SIZE_groestl512   512
+
+/**
+ * This structure is a context for Groestl-224 and Groestl-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a Groestl computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running Groestl
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;    /* presumably the fill level of buf -- confirm in groestl.c */
+	union {    /* wide and narrow share the same storage */
+#if SPH_64
+		sph_u64 wide[8];    /* only declared when SPH_64 is non-zero */
+#endif /* SPH_64 */
+		sph_u32 narrow[16];
+	} state;
+#if SPH_64
+	sph_u64 count;    /* single-word counter when a 64-bit type is available */
+#else /* !SPH_64 */
+	sph_u32 count_high, count_low;    /* presumably the same counter split into two words -- confirm */
+#endif /* SPH_64 */
+#endif /* !DOXYGEN_IGNORE */
+} sph_groestl_small_context;
+
+/**
+ * This structure is a context for Groestl-224 computations. It is
+ * identical to the common <code>sph_groestl_small_context</code>.
+ */
+typedef sph_groestl_small_context sph_groestl224_context;
+
+/**
+ * This structure is a context for Groestl-256 computations. It is
+ * identical to the common <code>sph_groestl_small_context</code>.
+ */
+typedef sph_groestl_small_context sph_groestl256_context;
+
+/**
+ * This structure is a context for Groestl-384 and Groestl-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a Groestl computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running Groestl
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;    /* presumably the fill level of buf -- confirm in groestl.c */
+	union {    /* wide and narrow share the same storage */
+#if SPH_64
+		sph_u64 wide[16];    /* only declared when SPH_64 is non-zero */
+#endif /* SPH_64 */
+		sph_u32 narrow[32];
+	} state;
+#if SPH_64
+	sph_u64 count;    /* single-word counter when a 64-bit type is available */
+#else /* !SPH_64 */
+	sph_u32 count_high, count_low;    /* presumably the same counter split into two words -- confirm */
+#endif /* SPH_64 */
+#endif /* !DOXYGEN_IGNORE */
+} sph_groestl_big_context;
+
+/**
+ * This structure is a context for Groestl-384 computations. It is
+ * identical to the common <code>sph_groestl_big_context</code>.
+ */
+typedef sph_groestl_big_context sph_groestl384_context;
+
+/**
+ * This structure is a context for Groestl-512 computations. It is
+ * identical to the common <code>sph_groestl_big_context</code>.
+ */
+typedef sph_groestl_big_context sph_groestl512_context;
+
+/**
+ * Initialize a Groestl-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-224 context (pointer to a
+ *             <code>sph_groestl224_context</code>)
+ */
+void sph_groestl224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-224 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Groestl-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-256 context (pointer to a
+ *             <code>sph_groestl256_context</code>)
+ */
+void sph_groestl256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-256 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Groestl-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-384 context (pointer to a
+ *             <code>sph_groestl384_context</code>)
+ */
+void sph_groestl384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-384 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Groestl-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-512 context (pointer to a
+ *             <code>sph_groestl512_context</code>)
+ */
+void sph_groestl512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-512 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
diff --git a/sph/sph_jh.h b/sph/sph_jh.h
new file mode 100644
index 00000000..02684061
--- /dev/null
+++ b/sph/sph_jh.h
@@ -0,0 +1,290 @@
+/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * JH interface. JH is a family of functions which differ by
+ * their output size; this implementation defines JH for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_jh.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_JH_H__
+#define SPH_JH_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for JH-224.
+ */
+#define SPH_SIZE_jh224   224
+
+/**
+ * Output size (in bits) for JH-256.
+ */
+#define SPH_SIZE_jh256   256
+
+/**
+ * Output size (in bits) for JH-384.
+ */
+#define SPH_SIZE_jh384   384
+
+/**
+ * Output size (in bits) for JH-512.
+ */
+#define SPH_SIZE_jh512   512
+
+/**
+ * This structure is a context for JH computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a JH computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running JH computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;    /* presumably the fill level of buf -- confirm in jh.c */
+	union {    /* wide and narrow share the same storage */
+#if SPH_64
+		sph_u64 wide[16];    /* only declared when SPH_64 is non-zero */
+#endif /* SPH_64 */
+		sph_u32 narrow[32];
+	} H;
+#if SPH_64
+	sph_u64 block_count;    /* single-word counter when a 64-bit type is available */
+#else /* !SPH_64 */
+	sph_u32 block_count_high, block_count_low;    /* presumably the same counter split into two words -- confirm */
+#endif /* SPH_64 */
+#endif /* !DOXYGEN_IGNORE */
+} sph_jh_context;
+
+/**
+ * Type for a JH-224 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh224_context;
+
+/**
+ * Type for a JH-256 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh256_context;
+
+/**
+ * Type for a JH-384 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh384_context;
+
+/**
+ * Type for a JH-512 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh512_context;
+
+/**
+ * Initialize a JH-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the JH-224 context (pointer to a
+ *             <code>sph_jh224_context</code>)
+ */
+void sph_jh224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the JH-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_jh224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the JH-224 context
+ * @param dst   the destination buffer
+ */
+void sph_jh224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the JH-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_jh224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a JH-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the JH-256 context (pointer to a
+ *             <code>sph_jh256_context</code>)
+ */
+void sph_jh256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the JH-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_jh256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the JH-256 context
+ * @param dst   the destination buffer
+ */
+void sph_jh256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the JH-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_jh256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a JH-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the JH-384 context (pointer to a
+ *             <code>sph_jh384_context</code>)
+ */
+void sph_jh384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the JH-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_jh384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the JH-384 context
+ * @param dst   the destination buffer
+ */
+void sph_jh384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the JH-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_jh384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a JH-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the JH-512 context (pointer to a
+ *             <code>sph_jh512_context</code>)
+ */
+void sph_jh512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the JH-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_jh512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the JH-512 context
+ * @param dst   the destination buffer
+ */
+void sph_jh512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the JH-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_jh512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
diff --git a/sph/sph_keccak.h b/sph/sph_keccak.h
new file mode 100644
index 00000000..6a719bde
--- /dev/null
+++ b/sph/sph_keccak.h
@@ -0,0 +1,285 @@
+/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Keccak interface. This is the interface for Keccak with the
+ * recommended parameters for SHA-3, with output lengths 224, 256,
+ * 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_keccak.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_KECCAK_H__
+#define SPH_KECCAK_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for Keccak-224.
+ */
+#define SPH_SIZE_keccak224   224
+
+/**
+ * Output size (in bits) for Keccak-256.
+ */
+#define SPH_SIZE_keccak256   256
+
+/**
+ * Output size (in bits) for Keccak-384.
+ */
+#define SPH_SIZE_keccak384   384
+
+/**
+ * Output size (in bits) for Keccak-512.
+ */
+#define SPH_SIZE_keccak512   512
+
+/**
+ * This structure is a context for Keccak computations: it contains the
+ * intermediate values and some data from the last entered block. Once a
+ * Keccak computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running Keccak computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[144];    /* first field, for alignment */
+	size_t ptr, lim;    /* NOTE(review): presumably fill offset and usable length of buf -- confirm in keccak.c */
+	union {
+#if SPH_64
+		sph_u64 wide[25];    /* only present when SPH_64 is non-zero */
+#endif
+		sph_u32 narrow[50];    /* shares storage with wide[] (union); 50 = 2 x 25 words */
+	} u;
+#endif
+} sph_keccak_context;
+
+/**
+ * Type for a Keccak-224 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak224_context;
+
+/**
+ * Type for a Keccak-256 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak256_context;
+
+/**
+ * Type for a Keccak-384 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak384_context;
+
+/**
+ * Type for a Keccak-512 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak512_context;
+
+/**
+ * Initialize a Keccak-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Keccak-224 context (pointer to a
+ *             <code>sph_keccak224_context</code>)
+ */
+void sph_keccak224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Keccak-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_keccak224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Keccak-224 context
+ * @param dst   the destination buffer
+ */
+void sph_keccak224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Keccak-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_keccak224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Keccak-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the Keccak-256 context (pointer to a
+ *             <code>sph_keccak256_context</code>)
+ */
+void sph_keccak256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Keccak-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_keccak256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Keccak-256 context
+ * @param dst   the destination buffer
+ */
+void sph_keccak256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Keccak-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_keccak256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Keccak-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Keccak-384 context (pointer to a
+ *             <code>sph_keccak384_context</code>)
+ */
+void sph_keccak384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Keccak-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_keccak384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Keccak-384 context
+ * @param dst   the destination buffer
+ */
+void sph_keccak384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Keccak-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_keccak384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Keccak-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Keccak-512 context (pointer to a
+ *             <code>sph_keccak512_context</code>)
+ */
+void sph_keccak512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Keccak-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_keccak512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Keccak-512 context
+ * @param dst   the destination buffer
+ */
+void sph_keccak512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Keccak-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_keccak512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
diff --git a/sph/sph_luffa.h b/sph/sph_luffa.h
new file mode 100644
index 00000000..7d628391
--- /dev/null
+++ b/sph/sph_luffa.h
@@ -0,0 +1,288 @@
+/* $Id: sph_luffa.h 154 2010-04-26 17:00:24Z tp $ */
+/**
+ * Luffa interface. Luffa is a family of functions which differ by
+ * their output size; this implementation defines Luffa for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_luffa.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_LUFFA_H__
+#define SPH_LUFFA_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for Luffa-224.
+ */
+#define SPH_SIZE_luffa224   224
+
+/**
+ * Output size (in bits) for Luffa-256.
+ */
+#define SPH_SIZE_luffa256   256
+
+/**
+ * Output size (in bits) for Luffa-384.
+ */
+#define SPH_SIZE_luffa384   384
+
+/**
+ * Output size (in bits) for Luffa-512.
+ */
+#define SPH_SIZE_luffa512   512
+
+/**
+ * This structure is a context for Luffa-224 computations: it contains
+ * the intermediate values and some data from the last entered block.
+ * Once a Luffa computation has been performed, the context can be
+ * reused for another computation.
+ *
+ * The contents of this structure are private. A running Luffa
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment */
+	size_t ptr;    /* NOTE(review): presumably the fill offset into buf -- confirm in luffa.c */
+	sph_u32 V[3][8];    /* 3 rows of 8 words; presumably the intermediate values -- confirm */
+#endif
+} sph_luffa224_context;
+
+/**
+ * This structure is a context for Luffa-256 computations. It is
+ * identical to <code>sph_luffa224_context</code>.
+ */
+typedef sph_luffa224_context sph_luffa256_context;
+
+/**
+ * This structure is a context for Luffa-384 computations.
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment */
+	size_t ptr;    /* NOTE(review): presumably the fill offset into buf -- confirm in luffa.c */
+	sph_u32 V[4][8];    /* 4 rows of 8 words (one more row than the 224/256 context) */
+#endif
+} sph_luffa384_context;
+
+/**
+ * This structure is a context for Luffa-512 computations.
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment */
+	size_t ptr;    /* NOTE(review): presumably the fill offset into buf -- confirm in luffa.c */
+	sph_u32 V[5][8];    /* 5 rows of 8 words (two more rows than the 224/256 context) */
+#endif
+} sph_luffa512_context;
+
+/**
+ * Initialize a Luffa-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Luffa-224 context (pointer to a
+ *             <code>sph_luffa224_context</code>)
+ */
+void sph_luffa224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Luffa-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_luffa224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Luffa-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Luffa-224 context
+ * @param dst   the destination buffer
+ */
+void sph_luffa224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Luffa-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_luffa224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Luffa-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the Luffa-256 context (pointer to a
+ *             <code>sph_luffa256_context</code>)
+ */
+void sph_luffa256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Luffa-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_luffa256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Luffa-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Luffa-256 context
+ * @param dst   the destination buffer
+ */
+void sph_luffa256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Luffa-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_luffa256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Luffa-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Luffa-384 context (pointer to a
+ *             <code>sph_luffa384_context</code>)
+ */
+void sph_luffa384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Luffa-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_luffa384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Luffa-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Luffa-384 context
+ * @param dst   the destination buffer
+ */
+void sph_luffa384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Luffa-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_luffa384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Luffa-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Luffa-512 context (pointer to a
+ *             <code>sph_luffa512_context</code>)
+ */
+void sph_luffa512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Luffa-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_luffa512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Luffa-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Luffa-512 context
+ * @param dst   the destination buffer
+ */
+void sph_luffa512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Luffa-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_luffa512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
diff --git a/sph/sph_shavite.h b/sph/sph_shavite.h
new file mode 100644
index 00000000..76cc8f10
--- /dev/null
+++ b/sph/sph_shavite.h
@@ -0,0 +1,306 @@
+/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */
+/**
+ * SHAvite-3 interface. This code implements SHAvite-3 with the
+ * recommended parameters for SHA-3, with outputs of 224, 256, 384 and
+ * 512 bits. In the following, we call the function "SHAvite" (without
+ * the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit
+ * output".
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_shavite.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_SHAVITE_H__
+#define SPH_SHAVITE_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for SHAvite-224.
+ */
+#define SPH_SIZE_shavite224   224
+
+/**
+ * Output size (in bits) for SHAvite-256.
+ */
+#define SPH_SIZE_shavite256   256
+
+/**
+ * Output size (in bits) for SHAvite-384.
+ */
+#define SPH_SIZE_shavite384   384
+
+/**
+ * Output size (in bits) for SHAvite-512.
+ */
+#define SPH_SIZE_shavite512   512
+
+/**
+ * This structure is a context for SHAvite-224 and SHAvite-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a SHAvite computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running SHAvite
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;    /* NOTE(review): presumably the fill offset into buf -- confirm in shavite.c */
+	sph_u32 h[8];    /* 8 words; presumably the intermediate values -- confirm */
+	sph_u32 count0, count1;    /* NOTE(review): presumably a two-word input counter -- confirm word order in shavite.c */
+#endif
+} sph_shavite_small_context;
+
+/**
+ * This structure is a context for SHAvite-224 computations. It is
+ * identical to the common <code>sph_shavite_small_context</code>.
+ */
+typedef sph_shavite_small_context sph_shavite224_context;
+
+/**
+ * This structure is a context for SHAvite-256 computations. It is
+ * identical to the common <code>sph_shavite_small_context</code>.
+ */
+typedef sph_shavite_small_context sph_shavite256_context;
+
+/**
+ * This structure is a context for SHAvite-384 and SHAvite-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a SHAvite computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running SHAvite
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;    /* NOTE(review): presumably the fill offset into buf -- confirm in shavite.c */
+	sph_u32 h[16];    /* 16 words (twice the small context); presumably the intermediate values -- confirm */
+	sph_u32 count0, count1, count2, count3;    /* NOTE(review): presumably a four-word input counter -- confirm word order */
+#endif
+} sph_shavite_big_context;
+
+/**
+ * This structure is a context for SHAvite-384 computations. It is
+ * identical to the common <code>sph_shavite_big_context</code>.
+ */
+typedef sph_shavite_big_context sph_shavite384_context;
+
+/**
+ * This structure is a context for SHAvite-512 computations. It is
+ * identical to the common <code>sph_shavite_big_context</code>.
+ */
+typedef sph_shavite_big_context sph_shavite512_context;
+
+/**
+ * Initialize a SHAvite-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-224 context (pointer to a
+ *             <code>sph_shavite224_context</code>)
+ */
+void sph_shavite224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-224 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHAvite-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shavite224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a SHAvite-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-256 context (pointer to a
+ *             <code>sph_shavite256_context</code>)
+ */
+void sph_shavite256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-256 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHAvite-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shavite256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a SHAvite-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-384 context (pointer to a
+ *             <code>sph_shavite384_context</code>)
+ */
+void sph_shavite384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-384 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHAvite-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shavite384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a SHAvite-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-512 context (pointer to a
+ *             <code>sph_shavite512_context</code>)
+ */
+void sph_shavite512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-512 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHAvite-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shavite512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
diff --git a/sph/sph_simd.h b/sph/sph_simd.h
new file mode 100644
index 00000000..8f1e0493
--- /dev/null
+++ b/sph/sph_simd.h
@@ -0,0 +1,302 @@
+/* $Id: sph_simd.h 154 2010-04-26 17:00:24Z tp $ */
+/**
+ * SIMD interface. SIMD is a family of functions which differ by
+ * their output size; this implementation defines SIMD for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_simd.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_SIMD_H__
+#define SPH_SIMD_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for SIMD-224.
+ */
+#define SPH_SIZE_simd224   224
+
+/**
+ * Output size (in bits) for SIMD-256.
+ */
+#define SPH_SIZE_simd256   256
+
+/**
+ * Output size (in bits) for SIMD-384.
+ */
+#define SPH_SIZE_simd384   384
+
+/**
+ * Output size (in bits) for SIMD-512.
+ */
+#define SPH_SIZE_simd512   512
+
+/**
+ * This structure is a context for SIMD computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an SIMD computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for SIMD-224
+ * and SIMD-256.
+ *
+ * The contents of this structure are private. A running SIMD computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE /* fields are omitted when DOXYGEN_IGNORE is defined */
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill offset into buf -- TODO confirm in simd.c */
+	sph_u32 state[16];        /* presumably the running hash state words -- TODO confirm in simd.c */
+	sph_u32 count_low, count_high; /* presumably one counter split in two 32-bit halves -- TODO confirm */
+#endif
+} sph_simd_small_context;
+
+/**
+ * This structure is a context for SIMD computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an SIMD computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for SIMD-384
+ * and SIMD-512.
+ *
+ * The contents of this structure are private. A running SIMD computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE /* fields are omitted when DOXYGEN_IGNORE is defined */
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;                /* presumably the fill offset into buf -- TODO confirm in simd.c */
+	sph_u32 state[32];         /* presumably the running hash state words -- TODO confirm in simd.c */
+	sph_u32 count_low, count_high; /* presumably one counter split in two 32-bit halves -- TODO confirm */
+#endif
+} sph_simd_big_context;
+
+/**
+ * Type for a SIMD-224 context (identical to the common "small" context).
+ */
+typedef sph_simd_small_context sph_simd224_context;
+
+/**
+ * Type for a SIMD-256 context (identical to the common "small" context).
+ */
+typedef sph_simd_small_context sph_simd256_context;
+
+/**
+ * Type for a SIMD-384 context (identical to the common "big" context).
+ */
+typedef sph_simd_big_context sph_simd384_context;
+
+/**
+ * Type for a SIMD-512 context (identical to the common "big" context).
+ */
+typedef sph_simd_big_context sph_simd512_context;
+
+/**
+ * Initialize an SIMD-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the SIMD-224 context (pointer to a
+ *             <code>sph_simd224_context</code>)
+ */
+void sph_simd224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SIMD-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_simd224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SIMD-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SIMD-224 context
+ * @param dst   the destination buffer
+ */
+void sph_simd224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SIMD-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_simd224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an SIMD-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the SIMD-256 context (pointer to a
+ *             <code>sph_simd256_context</code>)
+ */
+void sph_simd256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SIMD-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_simd256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SIMD-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SIMD-256 context
+ * @param dst   the destination buffer
+ */
+void sph_simd256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SIMD-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_simd256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an SIMD-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the SIMD-384 context (pointer to a
+ *             <code>sph_simd384_context</code>)
+ */
+void sph_simd384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SIMD-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_simd384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SIMD-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SIMD-384 context
+ * @param dst   the destination buffer
+ */
+void sph_simd384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SIMD-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_simd384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an SIMD-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the SIMD-512 context (pointer to a
+ *             <code>sph_simd512_context</code>)
+ */
+void sph_simd512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SIMD-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_simd512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SIMD-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SIMD-512 context
+ * @param dst   the destination buffer
+ */
+void sph_simd512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SIMD-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_simd512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
diff --git a/sph/sph_skein.h b/sph/sph_skein.h
new file mode 100644
index 00000000..85559843
--- /dev/null
+++ b/sph/sph_skein.h
@@ -0,0 +1,290 @@
+/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */
+/**
+ * Skein interface. The Skein specification defines three main
+ * functions, called Skein-256, Skein-512 and Skein-1024, which can be
+ * further parameterized with an output length. For the SHA-3
+ * competition, Skein-512 is used for output sizes of 224, 256, 384 and
+ * 512 bits; this is what this code implements. Thus, we hereafter call
+ * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein
+ * specification defines as Skein-512-224, Skein-512-256, Skein-512-384
+ * and Skein-512-512, respectively.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_skein.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_SKEIN_H__
+#define SPH_SKEIN_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for Skein-224.
+ */
+#define SPH_SIZE_skein224   224
+
+/**
+ * Output size (in bits) for Skein-256.
+ */
+#define SPH_SIZE_skein256   256
+
+/**
+ * Output size (in bits) for Skein-384.
+ */
+#define SPH_SIZE_skein384   384
+
+/**
+ * Output size (in bits) for Skein-512.
+ */
+#define SPH_SIZE_skein512   512
+
+/**
+ * This structure is a context for Skein computations (with a 224-,
+ * 256-, 384- or 512-bit output): it contains the intermediate values
+ * and some data from the last entered block. Once a Skein computation
+ * has been performed, the context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running Skein computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE /* fields are omitted when DOXYGEN_IGNORE is defined */
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill offset into buf -- TODO confirm in skein.c */
+	sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; /* presumably the eight 64-bit state words -- TODO confirm */
+	sph_u64 bcount;           /* presumably a count of processed blocks -- TODO confirm in skein.c */
+#endif
+} sph_skein_big_context;
+
+/**
+ * Type for a Skein-224 context (identical to the common "big" context).
+ */
+typedef sph_skein_big_context sph_skein224_context;
+
+/**
+ * Type for a Skein-256 context (identical to the common "big" context).
+ */
+typedef sph_skein_big_context sph_skein256_context;
+
+/**
+ * Type for a Skein-384 context (identical to the common "big" context).
+ */
+typedef sph_skein_big_context sph_skein384_context;
+
+/**
+ * Type for a Skein-512 context (identical to the common "big" context).
+ */
+typedef sph_skein_big_context sph_skein512_context;
+
+/**
+ * Initialize a Skein-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Skein-224 context (pointer to a
+ *             <code>sph_skein224_context</code>)
+ */
+void sph_skein224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Skein-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_skein224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Skein-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Skein-224 context
+ * @param dst   the destination buffer
+ */
+void sph_skein224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Skein-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_skein224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Skein-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the Skein-256 context (pointer to a
+ *             <code>sph_skein256_context</code>)
+ */
+void sph_skein256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Skein-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_skein256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Skein-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Skein-256 context
+ * @param dst   the destination buffer
+ */
+void sph_skein256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Skein-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_skein256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Skein-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Skein-384 context (pointer to a
+ *             <code>sph_skein384_context</code>)
+ */
+void sph_skein384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Skein-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_skein384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Skein-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Skein-384 context
+ * @param dst   the destination buffer
+ */
+void sph_skein384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Skein-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_skein384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Skein-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Skein-512 context (pointer to a
+ *             <code>sph_skein512_context</code>)
+ */
+void sph_skein512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Skein-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_skein512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Skein-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Skein-512 context
+ * @param dst   the destination buffer
+ */
+void sph_skein512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Skein-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_skein512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#endif
diff --git a/sph/sph_types.h b/sph/sph_types.h
new file mode 100644
index 00000000..7295b0b3
--- /dev/null
+++ b/sph/sph_types.h
@@ -0,0 +1,1976 @@
+/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */
+/**
+ * Basic type definitions.
+ *
+ * This header file defines the generic integer types that will be used
+ * for the implementation of hash functions; it also contains helper
+ * functions which encode and decode multi-byte integer values, using
+ * either little-endian or big-endian conventions.
+ *
+ * This file contains a compile-time test on the size of a byte
+ * (the <code>unsigned char</code> C type). If bytes are not octets,
+ * i.e. if they do not have a size of exactly 8 bits, then compilation
+ * is aborted. Architectures where bytes are not octets are relatively
+ * rare, even in the embedded devices market. We forbid non-octet bytes
+ * because there is no clear convention on how octet streams are encoded
+ * on such systems.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_types.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_TYPES_H__
+#define SPH_TYPES_H__
+
+#include <limits.h>
+
+/*
+ * All our I/O functions are defined over octet streams. We do not know
+ * how to handle input data if bytes are not octets.
+ */
+#if CHAR_BIT != 8
+#error This code requires 8-bit bytes
+#endif
+
+/* ============= BEGIN documentation block for Doxygen ============ */
+
+#ifdef DOXYGEN_IGNORE
+
+/** @mainpage sphlib C code documentation
+ *
+ * @section overview Overview
+ *
+ * <code>sphlib</code> is a library which contains implementations of
+ * various cryptographic hash functions. These pages have been generated
+ * with <a href="http://www.doxygen.org/index.html">doxygen</a> and
+ * document the API for the C implementations.
+ *
+ * The API is described in appropriate header files, which are available
+ * in the "Files" section. Each hash function family has its own header,
+ * whose name begins with <code>"sph_"</code> and contains the family
+ * name. For instance, the API for the RIPEMD hash functions is available
+ * in the header file <code>sph_ripemd.h</code>.
+ *
+ * @section principles API structure and conventions
+ *
+ * @subsection io Input/output conventions
+ *
+ * In all generality, hash functions operate over strings of bits.
+ * Individual bits are rarely encountered in C programming or actual
+ * communication protocols; most protocols converge on the ubiquitous
+ * "octet" which is a group of eight bits. Data is thus expressed as a
+ * stream of octets. The C programming language contains the notion of a
+ * "byte", which is a data unit managed under the type <code>"unsigned
+ * char"</code>. The C standard prescribes that a byte should hold at
+ * least eight bits, but possibly more. Most modern architectures, even
+ * in the embedded world, feature eight-bit bytes, i.e. map bytes to
+ * octets.
+ *
+ * Nevertheless, for some of the implemented hash functions, an extra
+ * API has been added, which allows the input of arbitrary sequences of
+ * bits: when the computation is about to be closed, 1 to 7 extra bits
+ * can be added. The functions for which this API is implemented include
+ * the SHA-2 functions and all SHA-3 candidates.
+ *
+ * <code>sphlib</code> defines hash functions which may hash octet streams,
+ * i.e. streams of bits where the number of bits is a multiple of eight.
+ * The data input functions in the <code>sphlib</code> API expect data
+ * as anonymous pointers (<code>"const void *"</code>) with a length
+ * (of type <code>"size_t"</code>) which gives the input data chunk length
+ * in bytes. A byte is assumed to be an octet; the <code>sph_types.h</code>
+ * header contains a compile-time test which prevents compilation on
+ * architectures where this property is not met.
+ *
+ * The hash function output is also converted into bytes. All currently
+ * implemented hash functions have an output width which is a multiple of
+ * eight, and this is likely to remain true for new designs.
+ *
+ * Most hash functions internally convert input data into 32-bit or 64-bit
+ * words, using either little-endian or big-endian conversion. The hash
+ * output also often consists of such words, which are encoded into output
+ * bytes with a similar endianness convention. Some hash functions have
+ * been only loosely specified on that subject; when necessary,
+ * <code>sphlib</code> has been tested against published "reference"
+ * implementations in order to use the same conventions.
+ *
+ * @subsection shortname Function short name
+ *
+ * Each implemented hash function has a "short name" which is used
+ * internally to derive the identifiers for the functions and context
+ * structures which the function uses. For instance, MD5 has the short
+ * name <code>"md5"</code>. Short names are listed in the next section,
+ * for the implemented hash functions. In subsequent sections, the
+ * short name will be assumed to be <code>"XXX"</code>: replace with the
+ * actual hash function name to get the C identifier.
+ *
+ * Note: some functions within the same family share the same core
+ * elements, such as update function or context structure. Correspondingly,
+ * some of the defined types or functions may actually be macros which
+ * transparently evaluate to another type or function name.
+ *
+ * @subsection context Context structure
+ *
+ * Each implemented hash function has its own context structure, available
+ * under the type name <code>"sph_XXX_context"</code> for the hash function
+ * with short name <code>"XXX"</code>. This structure holds all needed
+ * state for a running hash computation.
+ *
+ * The contents of these structures are meant to be opaque, and private
+ * to the implementation. However, these contents are specified in the
+ * header files so that application code which uses <code>sphlib</code>
+ * may access the size of those structures.
+ *
+ * The caller is responsible for allocating the context structure,
+ * whether by dynamic allocation (<code>malloc()</code> or equivalent),
+ * static allocation (a global permanent variable), as an automatic
+ * variable ("on the stack"), or by any other means which ensures proper
+ * structure alignment. <code>sphlib</code> code performs no dynamic
+ * allocation by itself.
+ *
+ * The context must be initialized before use, using the
+ * <code>sph_XXX_init()</code> function. This function sets the context
+ * state to proper initial values for hashing.
+ *
+ * Since all state data is contained within the context structure,
+ * <code>sphlib</code> is thread-safe and reentrant: several hash
+ * computations may be performed in parallel, provided that they do not
+ * operate on the same context. Moreover, a running computation can be
+ * cloned by copying the context (with a simple <code>memcpy()</code>):
+ * the context and its clone are then independent and may be updated
+ * with new data and/or closed without interfering with each other.
+ * Similarly, a context structure can be moved in memory at will:
+ * context structures contain no pointer, in particular no pointer to
+ * themselves.
+ *
+ * @subsection dataio Data input
+ *
+ * Hashed data is input with the <code>sph_XXX()</code> function, which
+ * takes as parameters a pointer to the context, a pointer to the data
+ * to hash, and the number of data bytes to hash. The context is updated
+ * with the new data.
+ *
+ * Data can be input in one or several calls, with arbitrary input lengths.
+ * However, it is best, performance-wise, to input data by relatively big
+ * chunks (say a few kilobytes), because this allows <code>sphlib</code> to
+ * optimize things and avoid internal copying.
+ *
+ * When all data has been input, the context can be closed with
+ * <code>sph_XXX_close()</code>. The hash output is computed and written
+ * into the provided buffer. The caller must take care to provide a
+ * buffer of appropriate length; e.g., when using SHA-1, the output is
+ * a 20-byte word, therefore the output buffer must be at least 20 bytes
+ * long.
+ *
+ * For some hash functions, the <code>sph_XXX_addbits_and_close()</code>
+ * function can be used instead of <code>sph_XXX_close()</code>. This
+ * function can take a few extra <strong>bits</strong> to be added at
+ * the end of the input message. This allows hashing messages with a
+ * bit length which is not a multiple of 8. The extra bits are provided
+ * as an unsigned integer value, and a bit count. The bit count must be
+ * between 0 and 7, inclusive. The extra bits are provided as bits 7 to
+ * 0 (bits of numerical value 128, 64, 32... down to 1), in that order.
+ * For instance, to add three bits of value 1, 1 and 0, the unsigned
+ * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count
+ * will be 3.
+ *
+ * The <code>SPH_SIZE_XXX</code> macro is defined for each hash function;
+ * it evaluates to the function output size, expressed in bits. For instance,
+ * <code>SPH_SIZE_sha1</code> evaluates to <code>160</code>.
+ *
+ * When closed, the context is automatically reinitialized and can be
+ * immediately used for another computation. It is not necessary to call
+ * <code>sph_XXX_init()</code> after a close. Note that
+ * <code>sph_XXX_init()</code> can still be called to "reset" a context,
+ * i.e. forget previously input data, and get back to the initial state.
+ *
+ * @subsection alignment Data alignment
+ *
+ * "Alignment" is a property of data, which is said to be "properly
+ * aligned" when its placement in memory is such that the data can
+ * be optimally read by full words. This depends on the type of access;
+ * basically, some hash functions will read data by 32-bit or 64-bit
+ * words. <code>sphlib</code> does not mandate such alignment for input
+ * data, but using aligned data can substantially improve performance.
+ *
+ * As a rule, it is best to input data by chunks whose length (in bytes)
+ * is a multiple of eight, and which begins at "generally aligned"
+ * addresses, such as the base address returned by a call to
+ * <code>malloc()</code>.
+ *
+ * @section functions Implemented functions
+ *
+ * We give here the list of implemented functions. They are grouped by
+ * family; to each family corresponds a specific header file. Each
+ * individual function has its associated "short name". Please refer to
+ * the documentation for that header file to get details on the hash
+ * function denomination and provenance.
+ *
+ * Note: the functions marked with a '(64)' in the list below are
+ * available only if the C compiler provides an integer type of length
+ * 64 bits or more. Such a type is mandatory in the latest C standard
+ * (ISO 9899:1999, aka "C99") and is present in several older compilers
+ * as well, so chances are that such a type is available.
+ *
+ * - HAVAL family: file <code>sph_haval.h</code>
+ *   - HAVAL-128/3 (128-bit, 3 passes): short name: <code>haval128_3</code>
+ *   - HAVAL-128/4 (128-bit, 4 passes): short name: <code>haval128_4</code>
+ *   - HAVAL-128/5 (128-bit, 5 passes): short name: <code>haval128_5</code>
+ *   - HAVAL-160/3 (160-bit, 3 passes): short name: <code>haval160_3</code>
+ *   - HAVAL-160/4 (160-bit, 4 passes): short name: <code>haval160_4</code>
+ *   - HAVAL-160/5 (160-bit, 5 passes): short name: <code>haval160_5</code>
+ *   - HAVAL-192/3 (192-bit, 3 passes): short name: <code>haval192_3</code>
+ *   - HAVAL-192/4 (192-bit, 4 passes): short name: <code>haval192_4</code>
+ *   - HAVAL-192/5 (192-bit, 5 passes): short name: <code>haval192_5</code>
+ *   - HAVAL-224/3 (224-bit, 3 passes): short name: <code>haval224_3</code>
+ *   - HAVAL-224/4 (224-bit, 4 passes): short name: <code>haval224_4</code>
+ *   - HAVAL-224/5 (224-bit, 5 passes): short name: <code>haval224_5</code>
+ *   - HAVAL-256/3 (256-bit, 3 passes): short name: <code>haval256_3</code>
+ *   - HAVAL-256/4 (256-bit, 4 passes): short name: <code>haval256_4</code>
+ *   - HAVAL-256/5 (256-bit, 5 passes): short name: <code>haval256_5</code>
+ * - MD2: file <code>sph_md2.h</code>, short name: <code>md2</code>
+ * - MD4: file <code>sph_md4.h</code>, short name: <code>md4</code>
+ * - MD5: file <code>sph_md5.h</code>, short name: <code>md5</code>
+ * - PANAMA: file <code>sph_panama.h</code>, short name: <code>panama</code>
+ * - RadioGatun family: file <code>sph_radiogatun.h</code>
+ *   - RadioGatun[32]: short name: <code>radiogatun32</code>
+ *   - RadioGatun[64]: short name: <code>radiogatun64</code> (64)
+ * - RIPEMD family: file <code>sph_ripemd.h</code>
+ *   - RIPEMD: short name: <code>ripemd</code>
+ *   - RIPEMD-128: short name: <code>ripemd128</code>
+ *   - RIPEMD-160: short name: <code>ripemd160</code>
+ * - SHA-0: file <code>sph_sha0.h</code>, short name: <code>sha0</code>
+ * - SHA-1: file <code>sph_sha1.h</code>, short name: <code>sha1</code>
+ * - SHA-2 family: file <code>sph_sha2.h</code>
+ *   - SHA-224: short name: <code>sha224</code>
+ *   - SHA-256: short name: <code>sha256</code>
+ *   - SHA-384: short name: <code>sha384</code> (64)
+ *   - SHA-512: short name: <code>sha512</code> (64)
+ * - Tiger family: file <code>sph_tiger.h</code>
+ *   - Tiger: short name: <code>tiger</code> (64)
+ *   - Tiger2: short name: <code>tiger2</code> (64)
+ * - WHIRLPOOL family: file <code>sph_whirlpool.h</code>
+ *   - WHIRLPOOL-0: short name: <code>whirlpool0</code> (64)
+ *   - WHIRLPOOL-1: short name: <code>whirlpool1</code> (64)
+ *   - WHIRLPOOL: short name: <code>whirlpool</code> (64)
+ *
+ * The fourteen second-round SHA-3 candidates are also implemented;
+ * when applicable, the implementations follow the "final" specifications
+ * as published for the third round of the SHA-3 competition (BLAKE,
+ * Groestl, JH, Keccak and Skein have been tweaked for third round).
+ *
+ * - BLAKE family: file <code>sph_blake.h</code>
+ *   - BLAKE-224: short name: <code>blake224</code>
+ *   - BLAKE-256: short name: <code>blake256</code>
+ *   - BLAKE-384: short name: <code>blake384</code>
+ *   - BLAKE-512: short name: <code>blake512</code>
+ * - BMW (Blue Midnight Wish) family: file <code>sph_bmw.h</code>
+ *   - BMW-224: short name: <code>bmw224</code>
+ *   - BMW-256: short name: <code>bmw256</code>
+ *   - BMW-384: short name: <code>bmw384</code> (64)
+ *   - BMW-512: short name: <code>bmw512</code> (64)
+ * - CubeHash family: file <code>sph_cubehash.h</code> (specified as
+ *   CubeHash16/32 in the CubeHash specification)
+ *   - CubeHash-224: short name: <code>cubehash224</code>
+ *   - CubeHash-256: short name: <code>cubehash256</code>
+ *   - CubeHash-384: short name: <code>cubehash384</code>
+ *   - CubeHash-512: short name: <code>cubehash512</code>
+ * - ECHO family: file <code>sph_echo.h</code>
+ *   - ECHO-224: short name: <code>echo224</code>
+ *   - ECHO-256: short name: <code>echo256</code>
+ *   - ECHO-384: short name: <code>echo384</code>
+ *   - ECHO-512: short name: <code>echo512</code>
+ * - Fugue family: file <code>sph_fugue.h</code>
+ *   - Fugue-224: short name: <code>fugue224</code>
+ *   - Fugue-256: short name: <code>fugue256</code>
+ *   - Fugue-384: short name: <code>fugue384</code>
+ *   - Fugue-512: short name: <code>fugue512</code>
+ * - Groestl family: file <code>sph_groestl.h</code>
+ *   - Groestl-224: short name: <code>groestl224</code>
+ *   - Groestl-256: short name: <code>groestl256</code>
+ *   - Groestl-384: short name: <code>groestl384</code>
+ *   - Groestl-512: short name: <code>groestl512</code>
+ * - Hamsi family: file <code>sph_hamsi.h</code>
+ *   - Hamsi-224: short name: <code>hamsi224</code>
+ *   - Hamsi-256: short name: <code>hamsi256</code>
+ *   - Hamsi-384: short name: <code>hamsi384</code>
+ *   - Hamsi-512: short name: <code>hamsi512</code>
+ * - JH family: file <code>sph_jh.h</code>
+ *   - JH-224: short name: <code>jh224</code>
+ *   - JH-256: short name: <code>jh256</code>
+ *   - JH-384: short name: <code>jh384</code>
+ *   - JH-512: short name: <code>jh512</code>
+ * - Keccak family: file <code>sph_keccak.h</code>
+ *   - Keccak-224: short name: <code>keccak224</code>
+ *   - Keccak-256: short name: <code>keccak256</code>
+ *   - Keccak-384: short name: <code>keccak384</code>
+ *   - Keccak-512: short name: <code>keccak512</code>
+ * - Luffa family: file <code>sph_luffa.h</code>
+ *   - Luffa-224: short name: <code>luffa224</code>
+ *   - Luffa-256: short name: <code>luffa256</code>
+ *   - Luffa-384: short name: <code>luffa384</code>
+ *   - Luffa-512: short name: <code>luffa512</code>
+ * - Shabal family: file <code>sph_shabal.h</code>
+ *   - Shabal-192: short name: <code>shabal192</code>
+ *   - Shabal-224: short name: <code>shabal224</code>
+ *   - Shabal-256: short name: <code>shabal256</code>
+ *   - Shabal-384: short name: <code>shabal384</code>
+ *   - Shabal-512: short name: <code>shabal512</code>
+ * - SHAvite-3 family: file <code>sph_shavite.h</code>
+ *   - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"):
+ *     short name: <code>shavite224</code>
+ *   - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"):
+ *     short name: <code>shavite256</code>
+ *   - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"):
+ *     short name: <code>shavite384</code>
+ *   - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"):
+ *     short name: <code>shavite512</code>
+ * - SIMD family: file <code>sph_simd.h</code>
+ *   - SIMD-224: short name: <code>simd224</code>
+ *   - SIMD-256: short name: <code>simd256</code>
+ *   - SIMD-384: short name: <code>simd384</code>
+ *   - SIMD-512: short name: <code>simd512</code>
+ * - Skein family: file <code>sph_skein.h</code>
+ *   - Skein-224 (nominally specified as Skein-512-224): short name:
+ *     <code>skein224</code> (64)
+ *   - Skein-256 (nominally specified as Skein-512-256): short name:
+ *     <code>skein256</code> (64)
+ *   - Skein-384 (nominally specified as Skein-512-384): short name:
+ *     <code>skein384</code> (64)
+ *   - Skein-512 (nominally specified as Skein-512-512): short name:
+ *     <code>skein512</code> (64)
+ *
+ * For the second-round SHA-3 candidates, the functions are as specified
+ * for round 2, i.e. with the "tweaks" that some candidates added
+ * between round 1 and round 2. Also, some of the submitted packages for
+ * round 2 contained errors, in the specification, reference code, or
+ * both. <code>sphlib</code> implements the corrected versions.
+ */
+
+/** @hideinitializer
+ * Unsigned integer type whose length is at least 32 bits; on most
+ * architectures, it will have a width of exactly 32 bits. Unsigned C
+ * types implement arithmetics modulo a power of 2; use the
+ * <code>SPH_T32()</code> macro to ensure that the value is truncated
+ * to exactly 32 bits. Unless otherwise specified, all macros and
+ * functions which accept <code>sph_u32</code> values assume that these
+ * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures
+ * where <code>sph_u32</code> is larger than that.
+ */
+typedef __arch_dependant__ sph_u32;
+
+/** @hideinitializer
+ * Signed integer type corresponding to <code>sph_u32</code>; it has
+ * width 32 bits or more.
+ */
+typedef __arch_dependant__ sph_s32;
+
+/** @hideinitializer
+ * Unsigned integer type whose length is at least 64 bits; on most
+ * architectures which feature such a type, it will have a width of
+ * exactly 64 bits. C99-compliant platforms will have this type; it
+ * is also defined when the GNU compiler (gcc) is used, and on
+ * platforms where <code>unsigned long</code> is large enough. If this
+ * type is not available, then some hash functions which depend on
+ * a 64-bit type will not be available (most notably SHA-384, SHA-512,
+ * Tiger and WHIRLPOOL).
+ */
+typedef __arch_dependant__ sph_u64;
+
+/** @hideinitializer
+ * Signed integer type corresponding to <code>sph_u64</code>; it has
+ * width 64 bits or more.
+ */
+typedef __arch_dependant__ sph_s64;
+
+/**
+ * This macro expands the token <code>x</code> into a suitable
+ * constant expression of type <code>sph_u32</code>. Depending on
+ * how this type is defined, a suffix such as <code>UL</code> may
+ * be appended to the argument.
+ *
+ * @param x   the token to expand into a suitable constant expression
+ */
+#define SPH_C32(x)
+
+/**
+ * Truncate a 32-bit value to exactly 32 bits. On most systems, this is
+ * a no-op, recognized as such by the compiler.
+ *
+ * @param x   the value to truncate (of type <code>sph_u32</code>)
+ */
+#define SPH_T32(x)
+
+/**
+ * Rotate a 32-bit value by a number of bits to the left. The rotate
+ * count must reside between 1 and 31. This macro assumes that its
+ * first argument fits in 32 bits (no extra bit allowed on machines where
+ * <code>sph_u32</code> is wider); both arguments may be evaluated
+ * several times.
+ *
+ * @param x   the value to rotate (of type <code>sph_u32</code>)
+ * @param n   the rotation count (between 1 and 31, inclusive)
+ */
+#define SPH_ROTL32(x, n)
+
+/**
+ * Rotate a 32-bit value by a number of bits to the right. The rotate
+ * count must reside between 1 and 31. This macro assumes that its
+ * first argument fits in 32 bits (no extra bit allowed on machines where
+ * <code>sph_u32</code> is wider); both arguments may be evaluated
+ * several times.
+ *
+ * @param x   the value to rotate (of type <code>sph_u32</code>)
+ * @param n   the rotation count (between 1 and 31, inclusive)
+ */
+#define SPH_ROTR32(x, n)
+
+/**
+ * This macro is defined on systems for which a 64-bit type has been
+ * detected, and is used for <code>sph_u64</code>.
+ */
+#define SPH_64
+
+/**
+ * This macro is defined on systems for which the "native" integer size is
+ * 64 bits (64-bit values fit in one register).
+ */
+#define SPH_64_TRUE
+
+/**
+ * This macro expands the token <code>x</code> into a suitable
+ * constant expression of type <code>sph_u64</code>. Depending on
+ * how this type is defined, a suffix such as <code>ULL</code> may
+ * be appended to the argument. This macro is defined only if a
+ * 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param x   the token to expand into a suitable constant expression
+ */
+#define SPH_C64(x)
+
+/**
+ * Truncate a 64-bit value to exactly 64 bits. On most systems, this is
+ * a no-op, recognized as such by the compiler. This macro is defined only
+ * if a 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param x   the value to truncate (of type <code>sph_u64</code>)
+ */
+#define SPH_T64(x)
+
+/**
+ * Rotate a 64-bit value by a number of bits to the left. The rotate
+ * count must reside between 1 and 63. This macro assumes that its
+ * first argument fits in 64 bits (no extra bit allowed on machines where
+ * <code>sph_u64</code> is wider); both arguments may be evaluated
+ * several times. This macro is defined only if a 64-bit type was detected
+ * and used for <code>sph_u64</code>.
+ *
+ * @param x   the value to rotate (of type <code>sph_u64</code>)
+ * @param n   the rotation count (between 1 and 63, inclusive)
+ */
+#define SPH_ROTL64(x, n)
+
+/**
+ * Rotate a 64-bit value by a number of bits to the right. The rotate
+ * count must reside between 1 and 63. This macro assumes that its
+ * first argument fits in 64 bits (no extra bit allowed on machines where
+ * <code>sph_u64</code> is wider); both arguments may be evaluated
+ * several times. This macro is defined only if a 64-bit type was detected
+ * and used for <code>sph_u64</code>.
+ *
+ * @param x   the value to rotate (of type <code>sph_u64</code>)
+ * @param n   the rotation count (between 1 and 63, inclusive)
+ */
+#define SPH_ROTR64(x, n)
+
+/**
+ * This macro evaluates to <code>inline</code> or an equivalent construction,
+ * if available on the compilation platform, or to nothing otherwise. This
+ * is used to declare inline functions, for which the compiler should
+ * endeavour to include the code directly in the caller. Inline functions
+ * are typically defined in header files as replacement for macros.
+ */
+#define SPH_INLINE
+
+/**
+ * This macro is defined if the platform has been detected as using
+ * little-endian convention. This implies that the <code>sph_u32</code>
+ * type (and the <code>sph_u64</code> type also, if it is defined) has
+ * an exact width (i.e. exactly 32-bit, respectively 64-bit).
+ */
+#define SPH_LITTLE_ENDIAN
+
+/**
+ * This macro is defined if the platform has been detected as using
+ * big-endian convention. This implies that the <code>sph_u32</code>
+ * type (and the <code>sph_u64</code> type also, if it is defined) has
+ * an exact width (i.e. exactly 32-bit, respectively 64-bit).
+ */
+#define SPH_BIG_ENDIAN
+
+/**
+ * This macro is defined if 32-bit words (and 64-bit words, if defined)
+ * can be read from and written to memory efficiently in little-endian
+ * convention. This is the case for little-endian platforms, and also
+ * for the big-endian platforms which have special little-endian access
+ * opcodes (e.g. Ultrasparc).
+ */
+#define SPH_LITTLE_FAST
+
+/**
+ * This macro is defined if 32-bit words (and 64-bit words, if defined)
+ * can be read from and written to memory efficiently in big-endian
+ * convention. This is the case for big-endian platforms, and also
+ * for the little-endian platforms which have special big-endian access
+ * opcodes.
+ */
+#define SPH_BIG_FAST
+
+/**
+ * On some platforms, this macro is defined to an unsigned integer type
+ * into which pointer values may be cast. The resulting value can then
+ * be tested for being a multiple of 2, 4 or 8, indicating an aligned
+ * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses.
+ */
+#define SPH_UPTR
+
+/**
+ * When defined, this macro indicates that unaligned memory accesses
+ * are possible with only a minor penalty, and thus should be preferred
+ * over strategies which first copy data to an aligned buffer.
+ */
+#define SPH_UNALIGNED
+
+/**
+ * Byte-swap a 32-bit word (i.e. <code>0x12345678</code> becomes
+ * <code>0x78563412</code>). This is an inline function which resorts
+ * to inline assembly on some platforms, for better performance.
+ *
+ * @param x   the 32-bit value to byte-swap
+ * @return  the byte-swapped value
+ */
+static inline sph_u32 sph_bswap32(sph_u32 x);
+
+/**
+ * Byte-swap a 64-bit word. This is an inline function which resorts
+ * to inline assembly on some platforms, for better performance. This
+ * function is defined only if a suitable 64-bit type was found for
+ * <code>sph_u64</code>.
+ *
+ * @param x   the 64-bit value to byte-swap
+ * @return  the byte-swapped value
+ */
+static inline sph_u64 sph_bswap64(sph_u64 x);
+
+/**
+ * Decode a 16-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline unsigned sph_dec16le(const void *src);
+
+/**
+ * Encode a 16-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc16le(void *dst, unsigned val);
+
+/**
+ * Decode a 16-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline unsigned sph_dec16be(const void *src);
+
+/**
+ * Encode a 16-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc16be(void *dst, unsigned val);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32le(const void *src);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec32le()</code> function.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32le_aligned(const void *src);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32le(void *dst, sph_u32 val);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc32le()</code> function.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32le_aligned(void *dst, sph_u32 val);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32be(const void *src);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec32be()</code> function.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32be_aligned(const void *src);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32be(void *dst, sph_u32 val);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc32be()</code> function.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32be_aligned(void *dst, sph_u32 val);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64le(const void *src);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec64le()</code> function. This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64le_aligned(const void *src);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64le(void *dst, sph_u64 val);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc64le()</code> function. This function is defined
+ * only if a suitable 64-bit type was detected and used for
+ * <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64le_aligned(void *dst, sph_u64 val);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64be(const void *src);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec64be()</code> function. This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64be_aligned(const void *src);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64be(void *dst, sph_u64 val);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc64be()</code> function. This function is defined
+ * only if a suitable 64-bit type was detected and used for
+ * <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64be_aligned(void *dst, sph_u64 val);
+
+#endif
+
+/* ============== END documentation block for Doxygen ============= */
+
+#ifndef DOXYGEN_IGNORE
+
+/*
+ * We want to define the types "sph_u32" and "sph_u64" which hold
+ * unsigned values of at least, respectively, 32 and 64 bits. These
+ * tests should select appropriate types for most platforms. The
+ * macro "SPH_64" is defined if a 64-bit type is supported.
+ */
+
+#undef SPH_64
+#undef SPH_64_TRUE
+
+#if defined __STDC__ && __STDC_VERSION__ >= 199901L
+
+/*
+ * On C99 implementations, we can use <stdint.h> to get exact-width 32-bit
+ * and 64-bit types, if any, or otherwise use the wider "fast" types (which
+ * must exist, for C99 conformance).
+ */
+
+#include <stdint.h>
+
+#ifdef UINT32_MAX
+typedef uint32_t sph_u32;
+typedef int32_t sph_s32;
+#else
+typedef uint_fast32_t sph_u32;
+typedef int_fast32_t sph_s32;
+#endif /* UINT32_MAX */
+#if !SPH_NO_64
+#ifdef UINT64_MAX
+typedef uint64_t sph_u64;
+typedef int64_t sph_s64;
+#else
+typedef uint_fast64_t sph_u64;
+typedef int_fast64_t sph_s64;
+#endif /* UINT64_MAX */
+#endif /* !SPH_NO_64 */
+
+#define SPH_C32(x)    ((sph_u32)(x))
+#if !SPH_NO_64
+#define SPH_C64(x)    ((sph_u64)(x))
+#define SPH_64  1
+#endif /* !SPH_NO_64 */
+
+#else /* not C99 */
+
+/*
+ * On non-C99 systems, we use "unsigned int" if it is wide enough,
+ * "unsigned long" otherwise. This supports all "reasonable" architectures.
+ * We have to be cautious: pre-C99 preprocessors handle constants
+ * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
+ */
+
+#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
+
+typedef unsigned int sph_u32;
+typedef int sph_s32;
+
+#define SPH_C32(x)    ((sph_u32)(x ## U))
+
+#else
+
+typedef unsigned long sph_u32;
+typedef long sph_s32;
+
+#define SPH_C32(x)    ((sph_u32)(x ## UL))
+
+#endif /* UINT_MAX width test */
+
+#if !SPH_NO_64
+
+/*
+ * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
+ * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
+ * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
+ * test whether "unsigned long long" is available; we also know that
+ * gcc features this type, even if the libc headers do not know it.
+ */
+
+#if ((ULONG_MAX >> 31) >> 31) >= 3
+
+typedef unsigned long sph_u64;
+typedef long sph_s64;
+
+#define SPH_C64(x)    ((sph_u64)(x ## UL))
+
+#define SPH_64  1
+
+#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
+
+typedef unsigned long long sph_u64;
+typedef long long sph_s64;
+
+#define SPH_C64(x)    ((sph_u64)(x ## ULL))
+
+#define SPH_64  1
+
+#else
+
+/*
+ * No 64-bit type: sph_u64 is not declared and SPH_64 stays undefined.
+ */
+
+#endif /* 64-bit type selection */
+
+#endif /* !SPH_NO_64 */
+
+#endif /* C99 or not */
+
+/*
+ * If the "unsigned long" type has length 64 bits or more, then this is
+ * a "true" 64-bit architecture. This is also true with Visual C on
+ * amd64, even though the "long" type is limited to 32 bits.
+ */
+#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64)
+#define SPH_64_TRUE   1
+#endif /* SPH_64 && native 64-bit */
+
+/*
+ * Implementation note: some processors have specific opcodes to perform
+ * a rotation. Recent versions of gcc recognize the expressions below and
+ * use the relevant opcodes, when appropriate.
+ */
+
+#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
+#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))
+
+#if SPH_64
+
+#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
+#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
+#define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))
+
+#endif /* SPH_64 */
+
+#ifndef DOXYGEN_IGNORE
+/*
+ * Define SPH_INLINE to be an "inline" qualifier, if available. We define
+ * some small macro-like functions which benefit greatly from being inlined.
+ */
+#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__
+#define SPH_INLINE inline
+#elif defined _MSC_VER
+#define SPH_INLINE __inline
+#else
+#define SPH_INLINE
+#endif /* inline keyword selection */
+#endif /* !DOXYGEN_IGNORE */
+
+/*
+ * We define some macros which qualify the architecture. These macros
+ * may be explicitly set externally (e.g. as compiler parameters). The
+ * code below sets those macros if they are not already defined.
+ *
+ * Most macros are boolean, thus evaluate to either zero or non-zero.
+ * The SPH_UPTR macro is special, in that it evaluates to a C type,
+ * or is not defined.
+ *
+ * SPH_UPTR             if defined: unsigned type to cast pointers into
+ *
+ * SPH_UNALIGNED        non-zero if unaligned accesses are efficient
+ * SPH_LITTLE_ENDIAN    non-zero if architecture is known to be little-endian
+ * SPH_BIG_ENDIAN       non-zero if architecture is known to be big-endian
+ * SPH_LITTLE_FAST      non-zero if little-endian decoding is fast
+ * SPH_BIG_FAST         non-zero if big-endian decoding is fast
+ *
+ * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit
+ * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN
+ * _must_ be non-zero in those situations. The 32-bit and 64-bit types
+ * _must_ also have an exact width.
+ *
+ * SPH_SPARCV9_GCC_32   UltraSPARC-compatible with gcc, 32-bit mode
+ * SPH_SPARCV9_GCC_64   UltraSPARC-compatible with gcc, 64-bit mode
+ * SPH_SPARCV9_GCC      UltraSPARC-compatible with gcc
+ * SPH_I386_GCC         x86-compatible (32-bit) with gcc
+ * SPH_I386_MSVC        x86-compatible (32-bit) with Microsoft Visual C
+ * SPH_AMD64_GCC        x86-compatible (64-bit) with gcc
+ * SPH_AMD64_MSVC       x86-compatible (64-bit) with Microsoft Visual C
+ * SPH_PPC32_GCC        PowerPC, 32-bit, with gcc
+ * SPH_PPC64_GCC        PowerPC, 64-bit, with gcc
+ *
+ * TODO: enhance automatic detection, for more architectures and compilers.
+ * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with
+ * some very fast functions (e.g. MD4) when using unaligned input data.
+ * The CPU-specific-with-GCC macros are useful only for inline assembly,
+ * normally restrained to this header file.
+ */
+
+/*
+ * 32-bit x86, aka "i386 compatible": little-endian, unaligned accesses OK.
+ */
+#if defined __i386__ || defined _M_IX86
+
+#define SPH_DETECT_UNALIGNED         1
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#define SPH_DETECT_UPTR              sph_u32
+#ifdef __GNUC__
+#define SPH_DETECT_I386_GCC          1
+#endif /* __GNUC__ */
+#ifdef _MSC_VER
+#define SPH_DETECT_I386_MSVC         1
+#endif /* _MSC_VER */
+
+/*
+ * 64-bit x86, hereafter known as "amd64".
+ */
+#elif defined __x86_64 || defined _M_X64
+
+#define SPH_DETECT_UNALIGNED         1
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#define SPH_DETECT_UPTR              sph_u64
+#ifdef __GNUC__
+#define SPH_DETECT_AMD64_GCC         1
+#endif /* __GNUC__ */
+#ifdef _MSC_VER
+#define SPH_DETECT_AMD64_MSVC        1
+#endif /* _MSC_VER */
+
+/*
+ * 64-bit Sparc architecture (implies v9).
+ */
+#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \
+	|| defined __sparcv9
+
+#define SPH_DETECT_BIG_ENDIAN        1
+#define SPH_DETECT_UPTR              sph_u64
+#ifdef __GNUC__
+#define SPH_DETECT_SPARCV9_GCC_64    1
+#define SPH_DETECT_LITTLE_FAST       1
+#endif /* __GNUC__ */
+
+/*
+ * 32-bit Sparc.
+ */
+#elif (defined __sparc__ || defined __sparc) \
+	&& !(defined __sparcv9 || defined __arch64__)
+
+#define SPH_DETECT_BIG_ENDIAN        1
+#define SPH_DETECT_UPTR              sph_u32
+#if defined __GNUC__ && defined __sparc_v9__
+#define SPH_DETECT_SPARCV9_GCC_32    1
+#define SPH_DETECT_LITTLE_FAST       1
+#endif /* __GNUC__ && __sparc_v9__ */
+
+/*
+ * ARM, little-endian.
+ */
+#elif defined __arm__ && __ARMEL__
+
+#define SPH_DETECT_LITTLE_ENDIAN     1
+
+/*
+ * MIPS, little-endian.
+ */
+#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__
+
+#define SPH_DETECT_LITTLE_ENDIAN     1
+
+/*
+ * MIPS, big-endian.
+ */
+#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__
+
+#define SPH_DETECT_BIG_ENDIAN        1
+
+/*
+ * PowerPC.
+ */
+#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \
+	|| defined _ARCH_PPC
+
+/*
+ * Note: we do not declare cross-endian access to be "fast": even if
+ * using inline assembly, implementation should still assume that
+ * keeping the decoded word in a temporary is faster than decoding
+ * it again.
+ */
+#if defined __GNUC__
+#if SPH_64_TRUE
+#define SPH_DETECT_PPC64_GCC         1
+#else
+#define SPH_DETECT_PPC32_GCC         1
+#endif /* SPH_64_TRUE */
+#endif /* __GNUC__ */
+
+#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
+#define SPH_DETECT_BIG_ENDIAN        1
+#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#endif /* PowerPC endianness */
+
+/*
+ * Itanium, 64-bit.
+ */
+#elif defined __ia64 || defined __ia64__ \
+	|| defined __itanium__ || defined _M_IA64
+
+#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
+#define SPH_DETECT_BIG_ENDIAN        1
+#else
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#endif /* Itanium endianness */
+#if defined __LP64__ || defined _LP64
+#define SPH_DETECT_UPTR              sph_u64
+#else
+#define SPH_DETECT_UPTR              sph_u32
+#endif /* __LP64__ || _LP64 */
+
+#endif /* architecture detection */
+
+#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64
+#define SPH_DETECT_SPARCV9_GCC       1
+#endif
+
+#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED
+#define SPH_UNALIGNED         SPH_DETECT_UNALIGNED
+#endif
+#if defined SPH_DETECT_UPTR && !defined SPH_UPTR
+#define SPH_UPTR              SPH_DETECT_UPTR
+#endif
+#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN
+#define SPH_LITTLE_ENDIAN     SPH_DETECT_LITTLE_ENDIAN
+#endif
+#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN
+#define SPH_BIG_ENDIAN        SPH_DETECT_BIG_ENDIAN
+#endif
+#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST
+#define SPH_LITTLE_FAST       SPH_DETECT_LITTLE_FAST
+#endif
+#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST
+#define SPH_BIG_FAST    SPH_DETECT_BIG_FAST
+#endif
+#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32
+#define SPH_SPARCV9_GCC_32    SPH_DETECT_SPARCV9_GCC_32
+#endif
+#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64
+#define SPH_SPARCV9_GCC_64    SPH_DETECT_SPARCV9_GCC_64
+#endif
+#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC
+#define SPH_SPARCV9_GCC       SPH_DETECT_SPARCV9_GCC
+#endif
+#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC
+#define SPH_I386_GCC          SPH_DETECT_I386_GCC
+#endif
+#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC
+#define SPH_I386_MSVC         SPH_DETECT_I386_MSVC
+#endif
+#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC
+#define SPH_AMD64_GCC         SPH_DETECT_AMD64_GCC
+#endif
+#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC
+#define SPH_AMD64_MSVC        SPH_DETECT_AMD64_MSVC
+#endif
+#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC
+#define SPH_PPC32_GCC         SPH_DETECT_PPC32_GCC
+#endif
+#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC
+#define SPH_PPC64_GCC         SPH_DETECT_PPC64_GCC
+#endif
+
+#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST
+#define SPH_LITTLE_FAST              1
+#endif
+#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST
+#define SPH_BIG_FAST                 1
+#endif
+
+#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN)
+#error SPH_UPTR defined, but endianness is not known.
+#endif
+
+#if SPH_I386_GCC && !SPH_NO_ASM
+
+/*
+ * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit
+ * values.
+ */
+
+static SPH_INLINE sph_u32
+sph_bswap32(sph_u32 x)
+{
+	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
+	return x;	/* swapped in place: "0" ties the input to output operand %0 */
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{	/* the swapped low word becomes the high word, and vice versa */
+	sph_u64 hi = (sph_u64)sph_bswap32((sph_u32)x) << 32;
+	return hi | (sph_u64)sph_bswap32((sph_u32)(x >> 32));
+}
+
+#endif
+
+#elif SPH_AMD64_GCC && !SPH_NO_ASM
+
+/*
+ * On x86 64-bit, with gcc, we use the bswapl and bswapq opcodes to
+ * byte-swap 32-bit and 64-bit values, respectively.
+ */
+
+static SPH_INLINE sph_u32
+sph_bswap32(sph_u32 x)
+{
+	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
+	return x;	/* swapped in place: "0" ties the input to output operand %0 */
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	__asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x));
+	return x;	/* swapped in place: "0" ties the input to output operand %0 */
+}
+
+#endif
+
+/*
+ * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough
+ * to generate proper opcodes for endianness swapping with the pure C
+ * implementation below.
+ *
+
+#elif SPH_I386_MSVC && !SPH_NO_ASM
+
+static __inline sph_u32 __declspec(naked) __fastcall
+sph_bswap32(sph_u32 x)
+{
+	__asm {
+		bswap  ecx
+		mov    eax,ecx
+		ret
+	}
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	return ((sph_u64)sph_bswap32((sph_u32)x) << 32)
+		| (sph_u64)sph_bswap32((sph_u32)(x >> 32));
+}
+
+#endif
+
+ *
+ * [end of disabled code]
+ */
+
+#else
+
+static SPH_INLINE sph_u32
+sph_bswap32(sph_u32 x)
+{	/* plain C version, compiled when neither x86 gcc asm variant applies */
+	x = SPH_T32((x << 16) | (x >> 16));	/* exchange the 16-bit halves */
+	x = ((x & SPH_C32(0xFF00FF00)) >> 8)	/* exchange the two bytes */
+		| ((x & SPH_C32(0x00FF00FF)) << 8);	/* inside each half */
+	return x;
+}
+
+#if SPH_64
+
+/**
+ * Byte-swap a 64-bit value (plain C: swap halves, then ever smaller units).
+ *
+ * @param x   the input value
+ * @return  the value of x with its eight bytes in reverse order
+ */
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	x = SPH_T64((x << 32) | (x >> 32));	/* exchange the 32-bit halves */
+	x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16)	/* exchange 16-bit units */
+		| ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16);	/* in each half */
+	x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8)	/* exchange the bytes */
+		| ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8);	/* in each unit */
+	return x;
+}
+
+#endif
+
+#endif
+
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+
+/*
+ * On UltraSPARC systems, native ordering is big-endian, but it is
+ * possible to perform little-endian read accesses by specifying the
+ * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use
+ * the opcode "lda [%reg]0x88,%dst", where %reg is the register which
+ * contains the source address and %dst is the destination register,
+ * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register
+ * to get the address space name. The latter format is better since it
+ * combines an addition and the actual access in a single opcode; but
+ * it requires the setting (and subsequent resetting) of %asi, which is
+ * slow. Some operations (e.g. the MD5 compression function) combine many
+ * successive little-endian read accesses, which may share the same
+ * %asi setting. The macros below contain the appropriate inline
+ * assembly.
+ */
+
+#define SPH_SPARCV9_SET_ASI   \
+	sph_u32 sph_sparcv9_asi; \
+	__asm__ __volatile__ ( \
+		"rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi));
+
+#define SPH_SPARCV9_RESET_ASI  \
+	__asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi));
+
+#define SPH_SPARCV9_DEC32LE(base, idx)   ({ \
+		sph_u32 sph_sparcv9_tmp; \
+		__asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \
+			: "=r" (sph_sparcv9_tmp) : "r" (base)); \
+		sph_sparcv9_tmp; \
+	})
+
+#endif
+
+static SPH_INLINE void
+sph_enc16be(void *dst, unsigned val)
+{	/* writes two bytes at dst, byte by byte, so no alignment is needed */
+	((unsigned char *)dst)[0] = (val >> 8);	/* bits 8 and up, narrowed */
+	((unsigned char *)dst)[1] = val;	/* low bits, narrowed to a byte */
+}
+
+static SPH_INLINE unsigned
+sph_dec16be(const void *src)
+{
+	const unsigned char *in = (const unsigned char *)src;	/* byte view */
+	return ((unsigned)in[0] << 8) | (unsigned)in[1];	/* in[0] is the MSB */
+}
+
+static SPH_INLINE void
+sph_enc16le(void *dst, unsigned val)
+{	/* writes two bytes at dst, byte by byte, so no alignment is needed */
+	((unsigned char *)dst)[0] = val;	/* low bits, narrowed to a byte */
+	((unsigned char *)dst)[1] = val >> 8;	/* bits 8 and up, narrowed */
+}
+
+static SPH_INLINE unsigned
+sph_dec16le(const void *src)
+{
+	const unsigned char *in = (const unsigned char *)src;	/* byte view */
+	return (unsigned)in[0] | ((unsigned)in[1] << 8);	/* in[0] is the LSB */
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (big endian convention).
+ *
+ * @param dst   the destination buffer
+ * @param val   the 32-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc32be(void *dst, sph_u32 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_LITTLE_ENDIAN
+	val = sph_bswap32(val);
+#endif /* SPH_LITTLE_ENDIAN */
+	*(sph_u32 *)dst = val;	/* one word store; dst alignment is not checked */
+#else /* !SPH_UNALIGNED: use a word store only when dst is 4-byte aligned */
+	if (((SPH_UPTR)dst & 3) == 0) {
+#if SPH_LITTLE_ENDIAN
+		val = sph_bswap32(val);
+#endif /* SPH_LITTLE_ENDIAN */
+		*(sph_u32 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = (val >> 24);	/* MSB first */
+		((unsigned char *)dst)[1] = (val >> 16);
+		((unsigned char *)dst)[2] = (val >> 8);
+		((unsigned char *)dst)[3] = val;
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR undefined: always store byte by byte, MSB first */
+	((unsigned char *)dst)[0] = (val >> 24);
+	((unsigned char *)dst)[1] = (val >> 16);
+	((unsigned char *)dst)[2] = (val >> 8);
+	((unsigned char *)dst)[3] = val;
+#endif /* defined SPH_UPTR */
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (big endian convention).
+ * The destination buffer must be properly aligned.
+ *
+ * @param dst   the destination buffer (32-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc32be_aligned(void *dst, sph_u32 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u32 *)dst = sph_bswap32(val);	/* swap, then one word store */
+#elif SPH_BIG_ENDIAN
+	*(sph_u32 *)dst = val;	/* stored as is, no swap */
+#else /* endianness unknown: portable byte stores, MSB first */
+	((unsigned char *)dst)[0] = (val >> 24);
+	((unsigned char *)dst)[1] = (val >> 16);
+	((unsigned char *)dst)[2] = (val >> 8);
+	((unsigned char *)dst)[3] = val;
+#endif /* SPH_LITTLE_ENDIAN / SPH_BIG_ENDIAN */
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (big endian convention).
+ *
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32be(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);
+#else /* !SPH_LITTLE_ENDIAN */
+	return *(const sph_u32 *)src;
+#endif /* SPH_LITTLE_ENDIAN */
+#else /* !SPH_UNALIGNED: use a word load only when src is 4-byte aligned */
+	if (((SPH_UPTR)src & 3) == 0) {
+#if SPH_LITTLE_ENDIAN
+		return sph_bswap32(*(const sph_u32 *)src);
+#else /* !SPH_LITTLE_ENDIAN */
+		return *(const sph_u32 *)src;
+#endif /* SPH_LITTLE_ENDIAN */
+	} else {
+		return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
+			| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
+			| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
+			| (sph_u32)(((const unsigned char *)src)[3]);
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR undefined: always assemble from bytes, MSB first */
+	return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
+		| (sph_u32)(((const unsigned char *)src)[3]);
+#endif /* defined SPH_UPTR */
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (big endian convention).
+ * The source buffer must be properly aligned.
+ *
+ * @param src   the source buffer (32-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32be_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);	/* word load, then swap */
+#elif SPH_BIG_ENDIAN
+	return *(const sph_u32 *)src;	/* word load, no swap */
+#else /* endianness unknown: assemble from bytes, MSB first */
+	return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
+		| (sph_u32)(((const unsigned char *)src)[3]);
+#endif /* SPH_LITTLE_ENDIAN / SPH_BIG_ENDIAN */
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (little endian convention).
+ *
+ * @param dst   the destination buffer
+ * @param val   the 32-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc32le(void *dst, sph_u32 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	val = sph_bswap32(val);
+#endif /* SPH_BIG_ENDIAN */
+	*(sph_u32 *)dst = val;	/* one word store; dst alignment is not checked */
+#else /* !SPH_UNALIGNED: use a word store only when dst is 4-byte aligned */
+	if (((SPH_UPTR)dst & 3) == 0) {
+#if SPH_BIG_ENDIAN
+		val = sph_bswap32(val);
+#endif /* SPH_BIG_ENDIAN */
+		*(sph_u32 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = val;	/* LSB first */
+		((unsigned char *)dst)[1] = (val >> 8);
+		((unsigned char *)dst)[2] = (val >> 16);
+		((unsigned char *)dst)[3] = (val >> 24);
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR undefined: always store byte by byte, LSB first */
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+#endif /* defined SPH_UPTR */
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (little endian convention).
+ * The destination buffer must be properly aligned.
+ *
+ * @param dst   the destination buffer (32-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc32le_aligned(void *dst, sph_u32 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u32 *)dst = val;	/* stored as is, no swap */
+#elif SPH_BIG_ENDIAN
+	*(sph_u32 *)dst = sph_bswap32(val);	/* swap, then one word store */
+#else /* endianness unknown: portable byte stores, LSB first */
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+#endif /* SPH_LITTLE_ENDIAN / SPH_BIG_ENDIAN */
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
+ *
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32le(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);
+#else /* !SPH_BIG_ENDIAN */
+	return *(const sph_u32 *)src;
+#endif /* SPH_BIG_ENDIAN */
+#else /* !SPH_UNALIGNED: use a word load only when src is 4-byte aligned */
+	if (((SPH_UPTR)src & 3) == 0) {
+#if SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		/*
+		 * "__volatile__" is needed here because without it,
+		 * gcc-3.4.3 miscompiles the code and performs the
+		 * access before the test on the address, thus triggering
+		 * a bus error...
+		 */
+		__asm__ __volatile__ (
+			"lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+		return tmp;
+/*
+ * On PowerPC, this turns out not to be worth the effort: the inline
+ * assembly makes GCC optimizer uncomfortable, which tends to nullify
+ * the decoding gains.
+ *
+ * For most hash functions, using this inline assembly trick changes
+ * hashing speed by less than 5% and often _reduces_ it. The biggest
+ * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is
+ * less than 10%. The speed gain on CubeHash is probably due to the
+ * chronic shortage of registers that CubeHash endures; for the other
+ * functions, the generic code appears to be efficient enough already.
+ *
+#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		__asm__ __volatile__ (
+			"lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+		return tmp;
+ */
+#else /* no SPARC v9 inline assembly: word load, then sph_bswap32() */
+		return sph_bswap32(*(const sph_u32 *)src);
+#endif /* SPH_SPARCV9_GCC && !SPH_NO_ASM */
+#else /* !SPH_BIG_ENDIAN */
+		return *(const sph_u32 *)src;
+#endif /* SPH_BIG_ENDIAN */
+	} else {
+		return (sph_u32)(((const unsigned char *)src)[0])
+			| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+			| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+			| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR undefined: always assemble from bytes, LSB first */
+	return (sph_u32)(((const unsigned char *)src)[0])
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+#endif /* defined SPH_UPTR */
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
+ * The source buffer must be properly aligned.
+ *
+ * @param src   the source buffer (32-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32le_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return *(const sph_u32 *)src;
+#elif SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+	sph_u32 tmp;
+
+	__asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+	return tmp;
+/*
+ * Not worth it generally.
+ *
+#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
+	sph_u32 tmp;
+
+	__asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+	return tmp;
+ */
+#else /* no SPARC v9 inline assembly: word load, then sph_bswap32() */
+	return sph_bswap32(*(const sph_u32 *)src);
+#endif /* SPH_SPARCV9_GCC && !SPH_NO_ASM */
+#else /* endianness unknown: assemble from bytes, LSB first */
+	return (sph_u32)(((const unsigned char *)src)[0])
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+#endif /* SPH_LITTLE_ENDIAN / SPH_BIG_ENDIAN */
+}
+
+#if SPH_64
+
+/**
+ * Encode a 64-bit value into the provided buffer (big endian convention).
+ *
+ * @param dst   the destination buffer
+ * @param val   the 64-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc64be(void *dst, sph_u64 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_LITTLE_ENDIAN
+	val = sph_bswap64(val);
+#endif /* SPH_LITTLE_ENDIAN */
+	*(sph_u64 *)dst = val;	/* one word store; dst alignment is not checked */
+#else /* !SPH_UNALIGNED: use a word store only when dst is 8-byte aligned */
+	if (((SPH_UPTR)dst & 7) == 0) {
+#if SPH_LITTLE_ENDIAN
+		val = sph_bswap64(val);
+#endif /* SPH_LITTLE_ENDIAN */
+		*(sph_u64 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = (val >> 56);	/* MSB first */
+		((unsigned char *)dst)[1] = (val >> 48);
+		((unsigned char *)dst)[2] = (val >> 40);
+		((unsigned char *)dst)[3] = (val >> 32);
+		((unsigned char *)dst)[4] = (val >> 24);
+		((unsigned char *)dst)[5] = (val >> 16);
+		((unsigned char *)dst)[6] = (val >> 8);
+		((unsigned char *)dst)[7] = val;
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR undefined: always store byte by byte, MSB first */
+	((unsigned char *)dst)[0] = (val >> 56);
+	((unsigned char *)dst)[1] = (val >> 48);
+	((unsigned char *)dst)[2] = (val >> 40);
+	((unsigned char *)dst)[3] = (val >> 32);
+	((unsigned char *)dst)[4] = (val >> 24);
+	((unsigned char *)dst)[5] = (val >> 16);
+	((unsigned char *)dst)[6] = (val >> 8);
+	((unsigned char *)dst)[7] = val;
+#endif /* defined SPH_UPTR */
+}
+
+/**
+ * Encode a 64-bit value into the provided buffer (big endian convention).
+ * The destination buffer must be properly aligned.
+ *
+ * @param dst   the destination buffer (64-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc64be_aligned(void *dst, sph_u64 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u64 *)dst = sph_bswap64(val);	/* swap, then one word store */
+#elif SPH_BIG_ENDIAN
+	*(sph_u64 *)dst = val;	/* stored as is, no swap */
+#else /* endianness unknown: portable byte stores, MSB first */
+	((unsigned char *)dst)[0] = (val >> 56);
+	((unsigned char *)dst)[1] = (val >> 48);
+	((unsigned char *)dst)[2] = (val >> 40);
+	((unsigned char *)dst)[3] = (val >> 32);
+	((unsigned char *)dst)[4] = (val >> 24);
+	((unsigned char *)dst)[5] = (val >> 16);
+	((unsigned char *)dst)[6] = (val >> 8);
+	((unsigned char *)dst)[7] = val;
+#endif /* SPH_LITTLE_ENDIAN / SPH_BIG_ENDIAN */
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (big endian convention).
+ *
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64be(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap64(*(const sph_u64 *)src);
+#else /* !SPH_LITTLE_ENDIAN */
+	return *(const sph_u64 *)src;
+#endif /* SPH_LITTLE_ENDIAN */
+#else /* !SPH_UNALIGNED: use a word load only when src is 8-byte aligned */
+	if (((SPH_UPTR)src & 7) == 0) {
+#if SPH_LITTLE_ENDIAN
+		return sph_bswap64(*(const sph_u64 *)src);
+#else /* !SPH_LITTLE_ENDIAN */
+		return *(const sph_u64 *)src;
+#endif /* SPH_LITTLE_ENDIAN */
+	} else {
+		return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
+			| ((sph_u64)(((const unsigned char *)src)[1]) << 48)
+			| ((sph_u64)(((const unsigned char *)src)[2]) << 40)
+			| ((sph_u64)(((const unsigned char *)src)[3]) << 32)
+			| ((sph_u64)(((const unsigned char *)src)[4]) << 24)
+			| ((sph_u64)(((const unsigned char *)src)[5]) << 16)
+			| ((sph_u64)(((const unsigned char *)src)[6]) << 8)
+			| (sph_u64)(((const unsigned char *)src)[7]);
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR undefined: always assemble from bytes, MSB first */
+	return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 8)
+		| (sph_u64)(((const unsigned char *)src)[7]);
+#endif /* defined SPH_UPTR */
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (big endian convention).
+ * The source buffer must be properly aligned.
+ *
+ * @param src   the source buffer (64-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64be_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap64(*(const sph_u64 *)src);	/* word load, then swap */
+#elif SPH_BIG_ENDIAN
+	return *(const sph_u64 *)src;	/* word load, no swap */
+#else /* endianness unknown: assemble from bytes, MSB first */
+	return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 8)
+		| (sph_u64)(((const unsigned char *)src)[7]);
+#endif /* SPH_LITTLE_ENDIAN / SPH_BIG_ENDIAN */
+}
+
+/**
+ * Encode a 64-bit value into the provided buffer (little endian convention).
+ *
+ * @param dst   the destination buffer
+ * @param val   the 64-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc64le(void *dst, sph_u64 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	val = sph_bswap64(val);
+#endif /* SPH_BIG_ENDIAN */
+	*(sph_u64 *)dst = val;	/* one word store; dst alignment is not checked */
+#else /* !SPH_UNALIGNED: use a word store only when dst is 8-byte aligned */
+	if (((SPH_UPTR)dst & 7) == 0) {
+#if SPH_BIG_ENDIAN
+		val = sph_bswap64(val);
+#endif /* SPH_BIG_ENDIAN */
+		*(sph_u64 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = val;	/* LSB first */
+		((unsigned char *)dst)[1] = (val >> 8);
+		((unsigned char *)dst)[2] = (val >> 16);
+		((unsigned char *)dst)[3] = (val >> 24);
+		((unsigned char *)dst)[4] = (val >> 32);
+		((unsigned char *)dst)[5] = (val >> 40);
+		((unsigned char *)dst)[6] = (val >> 48);
+		((unsigned char *)dst)[7] = (val >> 56);
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR undefined: always store byte by byte, LSB first */
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+	((unsigned char *)dst)[4] = (val >> 32);
+	((unsigned char *)dst)[5] = (val >> 40);
+	((unsigned char *)dst)[6] = (val >> 48);
+	((unsigned char *)dst)[7] = (val >> 56);
+#endif /* defined SPH_UPTR */
+}
+
+/**
+ * Encode a 64-bit value into the provided buffer (little endian convention).
+ * The destination buffer must be properly aligned.
+ *
+ * @param dst   the destination buffer (64-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc64le_aligned(void *dst, sph_u64 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u64 *)dst = val;	/* stored as is, no swap */
+#elif SPH_BIG_ENDIAN
+	*(sph_u64 *)dst = sph_bswap64(val);	/* swap, then one word store */
+#else /* endianness unknown: portable byte stores, LSB first */
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+	((unsigned char *)dst)[4] = (val >> 32);
+	((unsigned char *)dst)[5] = (val >> 40);
+	((unsigned char *)dst)[6] = (val >> 48);
+	((unsigned char *)dst)[7] = (val >> 56);
+#endif /* SPH_LITTLE_ENDIAN / SPH_BIG_ENDIAN */
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (little endian convention).
+ *
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64le(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	return sph_bswap64(*(const sph_u64 *)src);
+#else /* !SPH_BIG_ENDIAN */
+	return *(const sph_u64 *)src;
+#endif /* SPH_BIG_ENDIAN */
+#else /* !SPH_UNALIGNED: use a word load only when src is 8-byte aligned */
+	if (((SPH_UPTR)src & 7) == 0) {
+#if SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
+		sph_u64 tmp;
+
+		__asm__ __volatile__ (
+			"ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+		return tmp;
+/*
+ * Not worth it generally.
+ *
+#elif SPH_PPC32_GCC && !SPH_NO_ASM
+		return (sph_u64)sph_dec32le_aligned(src)
+			| ((sph_u64)sph_dec32le_aligned(
+				(const char *)src + 4) << 32);
+#elif SPH_PPC64_GCC && !SPH_NO_ASM
+		sph_u64 tmp;
+
+		__asm__ __volatile__ (
+			"ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+		return tmp;
+ */
+#else /* no 64-bit SPARC v9 inline assembly: word load, then sph_bswap64() */
+		return sph_bswap64(*(const sph_u64 *)src);
+#endif /* SPH_SPARCV9_GCC_64 && !SPH_NO_ASM */
+#else /* !SPH_BIG_ENDIAN */
+		return *(const sph_u64 *)src;
+#endif /* SPH_BIG_ENDIAN */
+	} else {
+		return (sph_u64)(((const unsigned char *)src)[0])
+			| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+			| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+			| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+			| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+			| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+			| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+			| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR undefined: always assemble from bytes, LSB first */
+	return (sph_u64)(((const unsigned char *)src)[0])
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+#endif /* defined SPH_UPTR */
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (little endian convention).
+ * The source buffer must be properly aligned.
+ *
+ * @param src   the source buffer (64-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64le_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return *(const sph_u64 *)src;
+#elif SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
+	sph_u64 tmp;
+
+	__asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+	return tmp;
+/*
+ * Not worth it generally.
+ *
+#elif SPH_PPC32_GCC && !SPH_NO_ASM
+	return (sph_u64)sph_dec32le_aligned(src)
+		| ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32);
+#elif SPH_PPC64_GCC && !SPH_NO_ASM
+	sph_u64 tmp;
+
+	__asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+	return tmp;
+ */
+#else /* no 64-bit SPARC v9 inline assembly: word load, then sph_bswap64() */
+	return sph_bswap64(*(const sph_u64 *)src);
+#endif /* SPH_SPARCV9_GCC_64 && !SPH_NO_ASM */
+#else /* endianness unknown: assemble from bytes, LSB first */
+	return (sph_u64)(((const unsigned char *)src)[0])
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+#endif /* SPH_LITTLE_ENDIAN / SPH_BIG_ENDIAN */
+}
+
+#endif
+
+#endif /* Doxygen excluded block */
+
+#endif