add x17 algo, cleaned djm34 commit

todo: visual studio...
2025-01-22 04:24:29 +00:00 · 2014-08-23 18:18:24 +02:00 · 2014-08-23 18:18:24 +02:00 · 64e8cd3f98
commit 64e8cd3f98
parent 5d6a489b98
22 changed files with 3806 additions and 61 deletions
--- a/JHA/.deps/.dirstamp
+++ b/JHA/.deps/.dirstamp
--- a/JHA/.dirstamp
+++ b/JHA/.dirstamp
--- a/Makefile.am
+++ b/Makefile.am
@ -36,11 +36,12 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
-			  sph/shabal.c sph/whirlpool.c \
+			  sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
 			  x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
 			  x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
-			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/whirlcoin.cu
+			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/whirlcoin.cu \
+			  x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu

 ccminer_LDFLAGS		= $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
 ccminer_LDADD		= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
@ -68,6 +69,9 @@ x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
 x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ --maxrregcount=128 -o $@ -c $<

+x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
+	$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $<
+
 # ABI requiring code modules
 quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -252,6 +252,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
    <ClCompile Include="sph\jh.c" />
    <ClCompile Include="sph\keccak.c" />
    <ClCompile Include="sph\luffa.c" />
+    <ClCompile Include="sph\md_helper.c">
+      <ExcludedFromBuild>true</ExcludedFromBuild>
+    </ClCompile>
    <ClCompile Include="sph\sha2.c" />
    <ClCompile Include="sph\shabal.c" />
    <ClCompile Include="sph\shavite.c" />
@ -260,9 +263,6 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
    <ClCompile Include="sph\hamsi.c" />
    <ClCompile Include="sph\hamsi_helper.c" />
    <ClCompile Include="sph\whirlpool.c" />
-    <ClCompile Include="sph\x15_helper.c">
-      <ExcludedFromBuild>true</ExcludedFromBuild>
-    </ClCompile>
    <ClCompile Include="util.c">
      <AdditionalOptions>/Tp %(AdditionalOptions)</AdditionalOptions>
      <Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -135,6 +135,9 @@
    <ClCompile Include="sph\luffa.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
+    <ClCompile Include="sph\md_helper.c">
+      <Filter>Source Files\sph</Filter>
+    </ClCompile>
    <ClCompile Include="sph\sha2.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
@ -162,9 +165,6 @@
    <ClCompile Include="sph\whirlpool.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
-    <ClCompile Include="sph\x15_helper.c">
-      <Filter>Source Files\sph</Filter>
-    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="compat.h">
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -140,6 +140,7 @@ typedef enum {
 	ALGO_X13,
 	ALGO_X14,
 	ALGO_X15,
+	ALGO_X17,
 	ALGO_DMD_GR,
 } sha256_algos;

@ -159,6 +160,7 @@ static const char *algo_names[] = {
 	"x13",
 	"x14",
 	"x15",
+	"x17",
 	"dmd-gr",
 };

@ -238,6 +240,7 @@ Options:\n\
                        x13       X13 (MaruCoin) hash\n\
                        x14       X14 hash\n\
                        x15       X15 hash\n\
+                        x17       X17 (peoplecurrency) hash\n\
                        dmd-gr    Diamond-Groestl hash\n\
  -d, --devices         takes a comma separated list of CUDA devices to use.\n\
                        Device IDs start counting from 0! Alternatively takes\n\
@ -966,6 +969,11 @@ static void *miner_thread(void *userdata)
 				max_nonce, &hashes_done);
 			break;

+		case ALGO_X17:
+			rc = scanhash_x17(thr_id, work.data, work.target,
+				max_nonce, &hashes_done);
+			break;
+
 		default:
 			/* should never happen */
 			goto out;
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -148,12 +148,10 @@ __device__ __forceinline__
 uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
 {
 	uint64_t result;
-	asm("{"
-		".reg .u64 lt;\n\t"
-		"xor.b64 lt, %2, %3;\n\t"
-		"xor.b64 %0, %1, lt;\n\t"
-		"}"
-	: "=l"(result) : "l"(a) ,"l"(b),"l"(c));
+	asm("xor.b64 %0, %2, %3;\n\t"
+	    "xor.b64 %0, %0, %1;\n\t"
+		/* output : input registers */
+		: "=l"(result) : "l"(a), "l"(b), "l"(c));
 	return result;
 }
 #else
@ -179,59 +177,56 @@ uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t
 #define xor8(a,b,c,d,e,f,g,h) (a^b^c^d^e^f^g^h)
 #endif

-// device asm for whirpool
+// device asm for x17
 __device__ __forceinline__
 uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
 {
 	uint64_t result;
 	asm("{\n\t"
-		".reg .u64 m,n;\n\t"
-		"xor.b64 m, %2,%3;\n\t"
-		"and.b64 n, m,%1;\n\t"
-		"xor.b64 %0, n,%3;\n\t"
-		"}\n\t"
+		".reg .u64 n;\n\t"
+		"xor.b64 %0, %2, %3;\n\t"
+		"and.b64 n, %0, %1;\n\t"
+		"xor.b64 %0, n, %3;"
+	"}\n"
 	: "=l"(result) : "l"(a), "l"(b), "l"(c));
 	return result;
 }

-// device asm for whirpool
+// device asm for x17
 __device__ __forceinline__
 uint64_t sph_t64(uint64_t x)
 {
 	uint64_t result;
 	asm("{\n\t"
 		"and.b64 %0,%1,0xFFFFFFFFFFFFFFFF;\n\t"
-		"}\n\t"
+	"}\n"
 	: "=l"(result) : "l"(x));
 	return result;
 }

-// device asm for ?
+// device asm for x17
 __device__ __forceinline__
 uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
 {
 	uint64_t result;
 	asm("{\n\t"
-		".reg .u64 m,n,o;\n\t"
+		".reg .u64 m,n;\n\t"
 		"and.b64 m,  %1, %2;\n\t"
 		" or.b64 n,  %1, %2;\n\t"
-		"and.b64 o,   n, %3;\n\t"
-		" or.b64 %0,  m, o ;\n\t"
-		"}\n\t"
+		"and.b64 %0, n,  %3;\n\t"
+		" or.b64 %0, %0, m ;\n\t"
+	"}\n"
 	: "=l"(result) : "l"(a), "l"(b), "l"(c));
 	return result;
 }

-// device asm for ?
+// device asm for x17
 __device__ __forceinline__
 uint64_t shr_t64(uint64_t x, uint32_t n)
 {
 	uint64_t result;
-	asm("{\n\t"
-		".reg .u64 m;\n\t"
-		"shr.b64 m,%1,%2;\n\t"
-		"and.b64 %0,m,0xFFFFFFFFFFFFFFFF;\n\t"
-		"}\n\t"
+	asm("shr.b64 %0,%1,%2;\n\t"
+		"and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */
 	: "=l"(result) : "l"(x), "r"(n));
 	return result;
 }
@ -241,11 +236,8 @@ __device__ __forceinline__
 uint64_t shl_t64(uint64_t x, uint32_t n)
 {
 	uint64_t result;
-	asm("{\n\t"
-		".reg .u64 m;\n\t"
-		"shl.b64 m,%1,%2;\n\t"
-		"and.b64 %0,m,0xFFFFFFFFFFFFFFFF;\n\t"
-		"}\n\t"
+	asm("shl.b64 %0,%1,%2;\n\t"
+		"and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */
 	: "=l"(result) : "l"(x), "r"(n));
 	return result;
 }
@ -272,13 +264,13 @@ uint64_t ROTR64(const uint64_t x, const int offset)
 {
 	uint64_t result;
 	asm("{\n\t"
-		".reg .b64 lhs, rhs;\n\t"
-		".reg .u32 amt2;\n\t"
+		".reg .b64 lhs;\n\t"
+		".reg .u32 roff;\n\t"
 		"shr.b64 lhs, %1, %2;\n\t"
-		"sub.u32 amt2, 64, %2;\n\t"
-		"shl.b64 rhs, %1, amt2;\n\t"
-		"add.u64 %0, lhs, rhs;\n\t"
-		"}\n\t"
+		"sub.u32 roff, 64, %2;\n\t"
+		"shl.b64 %0, %1, roff;\n\t"
+		"add.u64 %0, %0, lhs;\n\t"
+	"}\n"
 	: "=l"(result) : "l"(x), "r"(offset));
 	return result;
 }
@ -307,13 +299,13 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 {
 	uint64_t result;
 	asm("{\n\t"
-		".reg .b64 lhs, rhs;\n\t"
-		".reg .u32 amt2;\n\t"
+		".reg .b64 lhs;\n\t"
+		".reg .u32 roff;\n\t"
 		"shl.b64 lhs, %1, %2;\n\t"
-		"sub.u32 amt2, 64, %2;\n\t"
-		"shr.b64 rhs, %1, amt2;\n\t"
-		"add.u64 %0, lhs, rhs;\n\t"
-		"}\n\t"
+		"sub.u32 roff, 64, %2;\n\t"
+		"shr.b64 %0, %1, roff;\n\t"
+		"add.u64 %0, lhs, %0;\n\t"
+	"}\n"
 	: "=l"(result) : "l"(x), "r"(offset));
 	return result;
 }
--- a/miner.h
+++ b/miner.h
@ -265,6 +265,10 @@ extern int scanhash_x15(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);

+extern int scanhash_x17(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 struct thr_info {
 	int		id;
 	pthread_t	pth;
@ -402,6 +406,7 @@ void x11hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
 void x14hash(void *output, const void *input);
 void x15hash(void *output, const void *input);
+void x17hash(void *output, const void *input);

 #ifdef __cplusplus
 }
--- a/quark/.deps/.dirstamp
+++ b/quark/.deps/.dirstamp
--- a/quark/.dirstamp
+++ b/quark/.dirstamp
--- a/sph/haval.c
+++ b/sph/haval.c
@ -0,0 +1,983 @@
+/* $Id: haval.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * HAVAL implementation.
+ *
+ * The HAVAL reference paper is of questionable clarity with regards to
+ * some details such as endianness of bits within a byte, bytes within
+ * a 32-bit word, or the actual ordering of words within a stream of
+ * words. This implementation has been made compatible with the reference
+ * implementation available on: http://labs.calyptix.com/haval.php
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_haval.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAVAL
+#define SPH_SMALL_FOOTPRINT_HAVAL   1
+#endif
+
+/*
+ * Basic definition from the reference paper.
+ *
+#define F1(x6, x5, x4, x3, x2, x1, x0) \
+	(((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ ((x0) & (x1)) ^ (x0))
+ *
+ */
+
+#define F1(x6, x5, x4, x3, x2, x1, x0) \
+	(((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0))
+
+/*
+ * Basic definition from the reference paper.
+ *
+#define F2(x6, x5, x4, x3, x2, x1, x0) \
+	(((x1) & (x2) & (x3)) ^ ((x2) & (x4) & (x5)) ^ ((x1) & (x2)) \
+	^ ((x1) & (x4)) ^ ((x2) & (x6)) ^ ((x3) & (x5)) \
+	^ ((x4) & (x5)) ^ ((x0) & (x2)) ^ (x0))
+ *
+ */
+
+#define F2(x6, x5, x4, x3, x2, x1, x0) \
+	(((x2) & (((x1) & ~(x3)) ^ ((x4) & (x5)) ^ (x6) ^ (x0))) \
+	^ ((x4) & ((x1) ^ (x5))) ^ ((x3 & (x5)) ^ (x0)))
+
+/*
+ * Basic definition from the reference paper.
+ *
+#define F3(x6, x5, x4, x3, x2, x1, x0) \
+	(((x1) & (x2) & (x3)) ^ ((x1) & (x4)) ^ ((x2) & (x5)) \
+	^ ((x3) & (x6)) ^ ((x0) & (x3)) ^ (x0))
+ *
+ */
+
+#define F3(x6, x5, x4, x3, x2, x1, x0) \
+	(((x3) & (((x1) & (x2)) ^ (x6) ^ (x0))) \
+	^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ (x0))
+
+/*
+ * Basic definition from the reference paper.
+ *
+#define F4(x6, x5, x4, x3, x2, x1, x0) \
+	(((x1) & (x2) & (x3)) ^ ((x2) & (x4) & (x5)) ^ ((x3) & (x4) & (x6)) \
+	^ ((x1) & (x4)) ^ ((x2) & (x6)) ^ ((x3) & (x4)) ^ ((x3) & (x5)) \
+	^ ((x3) & (x6)) ^ ((x4) & (x5)) ^ ((x4) & (x6)) ^ ((x0) & (x4)) ^ (x0))
+ *
+ */
+
+#define F4(x6, x5, x4, x3, x2, x1, x0) \
+	(((x3) & (((x1) & (x2)) ^ ((x4) | (x6)) ^ (x5))) \
+	^ ((x4) & ((~(x2) & (x5)) ^ (x1) ^ (x6) ^ (x0))) \
+	^ ((x2) & (x6)) ^ (x0))
+
+/*
+ * Basic definition from the reference paper.
+ *
+#define F5(x6, x5, x4, x3, x2, x1, x0) \
+	(((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)) \
+	^ ((x0) & (x1) & (x2) & (x3)) ^ ((x0) & (x5)) ^ (x0))
+ *
+ */
+
+#define F5(x6, x5, x4, x3, x2, x1, x0) \
+	(((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \
+	^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)))
+
+/*
+ * The macros below integrate the phi() permutations, depending on the
+ * pass and the total number of passes.
+ */
+
+#define FP3_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x1, x0, x3, x5, x6, x2, x4)
+#define FP3_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x4, x2, x1, x0, x5, x3, x6)
+#define FP3_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x6, x1, x2, x3, x4, x5, x0)
+
+#define FP4_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x2, x6, x1, x4, x5, x3, x0)
+#define FP4_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x3, x5, x2, x0, x1, x6, x4)
+#define FP4_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x1, x4, x3, x6, x0, x2, x5)
+#define FP4_4(x6, x5, x4, x3, x2, x1, x0) \
+	F4(x6, x4, x0, x5, x2, x1, x3)
+
+#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x3, x4, x1, x0, x5, x2, x6)
+#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x6, x2, x1, x0, x3, x4, x5)
+#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x2, x6, x0, x4, x3, x1, x5)
+#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \
+	F4(x1, x5, x3, x2, x0, x4, x6)
+#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \
+	F5(x2, x5, x0, x6, x4, x3, x1)
+
+/*
+ * One step, for "n" passes, pass number "p" (1 <= p <= n), using
+ * input word number "w" and step constant "c".
+ */
+#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c)  do { \
+		sph_u32 t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
+		(x7) = SPH_T32(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \
+			+ (w) + (c)); \
+	} while (0)
+
+/*
+ * PASSy(n, in) computes pass number "y", for a total of "n", using the
+ * one-argument macro "in" to access input words. Current state is assumed
+ * to be held in variables "s0" to "s7".
+ */
+
+#if SPH_SMALL_FOOTPRINT_HAVAL
+
+#define PASS1(n, in)   do { \
+		unsigned pass_count; \
+		for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+			STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
+				in(pass_count + 0), SPH_C32(0x00000000)); \
+			STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
+				in(pass_count + 1), SPH_C32(0x00000000)); \
+			STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
+				in(pass_count + 2), SPH_C32(0x00000000)); \
+			STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
+				in(pass_count + 3), SPH_C32(0x00000000)); \
+			STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
+				in(pass_count + 4), SPH_C32(0x00000000)); \
+			STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
+				in(pass_count + 5), SPH_C32(0x00000000)); \
+			STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
+				in(pass_count + 6), SPH_C32(0x00000000)); \
+			STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
+				in(pass_count + 7), SPH_C32(0x00000000)); \
+   		} \
+	} while (0)
+
+#define PASSG(p, n, in)   do { \
+		unsigned pass_count; \
+		for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+			STEP(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
+				in(MP ## p[pass_count + 0]), \
+				RK ## p[pass_count + 0]); \
+			STEP(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
+				in(MP ## p[pass_count + 1]), \
+				RK ## p[pass_count + 1]); \
+			STEP(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
+				in(MP ## p[pass_count + 2]), \
+				RK ## p[pass_count + 2]); \
+			STEP(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
+				in(MP ## p[pass_count + 3]), \
+				RK ## p[pass_count + 3]); \
+			STEP(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
+				in(MP ## p[pass_count + 4]), \
+				RK ## p[pass_count + 4]); \
+			STEP(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
+				in(MP ## p[pass_count + 5]), \
+				RK ## p[pass_count + 5]); \
+			STEP(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
+				in(MP ## p[pass_count + 6]), \
+				RK ## p[pass_count + 6]); \
+			STEP(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
+				in(MP ## p[pass_count + 7]), \
+				RK ## p[pass_count + 7]); \
+   		} \
+	} while (0)
+
+#define PASS2(n, in)    PASSG(2, n, in)
+#define PASS3(n, in)    PASSG(3, n, in)
+#define PASS4(n, in)    PASSG(4, n, in)
+#define PASS5(n, in)    PASSG(5, n, in)
+
+static const unsigned MP2[32] = {
+	 5, 14, 26, 18, 11, 28,  7, 16,
+	 0, 23, 20, 22,  1, 10,  4,  8,
+	30,  3, 21,  9, 17, 24, 29,  6,
+	19, 12, 15, 13,  2, 25, 31, 27
+};
+
+static const unsigned MP3[32] = {
+	19,  9,  4, 20, 28, 17,  8, 22,
+	29, 14, 25, 12, 24, 30, 16, 26,
+	31, 15,  7,  3,  1,  0, 18, 27,
+	13,  6, 21, 10, 23, 11,  5,  2
+};
+
+static const unsigned MP4[32] = {
+	24,  4,  0, 14,  2,  7, 28, 23,
+	26,  6, 30, 20, 18, 25, 19,  3,
+	22, 11, 31, 21,  8, 27, 12,  9,
+	 1, 29,  5, 15, 17, 10, 16, 13
+};
+
+static const unsigned MP5[32] = {
+	27,  3, 21, 26, 17, 11, 20, 29,
+	19,  0, 12,  7, 13,  8, 31, 10,
+	 5,  9, 14, 30, 18,  6, 28, 24,
+	 2, 23, 16, 22,  4,  1, 25, 15
+};
+
+static const sph_u32 RK2[32] = {
+	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
+	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
+	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
+	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917),
+	SPH_C32(0x9216D5D9), SPH_C32(0x8979FB1B),
+	SPH_C32(0xD1310BA6), SPH_C32(0x98DFB5AC),
+	SPH_C32(0x2FFD72DB), SPH_C32(0xD01ADFB7),
+	SPH_C32(0xB8E1AFED), SPH_C32(0x6A267E96),
+	SPH_C32(0xBA7C9045), SPH_C32(0xF12C7F99),
+	SPH_C32(0x24A19947), SPH_C32(0xB3916CF7),
+	SPH_C32(0x0801F2E2), SPH_C32(0x858EFC16),
+	SPH_C32(0x636920D8), SPH_C32(0x71574E69),
+	SPH_C32(0xA458FEA3), SPH_C32(0xF4933D7E),
+	SPH_C32(0x0D95748F), SPH_C32(0x728EB658),
+	SPH_C32(0x718BCD58), SPH_C32(0x82154AEE),
+	SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
+};
+
+static const sph_u32 RK3[32] = {
+	SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
+	SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
+	SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
+	SPH_C32(0x8E79DCB0), SPH_C32(0x603A180E),
+	SPH_C32(0x6C9E0E8B), SPH_C32(0xB01E8A3E),
+	SPH_C32(0xD71577C1), SPH_C32(0xBD314B27),
+	SPH_C32(0x78AF2FDA), SPH_C32(0x55605C60),
+	SPH_C32(0xE65525F3), SPH_C32(0xAA55AB94),
+	SPH_C32(0x57489862), SPH_C32(0x63E81440),
+	SPH_C32(0x55CA396A), SPH_C32(0x2AAB10B6),
+	SPH_C32(0xB4CC5C34), SPH_C32(0x1141E8CE),
+	SPH_C32(0xA15486AF), SPH_C32(0x7C72E993),
+	SPH_C32(0xB3EE1411), SPH_C32(0x636FBC2A),
+	SPH_C32(0x2BA9C55D), SPH_C32(0x741831F6),
+	SPH_C32(0xCE5C3E16), SPH_C32(0x9B87931E),
+	SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
+};
+
+static const sph_u32 RK4[32] = {
+	SPH_C32(0x7A325381), SPH_C32(0x28958677),
+	SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
+	SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
+	SPH_C32(0x61D809CC), SPH_C32(0xFB21A991),
+	SPH_C32(0x487CAC60), SPH_C32(0x5DEC8032),
+	SPH_C32(0xEF845D5D), SPH_C32(0xE98575B1),
+	SPH_C32(0xDC262302), SPH_C32(0xEB651B88),
+	SPH_C32(0x23893E81), SPH_C32(0xD396ACC5),
+	SPH_C32(0x0F6D6FF3), SPH_C32(0x83F44239),
+	SPH_C32(0x2E0B4482), SPH_C32(0xA4842004),
+	SPH_C32(0x69C8F04A), SPH_C32(0x9E1F9B5E),
+	SPH_C32(0x21C66842), SPH_C32(0xF6E96C9A),
+	SPH_C32(0x670C9C61), SPH_C32(0xABD388F0),
+	SPH_C32(0x6A51A0D2), SPH_C32(0xD8542F68),
+	SPH_C32(0x960FA728), SPH_C32(0xAB5133A3),
+	SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
+};
+
+static const sph_u32 RK5[32] = {
+	SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
+	SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
+	SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
+	SPH_C32(0x8CEE8619), SPH_C32(0x456F9FB4),
+	SPH_C32(0x7D84A5C3), SPH_C32(0x3B8B5EBE),
+	SPH_C32(0xE06F75D8), SPH_C32(0x85C12073),
+	SPH_C32(0x401A449F), SPH_C32(0x56C16AA6),
+	SPH_C32(0x4ED3AA62), SPH_C32(0x363F7706),
+	SPH_C32(0x1BFEDF72), SPH_C32(0x429B023D),
+	SPH_C32(0x37D0D724), SPH_C32(0xD00A1248),
+	SPH_C32(0xDB0FEAD3), SPH_C32(0x49F1C09B),
+	SPH_C32(0x075372C9), SPH_C32(0x80991B7B),
+	SPH_C32(0x25D479D8), SPH_C32(0xF6E8DEF7),
+	SPH_C32(0xE3FE501A), SPH_C32(0xB6794C3B),
+	SPH_C32(0x976CE0BD), SPH_C32(0x04C006BA),
+	SPH_C32(0xC1A94FB6), SPH_C32(0x409F60C4)
+};
+
+#else
+
+#define PASS1(n, in)   do { \
+   STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in( 0), SPH_C32(0x00000000)); \
+   STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in( 1), SPH_C32(0x00000000)); \
+   STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in( 2), SPH_C32(0x00000000)); \
+   STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in( 3), SPH_C32(0x00000000)); \
+   STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in( 4), SPH_C32(0x00000000)); \
+   STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in( 5), SPH_C32(0x00000000)); \
+   STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in( 6), SPH_C32(0x00000000)); \
+   STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in( 7), SPH_C32(0x00000000)); \
+ \
+   STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in( 8), SPH_C32(0x00000000)); \
+   STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in( 9), SPH_C32(0x00000000)); \
+   STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in(10), SPH_C32(0x00000000)); \
+   STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in(11), SPH_C32(0x00000000)); \
+   STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in(12), SPH_C32(0x00000000)); \
+   STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in(13), SPH_C32(0x00000000)); \
+   STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in(14), SPH_C32(0x00000000)); \
+   STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in(15), SPH_C32(0x00000000)); \
+ \
+   STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in(16), SPH_C32(0x00000000)); \
+   STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in(17), SPH_C32(0x00000000)); \
+   STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in(18), SPH_C32(0x00000000)); \
+   STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in(19), SPH_C32(0x00000000)); \
+   STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in(20), SPH_C32(0x00000000)); \
+   STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in(21), SPH_C32(0x00000000)); \
+   STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in(22), SPH_C32(0x00000000)); \
+   STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in(23), SPH_C32(0x00000000)); \
+ \
+   STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in(24), SPH_C32(0x00000000)); \
+   STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in(25), SPH_C32(0x00000000)); \
+   STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in(26), SPH_C32(0x00000000)); \
+   STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in(27), SPH_C32(0x00000000)); \
+   STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in(28), SPH_C32(0x00000000)); \
+   STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in(29), SPH_C32(0x00000000)); \
+   STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in(30), SPH_C32(0x00000000)); \
+   STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in(31), SPH_C32(0x00000000)); \
+	} while (0)
+
+#define PASS2(n, in)   do { \
+   STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in( 5), SPH_C32(0x452821E6)); \
+   STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in(14), SPH_C32(0x38D01377)); \
+   STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(26), SPH_C32(0xBE5466CF)); \
+   STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in(18), SPH_C32(0x34E90C6C)); \
+   STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in(11), SPH_C32(0xC0AC29B7)); \
+   STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(28), SPH_C32(0xC97C50DD)); \
+   STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in( 7), SPH_C32(0x3F84D5B5)); \
+   STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in(16), SPH_C32(0xB5470917)); \
+ \
+   STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in( 0), SPH_C32(0x9216D5D9)); \
+   STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in(23), SPH_C32(0x8979FB1B)); \
+   STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(20), SPH_C32(0xD1310BA6)); \
+   STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in(22), SPH_C32(0x98DFB5AC)); \
+   STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in( 1), SPH_C32(0x2FFD72DB)); \
+   STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(10), SPH_C32(0xD01ADFB7)); \
+   STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in( 4), SPH_C32(0xB8E1AFED)); \
+   STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in( 8), SPH_C32(0x6A267E96)); \
+ \
+   STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in(30), SPH_C32(0xBA7C9045)); \
+   STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in( 3), SPH_C32(0xF12C7F99)); \
+   STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(21), SPH_C32(0x24A19947)); \
+   STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in( 9), SPH_C32(0xB3916CF7)); \
+   STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in(17), SPH_C32(0x0801F2E2)); \
+   STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(24), SPH_C32(0x858EFC16)); \
+   STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in(29), SPH_C32(0x636920D8)); \
+   STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in( 6), SPH_C32(0x71574E69)); \
+ \
+   STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in(19), SPH_C32(0xA458FEA3)); \
+   STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in(12), SPH_C32(0xF4933D7E)); \
+   STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(15), SPH_C32(0x0D95748F)); \
+   STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in(13), SPH_C32(0x728EB658)); \
+   STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in( 2), SPH_C32(0x718BCD58)); \
+   STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(25), SPH_C32(0x82154AEE)); \
+   STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in(31), SPH_C32(0x7B54A41D)); \
+   STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in(27), SPH_C32(0xC25A59B5)); \
+	} while (0)
+
+#define PASS3(n, in)   do { \
+   STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(19), SPH_C32(0x9C30D539)); \
+   STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in( 9), SPH_C32(0x2AF26013)); \
+   STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in( 4), SPH_C32(0xC5D1B023)); \
+   STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in(20), SPH_C32(0x286085F0)); \
+   STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in(28), SPH_C32(0xCA417918)); \
+   STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in(17), SPH_C32(0xB8DB38EF)); \
+   STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in( 8), SPH_C32(0x8E79DCB0)); \
+   STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in(22), SPH_C32(0x603A180E)); \
+ \
+   STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(29), SPH_C32(0x6C9E0E8B)); \
+   STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in(14), SPH_C32(0xB01E8A3E)); \
+   STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in(25), SPH_C32(0xD71577C1)); \
+   STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in(12), SPH_C32(0xBD314B27)); \
+   STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in(24), SPH_C32(0x78AF2FDA)); \
+   STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in(30), SPH_C32(0x55605C60)); \
+   STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in(16), SPH_C32(0xE65525F3)); \
+   STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in(26), SPH_C32(0xAA55AB94)); \
+ \
+   STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(31), SPH_C32(0x57489862)); \
+   STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in(15), SPH_C32(0x63E81440)); \
+   STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in( 7), SPH_C32(0x55CA396A)); \
+   STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in( 3), SPH_C32(0x2AAB10B6)); \
+   STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in( 1), SPH_C32(0xB4CC5C34)); \
+   STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in( 0), SPH_C32(0x1141E8CE)); \
+   STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in(18), SPH_C32(0xA15486AF)); \
+   STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in(27), SPH_C32(0x7C72E993)); \
+ \
+   STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(13), SPH_C32(0xB3EE1411)); \
+   STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in( 6), SPH_C32(0x636FBC2A)); \
+   STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in(21), SPH_C32(0x2BA9C55D)); \
+   STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in(10), SPH_C32(0x741831F6)); \
+   STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in(23), SPH_C32(0xCE5C3E16)); \
+   STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in(11), SPH_C32(0x9B87931E)); \
+   STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in( 5), SPH_C32(0xAFD6BA33)); \
+   STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in( 2), SPH_C32(0x6C24CF5C)); \
+	} while (0)
+
+#define PASS4(n, in)   do { \
+   STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in(24), SPH_C32(0x7A325381)); \
+   STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in( 4), SPH_C32(0x28958677)); \
+   STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in( 0), SPH_C32(0x3B8F4898)); \
+   STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(14), SPH_C32(0x6B4BB9AF)); \
+   STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in( 2), SPH_C32(0xC4BFE81B)); \
+   STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in( 7), SPH_C32(0x66282193)); \
+   STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(28), SPH_C32(0x61D809CC)); \
+   STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in(23), SPH_C32(0xFB21A991)); \
+ \
+   STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in(26), SPH_C32(0x487CAC60)); \
+   STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in( 6), SPH_C32(0x5DEC8032)); \
+   STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in(30), SPH_C32(0xEF845D5D)); \
+   STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(20), SPH_C32(0xE98575B1)); \
+   STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in(18), SPH_C32(0xDC262302)); \
+   STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in(25), SPH_C32(0xEB651B88)); \
+   STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(19), SPH_C32(0x23893E81)); \
+   STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in( 3), SPH_C32(0xD396ACC5)); \
+ \
+   STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in(22), SPH_C32(0x0F6D6FF3)); \
+   STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in(11), SPH_C32(0x83F44239)); \
+   STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in(31), SPH_C32(0x2E0B4482)); \
+   STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(21), SPH_C32(0xA4842004)); \
+   STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in( 8), SPH_C32(0x69C8F04A)); \
+   STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in(27), SPH_C32(0x9E1F9B5E)); \
+   STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(12), SPH_C32(0x21C66842)); \
+   STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in( 9), SPH_C32(0xF6E96C9A)); \
+ \
+   STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in( 1), SPH_C32(0x670C9C61)); \
+   STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in(29), SPH_C32(0xABD388F0)); \
+   STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in( 5), SPH_C32(0x6A51A0D2)); \
+   STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(15), SPH_C32(0xD8542F68)); \
+   STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in(17), SPH_C32(0x960FA728)); \
+   STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in(10), SPH_C32(0xAB5133A3)); \
+   STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(16), SPH_C32(0x6EEF0B6C)); \
+   STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in(13), SPH_C32(0x137A3BE4)); \
+	} while (0)
+
+#define PASS5(n, in)   do { \
+   STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in(27), SPH_C32(0xBA3BF050)); \
+   STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in( 3), SPH_C32(0x7EFB2A98)); \
+   STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(21), SPH_C32(0xA1F1651D)); \
+   STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in(26), SPH_C32(0x39AF0176)); \
+   STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in(17), SPH_C32(0x66CA593E)); \
+   STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in(11), SPH_C32(0x82430E88)); \
+   STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(20), SPH_C32(0x8CEE8619)); \
+   STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(29), SPH_C32(0x456F9FB4)); \
+ \
+   STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in(19), SPH_C32(0x7D84A5C3)); \
+   STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in( 0), SPH_C32(0x3B8B5EBE)); \
+   STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(12), SPH_C32(0xE06F75D8)); \
+   STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in( 7), SPH_C32(0x85C12073)); \
+   STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in(13), SPH_C32(0x401A449F)); \
+   STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in( 8), SPH_C32(0x56C16AA6)); \
+   STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(31), SPH_C32(0x4ED3AA62)); \
+   STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(10), SPH_C32(0x363F7706)); \
+ \
+   STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in( 5), SPH_C32(0x1BFEDF72)); \
+   STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in( 9), SPH_C32(0x429B023D)); \
+   STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(14), SPH_C32(0x37D0D724)); \
+   STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in(30), SPH_C32(0xD00A1248)); \
+   STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in(18), SPH_C32(0xDB0FEAD3)); \
+   STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in( 6), SPH_C32(0x49F1C09B)); \
+   STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(28), SPH_C32(0x075372C9)); \
+   STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(24), SPH_C32(0x80991B7B)); \
+ \
+   STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in( 2), SPH_C32(0x25D479D8)); \
+   STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in(23), SPH_C32(0xF6E8DEF7)); \
+   STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(16), SPH_C32(0xE3FE501A)); \
+   STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in(22), SPH_C32(0xB6794C3B)); \
+   STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in( 4), SPH_C32(0x976CE0BD)); \
+   STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in( 1), SPH_C32(0x04C006BA)); \
+   STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(25), SPH_C32(0xC1A94FB6)); \
+   STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(15), SPH_C32(0x409F60C4)); \
+	} while (0)
+
+#endif
+
+#define SAVE_STATE \
+	sph_u32 u0, u1, u2, u3, u4, u5, u6, u7; \
+	do { \
+		u0 = s0; \
+		u1 = s1; \
+		u2 = s2; \
+		u3 = s3; \
+		u4 = s4; \
+		u5 = s5; \
+		u6 = s6; \
+		u7 = s7; \
+	} while (0)
+
+#define UPDATE_STATE   do { \
+		s0 = SPH_T32(s0 + u0); \
+		s1 = SPH_T32(s1 + u1); \
+		s2 = SPH_T32(s2 + u2); \
+		s3 = SPH_T32(s3 + u3); \
+		s4 = SPH_T32(s4 + u4); \
+		s5 = SPH_T32(s5 + u5); \
+		s6 = SPH_T32(s6 + u6); \
+		s7 = SPH_T32(s7 + u7); \
+	} while (0)
+
+/*
+ * COREn(in) performs the core HAVAL computation for "n" passes, using
+ * the one-argument macro "in" to access the input words. Running state
+ * is held in variable "s0" to "s7".
+ */
+
+#define CORE3(in)  do { \
+		SAVE_STATE; \
+		PASS1(3, in); \
+		PASS2(3, in); \
+		PASS3(3, in); \
+		UPDATE_STATE; \
+	} while (0)
+
+#define CORE4(in)  do { \
+		SAVE_STATE; \
+		PASS1(4, in); \
+		PASS2(4, in); \
+		PASS3(4, in); \
+		PASS4(4, in); \
+		UPDATE_STATE; \
+	} while (0)
+
+#define CORE5(in)  do { \
+		SAVE_STATE; \
+		PASS1(5, in); \
+		PASS2(5, in); \
+		PASS3(5, in); \
+		PASS4(5, in); \
+		PASS5(5, in); \
+		UPDATE_STATE; \
+	} while (0)
+
+/*
+ * DSTATE declares the state variables "s0" to "s7".
+ */
+#define DSTATE   sph_u32 s0, s1, s2, s3, s4, s5, s6, s7
+
+/*
+ * RSTATE fills the state variables from the context "sc".
+ */
+#define RSTATE   do { \
+		s0 = sc->s0; \
+		s1 = sc->s1; \
+		s2 = sc->s2; \
+		s3 = sc->s3; \
+		s4 = sc->s4; \
+		s5 = sc->s5; \
+		s6 = sc->s6; \
+		s7 = sc->s7; \
+	} while (0)
+
+/*
+ * WSTATE updates the context "sc" from the state variables.
+ */
+#define WSTATE   do { \
+		sc->s0 = s0; \
+		sc->s1 = s1; \
+		sc->s2 = s2; \
+		sc->s3 = s3; \
+		sc->s4 = s4; \
+		sc->s5 = s5; \
+		sc->s6 = s6; \
+		sc->s7 = s7; \
+	} while (0)
+
+/*
+ * Initialize a context. "olen" is the output length, in 32-bit words
+ * (between 4 and 8, inclusive). "passes" is the number of passes
+ * (3, 4 or 5).
+ */
+static void
+haval_init(sph_haval_context *sc, unsigned olen, unsigned passes)
+{
+	sc->s0 = SPH_C32(0x243F6A88);
+	sc->s1 = SPH_C32(0x85A308D3);
+	sc->s2 = SPH_C32(0x13198A2E);
+	sc->s3 = SPH_C32(0x03707344);
+	sc->s4 = SPH_C32(0xA4093822);
+	sc->s5 = SPH_C32(0x299F31D0);
+	sc->s6 = SPH_C32(0x082EFA98);
+	sc->s7 = SPH_C32(0xEC4E6C89);
+	sc->olen = olen;
+	sc->passes = passes;
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_high = 0;
+	sc->count_low = 0;
+#endif
+	
+}
+
+/*
+ * IN_PREPARE(data) contains declarations and code to prepare for
+ * reading input words pointed to by "data".
+ * INW(i) reads the word number "i" (from 0 to 31).
+ */
+#if SPH_LITTLE_FAST
+#define IN_PREPARE(indata)   const unsigned char *const load_ptr = \
+                             (const unsigned char *)(indata)
+#define INW(i)   sph_dec32le_aligned(load_ptr + 4 * (i))
+#else
+#define IN_PREPARE(indata) \
+	sph_u32 X_var[32]; \
+	int load_index; \
+ \
+	for (load_index = 0; load_index < 32; load_index ++) \
+		X_var[load_index] = sph_dec32le_aligned( \
+			(const unsigned char *)(indata) + 4 * load_index)
+#define INW(i)   X_var[i]
+#endif
+
+/*
+ * Mixing operation used for 128-bit output tailoring. This function
+ * takes the byte 0 from a0, byte 1 from a1, byte 2 from a2 and byte 3
+ * from a3, and combines them into a 32-bit word, which is then rotated
+ * to the left by n bits.
+ */
+static SPH_INLINE sph_u32
+mix128(sph_u32 a0, sph_u32 a1, sph_u32 a2, sph_u32 a3, int n)
+{
+	sph_u32 tmp;
+
+	tmp = (a0 & SPH_C32(0x000000FF))
+		| (a1 & SPH_C32(0x0000FF00))
+		| (a2 & SPH_C32(0x00FF0000))
+		| (a3 & SPH_C32(0xFF000000));
+	if (n > 0)
+		tmp = SPH_ROTL32(tmp, n);
+	return tmp;
+}
+
+/*
+ * Mixing operation used to compute output word 0 for 160-bit output.
+ */
+static SPH_INLINE sph_u32
+mix160_0(sph_u32 x5, sph_u32 x6, sph_u32 x7)
+{
+	sph_u32 tmp;
+
+	tmp = (x5 & SPH_C32(0x01F80000))
+		| (x6 & SPH_C32(0xFE000000))
+		| (x7 & SPH_C32(0x0000003F));
+	return SPH_ROTL32(tmp, 13);
+}
+
+/*
+ * Mixing operation used to compute output word 1 for 160-bit output.
+ */
+static SPH_INLINE sph_u32
+mix160_1(sph_u32 x5, sph_u32 x6, sph_u32 x7)
+{
+	sph_u32 tmp;
+
+	tmp = (x5 & SPH_C32(0xFE000000))
+		| (x6 & SPH_C32(0x0000003F))
+		| (x7 & SPH_C32(0x00000FC0));
+	return SPH_ROTL32(tmp, 7);
+}
+
+/*
+ * Mixing operation used to compute output word 2 for 160-bit output.
+ */
+static SPH_INLINE sph_u32
+mix160_2(sph_u32 x5, sph_u32 x6, sph_u32 x7)
+{
+	sph_u32 tmp;
+
+	tmp = (x5 & SPH_C32(0x0000003F))
+		| (x6 & SPH_C32(0x00000FC0))
+		| (x7 & SPH_C32(0x0007F000));
+	return tmp;
+}
+
+/*
+ * Mixing operation used to compute output word 3 for 160-bit output.
+ */
+static SPH_INLINE sph_u32
+mix160_3(sph_u32 x5, sph_u32 x6, sph_u32 x7)
+{
+	sph_u32 tmp;
+
+	tmp = (x5 & SPH_C32(0x00000FC0))
+		| (x6 & SPH_C32(0x0007F000))
+		| (x7 & SPH_C32(0x01F80000));
+	return tmp >> 6;
+}
+
+/*
+ * Mixing operation used to compute output word 4 for 160-bit output.
+ */
+static SPH_INLINE sph_u32
+mix160_4(sph_u32 x5, sph_u32 x6, sph_u32 x7)
+{
+	sph_u32 tmp;
+
+	tmp = (x5 & SPH_C32(0x0007F000))
+		| (x6 & SPH_C32(0x01F80000))
+		| (x7 & SPH_C32(0xFE000000));
+	return tmp >> 12;
+}
+
+/*
+ * Mixing operation used to compute output word 0 for 192-bit output.
+ */
+static SPH_INLINE sph_u32
+mix192_0(sph_u32 x6, sph_u32 x7)
+{
+	sph_u32 tmp;
+
+	tmp = (x6 & SPH_C32(0xFC000000)) | (x7 & SPH_C32(0x0000001F));
+	return SPH_ROTL32(tmp, 6);
+}
+
+/*
+ * Mixing operation used to compute output word 1 for 192-bit output.
+ */
+static SPH_INLINE sph_u32
+mix192_1(sph_u32 x6, sph_u32 x7)
+{
+	return (x6 & SPH_C32(0x0000001F)) | (x7 & SPH_C32(0x000003E0));
+}
+
+/*
+ * Mixing operation used to compute output word 2 for 192-bit output.
+ */
+static SPH_INLINE sph_u32
+mix192_2(sph_u32 x6, sph_u32 x7)
+{
+	return ((x6 & SPH_C32(0x000003E0)) | (x7 & SPH_C32(0x0000FC00))) >> 5;
+}
+
+/*
+ * Mixing operation used to compute output word 3 for 192-bit output.
+ */
+static SPH_INLINE sph_u32
+mix192_3(sph_u32 x6, sph_u32 x7)
+{
+	return ((x6 & SPH_C32(0x0000FC00)) | (x7 & SPH_C32(0x001F0000))) >> 10;
+}
+
+/*
+ * Mixing operation used to compute output word 4 for 192-bit output.
+ */
+static SPH_INLINE sph_u32
+mix192_4(sph_u32 x6, sph_u32 x7)
+{
+	return ((x6 & SPH_C32(0x001F0000)) | (x7 & SPH_C32(0x03E00000))) >> 16;
+}
+
+/*
+ * Mixing operation used to compute output word 5 for 192-bit output.
+ */
+static SPH_INLINE sph_u32
+mix192_5(sph_u32 x6, sph_u32 x7)
+{
+	return ((x6 & SPH_C32(0x03E00000)) | (x7 & SPH_C32(0xFC000000))) >> 21;
+}
+
+/*
+ * Write out HAVAL output. The output length is tailored to the requested
+ * length.
+ */
+static void
+haval_out(sph_haval_context *sc, void *dst)
+{
+	DSTATE;
+	unsigned char *buf;
+
+	buf = dst;
+	RSTATE;
+	switch (sc->olen) {
+	case 4:
+		sph_enc32le(buf,      SPH_T32(s0 + mix128(s7, s4, s5, s6, 24)));
+		sph_enc32le(buf + 4,  SPH_T32(s1 + mix128(s6, s7, s4, s5, 16)));
+		sph_enc32le(buf + 8,  SPH_T32(s2 + mix128(s5, s6, s7, s4, 8)));
+		sph_enc32le(buf + 12, SPH_T32(s3 + mix128(s4, s5, s6, s7, 0)));
+		break;
+	case 5:
+		sph_enc32le(buf,      SPH_T32(s0 + mix160_0(s5, s6, s7)));
+		sph_enc32le(buf + 4,  SPH_T32(s1 + mix160_1(s5, s6, s7)));
+		sph_enc32le(buf + 8,  SPH_T32(s2 + mix160_2(s5, s6, s7)));
+		sph_enc32le(buf + 12, SPH_T32(s3 + mix160_3(s5, s6, s7)));
+		sph_enc32le(buf + 16, SPH_T32(s4 + mix160_4(s5, s6, s7)));
+		break;
+	case 6:
+		sph_enc32le(buf,      SPH_T32(s0 + mix192_0(s6, s7)));
+		sph_enc32le(buf + 4,  SPH_T32(s1 + mix192_1(s6, s7)));
+		sph_enc32le(buf + 8,  SPH_T32(s2 + mix192_2(s6, s7)));
+		sph_enc32le(buf + 12, SPH_T32(s3 + mix192_3(s6, s7)));
+		sph_enc32le(buf + 16, SPH_T32(s4 + mix192_4(s6, s7)));
+		sph_enc32le(buf + 20, SPH_T32(s5 + mix192_5(s6, s7)));
+		break;
+	case 7:
+		sph_enc32le(buf,      SPH_T32(s0 + ((s7 >> 27) & 0x1F)));
+		sph_enc32le(buf + 4,  SPH_T32(s1 + ((s7 >> 22) & 0x1F)));
+		sph_enc32le(buf + 8,  SPH_T32(s2 + ((s7 >> 18) & 0x0F)));
+		sph_enc32le(buf + 12, SPH_T32(s3 + ((s7 >> 13) & 0x1F)));
+		sph_enc32le(buf + 16, SPH_T32(s4 + ((s7 >>  9) & 0x0F)));
+		sph_enc32le(buf + 20, SPH_T32(s5 + ((s7 >>  4) & 0x1F)));
+		sph_enc32le(buf + 24, SPH_T32(s6 + ((s7      ) & 0x0F)));
+		break;
+	case 8:
+		sph_enc32le(buf,      s0);
+		sph_enc32le(buf + 4,  s1);
+		sph_enc32le(buf + 8,  s2);
+		sph_enc32le(buf + 12, s3);
+		sph_enc32le(buf + 16, s4);
+		sph_enc32le(buf + 20, s5);
+		sph_enc32le(buf + 24, s6);
+		sph_enc32le(buf + 28, s7);
+		break;
+	}
+}
+
+/*
+ * The main core functions inline the code with the COREx() macros. We
+ * use a helper file, included three times, which avoids code copying.
+ */
+
+#undef PASSES
+#define PASSES   3
+#include "haval_helper.c"
+
+#undef PASSES
+#define PASSES   4
+#include "haval_helper.c"
+
+#undef PASSES
+#define PASSES   5
+#include "haval_helper.c"
+
+/* ====================================================================== */
+
+#define API(xxx, y) \
+void \
+sph_haval ## xxx ## _ ## y ## _init(void *cc) \
+{ \
+	haval_init(cc, xxx >> 5, y); \
+} \
+ \
+void \
+sph_haval ## xxx ## _ ## y (void *cc, const void *data, size_t len) \
+{ \
+	haval ## y(cc, data, len); \
+} \
+ \
+void \
+sph_haval ## xxx ## _ ## y ## _close(void *cc, void *dst) \
+{ \
+	haval ## y ## _close(cc, 0, 0, dst); \
+} \
+ \
+void \
+sph_haval ## xxx ## _ ## y ## addbits_and_close( \
+	void *cc, unsigned ub, unsigned n, void *dst) \
+{ \
+	haval ## y ## _close(cc, ub, n, dst); \
+}
+
+API(128, 3)
+API(128, 4)
+API(128, 5)
+API(160, 3)
+API(160, 4)
+API(160, 5)
+API(192, 3)
+API(192, 4)
+API(192, 5)
+API(224, 3)
+API(224, 4)
+API(224, 5)
+API(256, 3)
+API(256, 4)
+API(256, 5)
+
+#define RVAL   do { \
+		s0 = val[0]; \
+		s1 = val[1]; \
+		s2 = val[2]; \
+		s3 = val[3]; \
+		s4 = val[4]; \
+		s5 = val[5]; \
+		s6 = val[6]; \
+		s7 = val[7]; \
+	} while (0)
+
+#define WVAL   do { \
+		val[0] = s0; \
+		val[1] = s1; \
+		val[2] = s2; \
+		val[3] = s3; \
+		val[4] = s4; \
+		val[5] = s5; \
+		val[6] = s6; \
+		val[7] = s7; \
+	} while (0)
+
+#define INMSG(i)   msg[i]
+
+/* see sph_haval.h */
+void
+sph_haval_3_comp(const sph_u32 msg[32], sph_u32 val[8])
+{
+	DSTATE;
+
+	RVAL;
+	CORE3(INMSG);
+	WVAL;
+}
+
+/* see sph_haval.h */
+void
+sph_haval_4_comp(const sph_u32 msg[32], sph_u32 val[8])
+{
+	DSTATE;
+
+	RVAL;
+	CORE4(INMSG);
+	WVAL;
+}
+
+/* see sph_haval.h */
+void
+sph_haval_5_comp(const sph_u32 msg[32], sph_u32 val[8])
+{
+	DSTATE;
+
+	RVAL;
+	CORE5(INMSG);
+	WVAL;
+}
+
+#ifdef __cplusplus
+}
+#endif	
--- a/sph/haval_helper.c
+++ b/sph/haval_helper.c
@ -0,0 +1,190 @@
+/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
+/*
+ * Helper code, included (three times !) by HAVAL implementation.
+ *
+ * TODO: try to merge this with md_helper.c.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)   a ## b
+
+static void
+#ifdef SPH_UPTR
+SPH_XCAT(SPH_XCAT(haval, PASSES), _short)
+#else
+SPH_XCAT(haval, PASSES)
+#endif
+(sph_haval_context *sc, const void *data, size_t len)
+{
+	unsigned current;
+
+#if SPH_64
+	current = (unsigned)sc->count & 127U;
+#else
+	current = (unsigned)sc->count_low & 127U;
+#endif
+	while (len > 0) {
+		unsigned clen;
+#if !SPH_64
+		sph_u32 clow, clow2;
+#endif
+
+		clen = 128U - current;
+		if (clen > len)
+			clen = len;
+		memcpy(sc->buf + current, data, clen);
+		data = (const unsigned char *)data + clen;
+		current += clen;
+		len -= clen;
+		if (current == 128U) {
+			DSTATE;
+			IN_PREPARE(sc->buf);
+			RSTATE;
+			SPH_XCAT(CORE, PASSES)(INW);
+			WSTATE;
+			current = 0;
+		}
+#if SPH_64
+		sc->count += clen;
+#else
+		clow = sc->count_low;
+		clow2 = SPH_T32(clow + clen);
+		sc->count_low = clow2;
+		if (clow2 < clow)
+			sc->count_high ++;
+#endif
+	}
+}
+
+#ifdef SPH_UPTR
+static void
+SPH_XCAT(haval, PASSES)(sph_haval_context *sc, const void *data, size_t len)
+{
+	unsigned current;
+	size_t orig_len;
+#if !SPH_64
+	sph_u32 clow, clow2;
+#endif
+	DSTATE;
+
+	if (len < 256U) {
+		SPH_XCAT(SPH_XCAT(haval, PASSES), _short)(sc, data, len);
+		return;
+	}
+#if SPH_64
+	current = (unsigned)sc->count & 127U;
+#else
+	current = (unsigned)sc->count_low & 127U;
+#endif
+	if (current > 0) {
+		unsigned clen;
+		clen = 128U - current;
+		SPH_XCAT(SPH_XCAT(haval, PASSES), _short)(sc, data, clen);
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+	}
+#if !SPH_UNALIGNED
+	if (((SPH_UPTR)data & 3U) != 0) {
+		SPH_XCAT(SPH_XCAT(haval, PASSES), _short)(sc, data, len);
+		return;
+	}
+#endif
+	orig_len = len;
+	RSTATE;
+	while (len >= 128U) {
+		IN_PREPARE(data);
+		SPH_XCAT(CORE, PASSES)(INW);
+		data = (const unsigned char *)data + 128U;
+		len -= 128U;
+	}
+	WSTATE;
+	if (len > 0)
+		memcpy(sc->buf, data, len);
+#if SPH_64
+	sc->count += (sph_u64)orig_len;
+#else
+	clow = sc->count_low;
+	clow2 = SPH_T32(clow + orig_len);
+	sc->count_low = clow2;
+	if (clow2 < clow)
+		sc->count_high ++;
+	orig_len >>= 12;
+	orig_len >>= 10;
+	orig_len >>= 10;
+	sc->count_high += orig_len;
+#endif
+}
+#endif
+
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _close)(sph_haval_context *sc,
+	unsigned ub, unsigned n, void *dst)
+{
+	unsigned current,j;
+	DSTATE;
+
+#if SPH_64
+	current = (unsigned)sc->count & 127U;
+#else
+	current = (unsigned)sc->count_low & 127U;
+#endif
+	sc->buf[current ++] = (0x01 << n) | ((ub & 0xFF) >> (8 - n));
+	RSTATE;
+	if (current > 118U) {
+		memset(sc->buf + current, 0, 128U - current);
+
+		do {
+			IN_PREPARE(sc->buf);
+			SPH_XCAT(CORE, PASSES)(INW);
+		} while (0);
+		current = 0;
+	}
+	memset(sc->buf + current, 0, 118U - current);
+	sc->buf[118] = 0x01 | (PASSES << 3);
+	sc->buf[119] = sc->olen << 3;
+#if SPH_64
+	sph_enc64le_aligned(sc->buf + 120, SPH_T64(sc->count << 3));
+#else
+	sph_enc32le_aligned(sc->buf + 120, SPH_T32(sc->count_low << 3));
+	sph_enc32le_aligned(sc->buf + 124,
+		SPH_T32((sc->count_high << 3) | (sc->count_low >> 29)));
+#endif
+
+	do {
+		IN_PREPARE(sc->buf);
+		SPH_XCAT(CORE, PASSES)(INW);
+	} while (0);
+	WSTATE;
+
+	haval_out(sc, dst);
+	haval_init(sc, sc->olen, sc->passes);
+}
--- a/sph/x15_helper.c
+++ b/sph/x15_helper.c
--- a/sph/sha2big.c
+++ b/sph/sha2big.c
@ -0,0 +1,256 @@
+/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * SHA-384 / SHA-512 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_sha2.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_64
+
+#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
+#define MAJ(X, Y, Z)   (((X) & (Y)) | (((X) | (Y)) & (Z)))
+
+#define ROTR64    SPH_ROTR64
+
+#define BSG5_0(x)      (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
+#define BSG5_1(x)      (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
+#define SSG5_0(x)      (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7))
+#define SSG5_1(x)      (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6))
+
+static const sph_u64 K512[80] = {
+	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
+	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
+	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
+	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
+	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
+	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
+	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
+	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
+	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
+	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
+	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
+	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
+	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
+	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
+	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
+	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
+	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
+	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
+	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
+	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
+	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
+	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
+	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
+	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
+	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
+	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
+	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
+	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
+	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
+	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
+	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
+	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
+	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
+	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
+	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
+	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
+	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
+	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
+	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
+	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
+};
+
+static const sph_u64 H384[8] = {
+	SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
+	SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
+	SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
+	SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
+};
+
+static const sph_u64 H512[8] = {
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
+/*
+ * This macro defines the body for a SHA-384 / SHA-512 compression function
+ * implementation. The "in" parameter should evaluate, when applied to a
+ * numerical input parameter from 0 to 15, to an expression which yields
+ * the corresponding input block. The "r" parameter should evaluate to
+ * an array or pointer expression designating the array of 8 words which
+ * contains the input and output of the compression function.
+ *
+ * SHA-512 is hard for the compiler. If the loop is completely unrolled,
+ * then the code will be quite huge (possibly more than 100 kB), and the
+ * performance will be degraded due to cache misses on the code. We
+ * unroll only eight steps, which avoids all needless copies when
+ * 64-bit registers are swapped.
+ */
+
+#define SHA3_STEP(A, B, C, D, E, F, G, H, i)   do { \
+		sph_u64 T1, T2; \
+		T1 = SPH_T64(H + BSG5_1(E) + CH(E, F, G) + K512[i] + W[i]); \
+		T2 = SPH_T64(BSG5_0(A) + MAJ(A, B, C)); \
+		D = SPH_T64(D + T1); \
+		H = SPH_T64(T1 + T2); \
+	} while (0)
+
+#define SHA3_ROUND_BODY(in, r)   do { \
+		int i; \
+		sph_u64 A, B, C, D, E, F, G, H; \
+		sph_u64 W[80]; \
+ \
+ 		for (i = 0; i < 16; i ++) \
+			W[i] = in(i); \
+			\
+		for (i = 16; i < 80; i ++) \
+ 			W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] \
+				+ SSG5_0(W[i - 15]) + W[i - 16]); \
+		A = (r)[0]; \
+		B = (r)[1]; \
+		C = (r)[2]; \
+		D = (r)[3]; \
+		E = (r)[4]; \
+		F = (r)[5]; \
+		G = (r)[6]; \
+		H = (r)[7]; \
+		for (i = 0; i < 80; i += 8) { \
+			SHA3_STEP(A, B, C, D, E, F, G, H, i + 0); \
+			SHA3_STEP(H, A, B, C, D, E, F, G, i + 1); \
+			SHA3_STEP(G, H, A, B, C, D, E, F, i + 2); \
+			SHA3_STEP(F, G, H, A, B, C, D, E, i + 3); \
+			SHA3_STEP(E, F, G, H, A, B, C, D, i + 4); \
+			SHA3_STEP(D, E, F, G, H, A, B, C, i + 5); \
+			SHA3_STEP(C, D, E, F, G, H, A, B, i + 6); \
+			SHA3_STEP(B, C, D, E, F, G, H, A, i + 7); \
+		} \
+		(r)[0] = SPH_T64((r)[0] + A); \
+		(r)[1] = SPH_T64((r)[1] + B); \
+		(r)[2] = SPH_T64((r)[2] + C); \
+		(r)[3] = SPH_T64((r)[3] + D); \
+		(r)[4] = SPH_T64((r)[4] + E); \
+		(r)[5] = SPH_T64((r)[5] + F); \
+		(r)[6] = SPH_T64((r)[6] + G); \
+		(r)[7] = SPH_T64((r)[7] + H); \
+	} while (0)
+
+/*
+ * One round of SHA-384 / SHA-512. The data must be aligned for 64-bit access.
+ */
+static void
+sha3_round(const unsigned char *data, sph_u64 r[8])
+{
+#define SHA3_IN(x)   sph_dec64be_aligned(data + (8 * (x)))
+	SHA3_ROUND_BODY(SHA3_IN, r);
+#undef SHA3_IN
+}
+
+/* see sph_sha3.h */
+void
+sph_sha384_init(void *cc)
+{
+	sph_sha384_context *sc;
+
+	sc = cc;
+	memcpy(sc->val, H384, sizeof H384);
+	sc->count = 0;
+}
+
+/* see sph_sha3.h */
+void
+sph_sha512_init(void *cc)
+{
+	sph_sha512_context *sc;
+
+	sc = cc;
+	memcpy(sc->val, H512, sizeof H512);
+	sc->count = 0;
+}
+
+#define RFUN   sha3_round
+#define HASH   sha384
+#define BE64   1
+#include "md_helper.c"
+
+/* see sph_sha3.h */
+void
+sph_sha384_close(void *cc, void *dst)
+{
+	sha384_close(cc, dst, 6);
+	sph_sha384_init(cc);
+}
+
+/* see sph_sha3.h */
+void
+sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sha384_addbits_and_close(cc, ub, n, dst, 6);
+	sph_sha384_init(cc);
+}
+
+/* see sph_sha3.h */
+void
+sph_sha512_close(void *cc, void *dst)
+{
+	sha384_close(cc, dst, 8);
+	sph_sha512_init(cc);
+}
+
+/* see sph_sha3.h */
+void
+sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sha384_addbits_and_close(cc, ub, n, dst, 8);
+	sph_sha512_init(cc);
+}
+
+/* see sph_sha3.h */
+void
+sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8])
+{
+#define SHA3_IN(x)   msg[x]
+	SHA3_ROUND_BODY(SHA3_IN, val);
+#undef SHA3_IN
+}
+
+#endif
+#ifdef __cplusplus
+}
+#endif
+
--- a/sph/sph_haval.h
+++ b/sph/sph_haval.h
@ -0,0 +1,976 @@
+/* $Id: sph_haval.h 218 2010-06-08 17:06:34Z tp $ */
+/**
+ * HAVAL interface.
+ *
+ * HAVAL is actually a family of 15 hash functions, depending on whether
+ * the internal computation uses 3, 4 or 5 passes, and on the output
+ * length, which is 128, 160, 192, 224 or 256 bits. This implementation
+ * provides interface functions for all 15, which internally map to
+ * three cores (depending on the number of passes). Note that output
+ * lengths other than 256 bits are not obtained by a simple truncation
+ * of a longer result; the requested length is encoded within the
+ * padding data.
+ *
+ * HAVAL was published in: Yuliang Zheng, Josef Pieprzyk and Jennifer
+ * Seberry: "HAVAL -- a one-way hashing algorithm with variable length
+ * of output", Advances in Cryptology -- AUSCRYPT'92, Lecture Notes in
+ * Computer Science, Vol.718, pp.83-104, Springer-Verlag, 1993.
+ *
+ * This paper, and a reference implementation, are available on the
+ * Calyptix web site: http://labs.calyptix.com/haval.php
+ *
+ * The HAVAL reference paper is quite unclear on the data encoding
+ * details, i.e. endianness (both byte order within a 32-bit word, and
+ * word order within a message block). This implementation has been
+ * made compatible with the reference implementation referenced above.
+ *
+ * @warning   A collision for HAVAL-128/3 (HAVAL with three passes and
+ * 128-bit output) has been published; this function is thus considered
+ * as cryptographically broken. The status for other variants is unclear;
+ * use only with care.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_haval.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_HAVAL_H__
+#define SPH_HAVAL_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for HAVAL-128/3.
+ */
+#define SPH_SIZE_haval128_3   128
+
+/**
+ * Output size (in bits) for HAVAL-128/4.
+ */
+#define SPH_SIZE_haval128_4   128
+
+/**
+ * Output size (in bits) for HAVAL-128/5.
+ */
+#define SPH_SIZE_haval128_5   128
+
+/**
+ * Output size (in bits) for HAVAL-160/3.
+ */
+#define SPH_SIZE_haval160_3   160
+
+/**
+ * Output size (in bits) for HAVAL-160/4.
+ */
+#define SPH_SIZE_haval160_4   160
+
+/**
+ * Output size (in bits) for HAVAL-160/5.
+ */
+#define SPH_SIZE_haval160_5   160
+
+/**
+ * Output size (in bits) for HAVAL-192/3.
+ */
+#define SPH_SIZE_haval192_3   192
+
+/**
+ * Output size (in bits) for HAVAL-192/4.
+ */
+#define SPH_SIZE_haval192_4   192
+
+/**
+ * Output size (in bits) for HAVAL-192/5.
+ */
+#define SPH_SIZE_haval192_5   192
+
+/**
+ * Output size (in bits) for HAVAL-224/3.
+ */
+#define SPH_SIZE_haval224_3   224
+
+/**
+ * Output size (in bits) for HAVAL-224/4.
+ */
+#define SPH_SIZE_haval224_4   224
+
+/**
+ * Output size (in bits) for HAVAL-224/5.
+ */
+#define SPH_SIZE_haval224_5   224
+
+/**
+ * Output size (in bits) for HAVAL-256/3.
+ */
+#define SPH_SIZE_haval256_3   256
+
+/**
+ * Output size (in bits) for HAVAL-256/4.
+ */
+#define SPH_SIZE_haval256_4   256
+
+/**
+ * Output size (in bits) for HAVAL-256/5.
+ */
+#define SPH_SIZE_haval256_5   256
+
+/**
+ * This structure is a context for HAVAL computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a HAVAL computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running HAVAL computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	sph_u32 s0, s1, s2, s3, s4, s5, s6, s7;
+	unsigned olen, passes;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_haval_context;
+
+/**
+ * Type for a HAVAL-128/3 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval128_3_context;
+
+/**
+ * Type for a HAVAL-128/4 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval128_4_context;
+
+/**
+ * Type for a HAVAL-128/5 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval128_5_context;
+
+/**
+ * Type for a HAVAL-160/3 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval160_3_context;
+
+/**
+ * Type for a HAVAL-160/4 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval160_4_context;
+
+/**
+ * Type for a HAVAL-160/5 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval160_5_context;
+
+/**
+ * Type for a HAVAL-192/3 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval192_3_context;
+
+/**
+ * Type for a HAVAL-192/4 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval192_4_context;
+
+/**
+ * Type for a HAVAL-192/5 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval192_5_context;
+
+/**
+ * Type for a HAVAL-224/3 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval224_3_context;
+
+/**
+ * Type for a HAVAL-224/4 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval224_4_context;
+
+/**
+ * Type for a HAVAL-224/5 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval224_5_context;
+
+/**
+ * Type for a HAVAL-256/3 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval256_3_context;
+
+/**
+ * Type for a HAVAL-256/4 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval256_4_context;
+
+/**
+ * Type for a HAVAL-256/5 context (identical to the common context).
+ */
+typedef sph_haval_context sph_haval256_5_context;
+
+/**
+ * Initialize the context for HAVAL-128/3.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval128_3_context</code> structure)
+ */
+void sph_haval128_3_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-128/3. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-128/3 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval128_3(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-128/3 computation. The output buffer must be wide
+ * enough to accomodate the result (16 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-128/3 context
+ * @param dst   the output buffer
+ */
+void sph_haval128_3_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-128/3 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (16
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-128/3 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval128_3_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-128/4.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval128_4_context</code> structure)
+ */
+void sph_haval128_4_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-128/4. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-128/4 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval128_4(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-128/4 computation. The output buffer must be wide
+ * enough to accomodate the result (16 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-128/4 context
+ * @param dst   the output buffer
+ */
+void sph_haval128_4_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-128/4 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (16
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-128/4 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval128_4_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-128/5.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval128_5_context</code> structure)
+ */
+void sph_haval128_5_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-128/5. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-128/5 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval128_5(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-128/5 computation. The output buffer must be wide
+ * enough to accomodate the result (16 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-128/5 context
+ * @param dst   the output buffer
+ */
+void sph_haval128_5_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-128/5 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (16
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-128/5 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval128_5_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-160/3.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval160_3_context</code> structure)
+ */
+void sph_haval160_3_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-160/3. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-160/3 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval160_3(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-160/3 computation. The output buffer must be wide
+ * enough to accomodate the result (20 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-160/3 context
+ * @param dst   the output buffer
+ */
+void sph_haval160_3_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-160/3 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (20
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-160/3 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval160_3_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-160/4.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval160_4_context</code> structure)
+ */
+void sph_haval160_4_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-160/4. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-160/4 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval160_4(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-160/4 computation. The output buffer must be wide
+ * enough to accomodate the result (20 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-160/4 context
+ * @param dst   the output buffer
+ */
+void sph_haval160_4_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-160/4 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (20
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-160/4 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval160_3_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-160/5.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval160_5_context</code> structure)
+ */
+void sph_haval160_5_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-160/5. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-160/5 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval160_5(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-160/5 computation. The output buffer must be wide
+ * enough to accomodate the result (20 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-160/5 context
+ * @param dst   the output buffer
+ */
+void sph_haval160_5_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-160/5 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (20
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-160/5 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval160_5_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-192/3.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval192_3_context</code> structure)
+ */
+void sph_haval192_3_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-192/3. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-192/3 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval192_3(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-192/3 computation. The output buffer must be wide
+ * enough to accomodate the result (24 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-192/3 context
+ * @param dst   the output buffer
+ */
+void sph_haval192_3_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-192/3 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (24
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-192/3 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval192_3_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-192/4.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval192_4_context</code> structure)
+ */
+void sph_haval192_4_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-192/4. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-192/4 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval192_4(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-192/4 computation. The output buffer must be wide
+ * enough to accomodate the result (24 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-192/4 context
+ * @param dst   the output buffer
+ */
+void sph_haval192_4_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-192/4 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (24
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-192/4 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval192_4_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-192/5.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval192_5_context</code> structure)
+ */
+void sph_haval192_5_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-192/5. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-192/5 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval192_5(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-192/5 computation. The output buffer must be wide
+ * enough to accomodate the result (24 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-192/5 context
+ * @param dst   the output buffer
+ */
+void sph_haval192_5_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-192/5 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (24
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-192/5 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval192_5_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-224/3.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval224_3_context</code> structure)
+ */
+void sph_haval224_3_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-224/3. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-224/3 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval224_3(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-224/3 computation. The output buffer must be wide
+ * enough to accomodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-224/3 context
+ * @param dst   the output buffer
+ */
+void sph_haval224_3_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-224/3 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (28
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-224/3 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval224_3_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-224/4.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval224_4_context</code> structure)
+ */
+void sph_haval224_4_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-224/4. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-224/4 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval224_4(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-224/4 computation. The output buffer must be wide
+ * enough to accomodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-224/4 context
+ * @param dst   the output buffer
+ */
+void sph_haval224_4_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-224/4 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (28
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-224/4 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval224_4_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-224/5.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval224_5_context</code> structure)
+ */
+void sph_haval224_5_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-224/5. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-224/5 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval224_5(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-224/5 computation. The output buffer must be wide
+ * enough to accomodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-224/5 context
+ * @param dst   the output buffer
+ */
+void sph_haval224_5_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-224/5 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (28
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-224/5 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval224_5_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-256/3.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval256_3_context</code> structure)
+ */
+void sph_haval256_3_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-256/3. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-256/3 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval256_3(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-256/3 computation. The output buffer must be wide
+ * enough to accomodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-256/3 context
+ * @param dst   the output buffer
+ */
+void sph_haval256_3_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-256/3 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (32
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-256/3 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval256_3_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-256/4.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval256_4_context</code> structure)
+ */
+void sph_haval256_4_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-256/4. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-256/4 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval256_4(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-256/4 computation. The output buffer must be wide
+ * enough to accomodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-256/4 context
+ * @param dst   the output buffer
+ */
+void sph_haval256_4_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-256/4 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (32
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-256/4 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval256_4_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize the context for HAVAL-256/5.
+ *
+ * @param cc   context to initialize (pointer to a
+ *             <code>sph_haval256_5_context</code> structure)
+ */
+void sph_haval256_5_init(void *cc);
+
+/**
+ * Process some data bytes for HAVAL-256/5. If <code>len</code> is 0,
+ * then this function does nothing.
+ *
+ * @param cc     the HAVAL-256/5 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_haval256_5(void *cc, const void *data, size_t len);
+
+/**
+ * Close a HAVAL-256/5 computation. The output buffer must be wide
+ * enough to accomodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the HAVAL-256/5 context
+ * @param dst   the output buffer
+ */
+void sph_haval256_5_close(void *cc, void *dst);
+
+/**
+ * Close a HAVAL-256/5 computation. Up to 7 extra input bits may be added
+ * to the input message; these are the <code>n</code> upper bits of
+ * the <code>ub</code> byte (i.e. the first extra bit has value 128 in
+ * <code>ub</code>, the second extra bit has value 64, and so on). Other
+ * bits in <code>ub</code> are ignored.
+ *
+ * The output buffer must be wide enough to accomodate the result (32
+ * bytes). The context is automatically reinitialized.
+ *
+ * @param cc    the HAVAL-256/5 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the output buffer
+ */
+void sph_haval256_5_addbits_and_close(void *cc,
+	unsigned ub, unsigned n, void *dst);
+
+/**
+ * Apply the HAVAL compression function on the provided data. The
+ * <code>msg</code> parameter contains the 32 32-bit input blocks,
+ * as numerical values (hence after the little-endian decoding). The
+ * <code>val</code> parameter contains the 8 32-bit input blocks for
+ * the compression function; the output is written in place in this
+ * array. This function uses three internal passes.
+ *
+ * @param msg   the message block (32 values)
+ * @param val   the function 256-bit input and output
+ */
+void sph_haval_3_comp(const sph_u32 msg[32], sph_u32 val[8]);
+
+/**
+ * Apply the HAVAL compression function on the provided data. The
+ * <code>msg</code> parameter contains the 32 32-bit input blocks,
+ * as numerical values (hence after the little-endian decoding). The
+ * <code>val</code> parameter contains the 8 32-bit input blocks for
+ * the compression function; the output is written in place in this
+ * array. This function uses four internal passes.
+ *
+ * @param msg   the message block (32 values)
+ * @param val   the function 256-bit input and output
+ */
+void sph_haval_4_comp(const sph_u32 msg[32], sph_u32 val[8]);
+
+/**
+ * Apply the HAVAL compression function on the provided data. The
+ * <code>msg</code> parameter contains the 32 32-bit input blocks,
+ * as numerical values (hence after the little-endian decoding). The
+ * <code>val</code> parameter contains the 8 32-bit input blocks for
+ * the compression function; the output is written in place in this
+ * array. This function uses five internal passes.
+ *
+ * @param msg   the message block (32 values)
+ * @param val   the function 256-bit input and output
+ */
+void sph_haval_5_comp(const sph_u32 msg[32], sph_u32 val[8]);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/sph/sph_sha2.h
+++ b/sph/sph_sha2.h
@ -0,0 +1,378 @@
+/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * SHA-224, SHA-256, SHA-384 and SHA-512 interface.
+ *
+ * SHA-256 has been published in FIPS 180-2, now amended with a change
+ * notice to include SHA-224 as well (which is a simple variation on
+ * SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS
+ * standards can be found at:
+ *    http://csrc.nist.gov/publications/fips/
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_sha2.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_SHA2_H__
+#define SPH_SHA2_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/**
+ * Output size (in bits) for SHA-224.
+ */
+#define SPH_SIZE_sha224   224
+
+/**
+ * Output size (in bits) for SHA-256.
+ */
+#define SPH_SIZE_sha256   256
+
+/**
+ * This structure is a context for SHA-224 computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a SHA-224 computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running SHA-224 computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	sph_u32 val[8];
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_sha224_context;
+
+/**
+ * This structure is a context for SHA-256 computations. It is identical
+ * to the SHA-224 context. However, a context is initialized for SHA-224
+ * <strong>or</strong> SHA-256, but not both (the internal IV is not the
+ * same).
+ */
+typedef sph_sha224_context sph_sha256_context;
+
+/**
+ * Initialize a SHA-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHA-224 context (pointer to
+ *             a <code>sph_sha224_context</code>)
+ */
+void sph_sha224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHA-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_sha224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHA-224 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accomodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHA-224 context
+ * @param dst   the destination buffer
+ */
+void sph_sha224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accomodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHA-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Apply the SHA-224 compression function on the provided data. The
+ * <code>msg</code> parameter contains the 16 32-bit input blocks,
+ * as numerical values (hence after the big-endian decoding). The
+ * <code>val</code> parameter contains the 8 32-bit input blocks for
+ * the compression function; the output is written in place in this
+ * array.
+ *
+ * @param msg   the message block (16 values)
+ * @param val   the function 256-bit input and output
+ */
+void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]);
+
+/**
+ * Initialize a SHA-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHA-256 context (pointer to
+ *             a <code>sph_sha256_context</code>)
+ */
+void sph_sha256_init(void *cc);
+
+#ifdef DOXYGEN_IGNORE
+/**
+ * Process some data bytes, for SHA-256. This function is identical to
+ * <code>sha_224()</code>
+ *
+ * @param cc     the SHA-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_sha256(void *cc, const void *data, size_t len);
+#endif
+
+#ifndef DOXYGEN_IGNORE
+#define sph_sha256   sph_sha224
+#endif
+
+/**
+ * Terminate the current SHA-256 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accomodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHA-256 context
+ * @param dst   the destination buffer
+ */
+void sph_sha256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accomodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHA-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef DOXYGEN_IGNORE
+/**
+ * Apply the SHA-256 compression function on the provided data. This
+ * function is identical to <code>sha224_comp()</code>.
+ *
+ * @param msg   the message block (16 values)
+ * @param val   the function 256-bit input and output
+ */
+void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
+#endif
+
+#ifndef DOXYGEN_IGNORE
+#define sph_sha256_comp   sph_sha224_comp
+#endif
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for SHA-384.
+ */
+#define SPH_SIZE_sha384   384
+
+/**
+ * Output size (in bits) for SHA-512.
+ */
+#define SPH_SIZE_sha512   512
+
+/**
+ * This structure is a context for SHA-384 computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a SHA-384 computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running SHA-384 computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	sph_u64 val[8];
+	sph_u64 count;
+#endif
+} sph_sha384_context;
+
+/**
+ * Initialize a SHA-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHA-384 context (pointer to
+ *             a <code>sph_sha384_context</code>)
+ */
+void sph_sha384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHA-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_sha384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHA-384 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accomodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHA-384 context
+ * @param dst   the destination buffer
+ */
+void sph_sha384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accomodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHA-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Apply the SHA-384 compression function on the provided data. The
+ * <code>msg</code> parameter contains the 16 64-bit input blocks,
+ * as numerical values (hence after the big-endian decoding). The
+ * <code>val</code> parameter contains the 8 64-bit input blocks for
+ * the compression function; the output is written in place in this
+ * array.
+ *
+ * @param msg   the message block (16 values)
+ * @param val   the function 512-bit input and output
+ */
+void sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8]);
+
+/**
+ * This structure is a context for SHA-512 computations. It is identical
+ * to the SHA-384 context. However, a context is initialized for SHA-384
+ * <strong>or</strong> SHA-512, but not both (the internal IV is not the
+ * same).
+ */
+typedef sph_sha384_context sph_sha512_context;
+
+/**
+ * Initialize a SHA-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHA-512 context (pointer to
+ *             a <code>sph_sha512_context</code>)
+ */
+void sph_sha512_init(void *cc);
+
+#ifdef DOXYGEN_IGNORE
+/**
+ * Process some data bytes, for SHA-512. This function is identical to
+ * <code>sph_sha384()</code>.
+ *
+ * @param cc     the SHA-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_sha512(void *cc, const void *data, size_t len);
+#endif
+
+#ifndef DOXYGEN_IGNORE
+#define sph_sha512   sph_sha384
+#endif
+
+/**
+ * Terminate the current SHA-512 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accomodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHA-512 context
+ * @param dst   the destination buffer
+ */
+void sph_sha512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accomodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHA-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef DOXYGEN_IGNORE
+/**
+ * Apply the SHA-512 compression function. This function is identical to
+ * <code>sph_sha384_comp()</code>.
+ *
+ * @param msg   the message block (16 values)
+ * @param val   the function 512-bit input and output
+ */
+void sph_sha512_comp(const sph_u64 msg[16], sph_u64 val[8]);
+#endif
+
+#ifndef DOXYGEN_IGNORE
+#define sph_sha512_comp   sph_sha384_comp
+#endif
+
+#endif
+
+#endif
+#ifdef __cplusplus
+}
+#endif
+
--- a/sph/whirlpool.c
+++ b/sph/whirlpool.c
@ -3432,7 +3432,7 @@ ROUND_FUN(whirlpool1, old1)
 * We want big-endian encoding of the message length, over 256 bits. BE64
 * triggers that. However, our block length is 512 bits, not 1024 bits.
 * Internally, our encoding/decoding is little-endian, which is not a
- * problem here since we also deactivate output in sph_x15_helper.c.
+ * problem here since we also deactivate output in md_helper.c.
 */
 #define BE64   1
 #define SVAL   sc->state
@ -3441,19 +3441,19 @@ ROUND_FUN(whirlpool1, old1)

 #define RFUN   whirlpool_round
 #define HASH   whirlpool
-#include "x15_helper.c"
+#include "md_helper.c"
 #undef RFUN
 #undef HASH

 #define RFUN   whirlpool0_round
 #define HASH   whirlpool0
-#include "x15_helper.c"
+#include "md_helper.c"
 #undef RFUN
 #undef HASH

 #define RFUN   whirlpool1_round
 #define HASH   whirlpool1
-#include "x15_helper.c"
+#include "md_helper.c"
 #undef RFUN
 #undef HASH

@ -3477,4 +3477,4 @@ MAKE_CLOSE(whirlpool1)

 #ifdef __cplusplus
 }
-#endif
+#endif
--- a/util.c
+++ b/util.c
@ -1410,5 +1410,9 @@ void print_hash_tests(void)
 	x15hash(&hash[0], &buf[0]);
 	printf("\nX15:     "); print_hash(hash);

+	memset(hash, 0, sizeof hash);
+	x17hash(&hash[0], &buf[0]);
+	printf("\nX17:     "); print_hash(hash);
+
 	printf("\n");
 }
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
@ -2282,7 +2282,7 @@ static uint64_t ROUND_ELT(const uint64_t* __restrict__ sharedMemory, uint64_t in


 __global__
-void whirlpool512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+void oldwhirlpool_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 {
 	__shared__ uint64_t sharedMemory[2048];

@ -2376,7 +2376,7 @@ void whirlpool512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHas
 }

 __global__
-void whirlpool512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+void x15_whirlpool_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint64_t sharedMemory[2048];

@ -2451,7 +2451,7 @@ void whirlpool512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_has
 }

 __global__
-void whirlpool512_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint32_t *resNounce)
+void oldwhirlpool_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint32_t *resNounce)
 {
 	__shared__ uint64_t sharedMemory[2048];

@ -2606,7 +2606,7 @@ extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNou

 	size_t shared_size = 0;

-	whirlpool512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	x15_whirlpool_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);

 	MyStreamSynchronize(NULL, order, thr_id);
 }
@ -2623,7 +2623,7 @@ extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, int threads, uint32_t

 	cudaMemset(d_WNonce[thr_id], 0xff, sizeof(uint32_t));

-	whirlpool512_gpu_finalhash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector,d_WNonce[thr_id]);
+	oldwhirlpool_gpu_finalhash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector,d_WNonce[thr_id]);

 	MyStreamSynchronize(NULL, order, thr_id);
 	cudaMemcpy(d_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
@ -2642,7 +2642,7 @@ void whirlpool512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uin

 	size_t shared_size = 0;

-	whirlpool512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
+	oldwhirlpool_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);

 	MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x17/cuda_x17_haval512.cu
+++ b/x17/cuda_x17_haval512.cu
@ -0,0 +1,403 @@
+/*
+ * Haval-512 for X17
+ *
+ * Built on cbuchner1's implementation, actual hashing code
+ * heavily based on phm's sgminer
+ *
+ */
+
+/*
+ * Haval-512 kernel implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2014  djm34
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   phm <phm@inbox.com>
+ */
+#include <stdio.h>
+#include <memory.h>
+
+#define USE_SHARED 1
+
+#include "cuda_helper.h"
+
+#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))
+
+#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
+
+// in heavy.cu
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
+static __constant__ uint32_t initVector[8];
+
+static const uint32_t c_initVector[8] = {
+	SPH_C32(0x243F6A88),
+	SPH_C32(0x85A308D3),
+	SPH_C32(0x13198A2E),
+	SPH_C32(0x03707344),
+	SPH_C32(0xA4093822),
+	SPH_C32(0x299F31D0),
+	SPH_C32(0x082EFA98),
+	SPH_C32(0xEC4E6C89)
+};
+
+#define PASS1(n, in) { \
+	STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x00000000)); \
+	STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 1], SPH_C32(0x00000000)); \
+	STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[ 2], SPH_C32(0x00000000)); \
+	STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x00000000)); \
+	STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x00000000)); \
+	STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[ 5], SPH_C32(0x00000000)); \
+	STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[ 6], SPH_C32(0x00000000)); \
+	STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[ 7], SPH_C32(0x00000000)); \
+ \
+	STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 8], SPH_C32(0x00000000)); \
+	STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x00000000)); \
+	STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[10], SPH_C32(0x00000000)); \
+	STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[11], SPH_C32(0x00000000)); \
+	STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[12], SPH_C32(0x00000000)); \
+	STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[13], SPH_C32(0x00000000)); \
+	STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[14], SPH_C32(0x00000000)); \
+	STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x00000000)); \
+ \
+	STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[16], SPH_C32(0x00000000)); \
+	STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[17], SPH_C32(0x00000000)); \
+	STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[18], SPH_C32(0x00000000)); \
+	STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[19], SPH_C32(0x00000000)); \
+	STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[20], SPH_C32(0x00000000)); \
+	STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[21], SPH_C32(0x00000000)); \
+	STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[22], SPH_C32(0x00000000)); \
+	STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[23], SPH_C32(0x00000000)); \
+ \
+	STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x00000000)); \
+	STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[25], SPH_C32(0x00000000)); \
+	STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0x00000000)); \
+	STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[27], SPH_C32(0x00000000)); \
+	STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0x00000000)); \
+	STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[29], SPH_C32(0x00000000)); \
+	STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[30], SPH_C32(0x00000000)); \
+	STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[31], SPH_C32(0x00000000)); \
+}
+
+#define PASS2(n, in) { \
+	STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x452821E6)); \
+	STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[14], SPH_C32(0x38D01377)); \
+	STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0xBE5466CF)); \
+	STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[18], SPH_C32(0x34E90C6C)); \
+	STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[11], SPH_C32(0xC0AC29B7)); \
+	STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[28], SPH_C32(0xC97C50DD)); \
+	STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 7], SPH_C32(0x3F84D5B5)); \
+	STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[16], SPH_C32(0xB5470917)); \
+ \
+	STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x9216D5D9)); \
+	STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0x8979FB1B)); \
+	STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[20], SPH_C32(0xD1310BA6)); \
+	STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0x98DFB5AC)); \
+	STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0x2FFD72DB)); \
+	STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xD01ADFB7)); \
+	STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 4], SPH_C32(0xB8E1AFED)); \
+	STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 8], SPH_C32(0x6A267E96)); \
+ \
+	STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[30], SPH_C32(0xBA7C9045)); \
+	STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0xF12C7F99)); \
+	STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x24A19947)); \
+	STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[ 9], SPH_C32(0xB3916CF7)); \
+	STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x0801F2E2)); \
+	STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[24], SPH_C32(0x858EFC16)); \
+	STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[29], SPH_C32(0x636920D8)); \
+	STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 6], SPH_C32(0x71574E69)); \
+ \
+	STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0xA458FEA3)); \
+	STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[12], SPH_C32(0xF4933D7E)); \
+	STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[15], SPH_C32(0x0D95748F)); \
+	STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[13], SPH_C32(0x728EB658)); \
+	STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0x718BCD58)); \
+	STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0x82154AEE)); \
+	STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x7B54A41D)); \
+	STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0xC25A59B5)); \
+}
+
+#define PASS3(n, in) { \
+	STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x9C30D539)); \
+	STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x2AF26013)); \
+	STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 4], SPH_C32(0xC5D1B023)); \
+	STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0x286085F0)); \
+	STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0xCA417918)); \
+	STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[17], SPH_C32(0xB8DB38EF)); \
+	STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 8], SPH_C32(0x8E79DCB0)); \
+	STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[22], SPH_C32(0x603A180E)); \
+ \
+	STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[29], SPH_C32(0x6C9E0E8B)); \
+	STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[14], SPH_C32(0xB01E8A3E)); \
+	STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[25], SPH_C32(0xD71577C1)); \
+	STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[12], SPH_C32(0xBD314B27)); \
+	STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[24], SPH_C32(0x78AF2FDA)); \
+	STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[30], SPH_C32(0x55605C60)); \
+	STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0xE65525F3)); \
+	STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[26], SPH_C32(0xAA55AB94)); \
+ \
+	STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[31], SPH_C32(0x57489862)); \
+	STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[15], SPH_C32(0x63E81440)); \
+	STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 7], SPH_C32(0x55CA396A)); \
+	STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x2AAB10B6)); \
+	STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0xB4CC5C34)); \
+	STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[ 0], SPH_C32(0x1141E8CE)); \
+	STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[18], SPH_C32(0xA15486AF)); \
+	STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0x7C72E993)); \
+ \
+	STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[13], SPH_C32(0xB3EE1411)); \
+	STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x636FBC2A)); \
+	STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x2BA9C55D)); \
+	STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[10], SPH_C32(0x741831F6)); \
+	STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[23], SPH_C32(0xCE5C3E16)); \
+	STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x9B87931E)); \
+	STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 5], SPH_C32(0xAFD6BA33)); \
+	STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[ 2], SPH_C32(0x6C24CF5C)); \
+}
+
+#define PASS4(n, in) { \
+	STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x7A325381)); \
+	STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 4], SPH_C32(0x28958677)); \
+	STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 0], SPH_C32(0x3B8F4898)); \
+	STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[14], SPH_C32(0x6B4BB9AF)); \
+	STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0xC4BFE81B)); \
+	STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[ 7], SPH_C32(0x66282193)); \
+	STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x61D809CC)); \
+	STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[23], SPH_C32(0xFB21A991)); \
+ \
+	STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[26], SPH_C32(0x487CAC60)); \
+	STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x5DEC8032)); \
+	STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[30], SPH_C32(0xEF845D5D)); \
+	STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0xE98575B1)); \
+	STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDC262302)); \
+	STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0xEB651B88)); \
+	STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[19], SPH_C32(0x23893E81)); \
+	STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 3], SPH_C32(0xD396ACC5)); \
+ \
+	STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[22], SPH_C32(0x0F6D6FF3)); \
+	STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[11], SPH_C32(0x83F44239)); \
+	STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[31], SPH_C32(0x2E0B4482)); \
+	STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[21], SPH_C32(0xA4842004)); \
+	STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 8], SPH_C32(0x69C8F04A)); \
+	STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[27], SPH_C32(0x9E1F9B5E)); \
+	STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[12], SPH_C32(0x21C66842)); \
+	STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 9], SPH_C32(0xF6E96C9A)); \
+ \
+	STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[ 1], SPH_C32(0x670C9C61)); \
+	STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[29], SPH_C32(0xABD388F0)); \
+	STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 5], SPH_C32(0x6A51A0D2)); \
+	STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[15], SPH_C32(0xD8542F68)); \
+	STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x960FA728)); \
+	STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xAB5133A3)); \
+	STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0x6EEF0B6C)); \
+	STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[13], SPH_C32(0x137A3BE4)); \
+}
+
+#define PASS5(n, in) { \
+	STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[27], SPH_C32(0xBA3BF050)); \
+	STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0x7EFB2A98)); \
+	STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0xA1F1651D)); \
+	STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[26], SPH_C32(0x39AF0176)); \
+	STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x66CA593E)); \
+	STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x82430E88)); \
+	STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[20], SPH_C32(0x8CEE8619)); \
+	STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[29], SPH_C32(0x456F9FB4)); \
+ \
+	STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x7D84A5C3)); \
+	STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 0], SPH_C32(0x3B8B5EBE)); \
+	STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[12], SPH_C32(0xE06F75D8)); \
+	STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[ 7], SPH_C32(0x85C12073)); \
+	STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[13], SPH_C32(0x401A449F)); \
+	STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 8], SPH_C32(0x56C16AA6)); \
+	STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x4ED3AA62)); \
+	STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[10], SPH_C32(0x363F7706)); \
+ \
+	STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x1BFEDF72)); \
+	STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x429B023D)); \
+	STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[14], SPH_C32(0x37D0D724)); \
+	STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[30], SPH_C32(0xD00A1248)); \
+	STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDB0FEAD3)); \
+	STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 6], SPH_C32(0x49F1C09B)); \
+	STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x075372C9)); \
+	STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[24], SPH_C32(0x80991B7B)); \
+ \
+	STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 2], SPH_C32(0x25D479D8)); \
+	STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0xF6E8DEF7)); \
+	STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[16], SPH_C32(0xE3FE501A)); \
+	STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0xB6794C3B)); \
+	STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x976CE0BD)); \
+	STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 1], SPH_C32(0x04C006BA)); \
+	STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[25], SPH_C32(0xC1A94FB6)); \
+	STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x409F60C4)); \
+}
+
+#define F1(x6, x5, x4, x3, x2, x1, x0) \
+	(((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0))
+
+#define F2(x6, x5, x4, x3, x2, x1, x0) \
+	(((x2) & (((x1) & ~(x3)) ^ ((x4) & (x5)) ^ (x6) ^ (x0))) \
+	^ ((x4) & ((x1) ^ (x5))) ^ ((x3 & (x5)) ^ (x0)))
+
+#define F3(x6, x5, x4, x3, x2, x1, x0) \
+	(((x3) & (((x1) & (x2)) ^ (x6) ^ (x0))) \
+	^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ (x0))
+
+#define F4(x6, x5, x4, x3, x2, x1, x0) \
+	(((x3) & (((x1) & (x2)) ^ ((x4) | (x6)) ^ (x5))) \
+	^ ((x4) & ((~(x2) & (x5)) ^ (x1) ^ (x6) ^ (x0))) \
+	^ ((x2) & (x6)) ^ (x0))
+
+#define F5(x6, x5, x4, x3, x2, x1, x0) \
+	(((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \
+	^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)))
+
+#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x3, x4, x1, x0, x5, x2, x6)
+#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x6, x2, x1, x0, x3, x4, x5)
+#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x2, x6, x0, x4, x3, x1, x5)
+#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \
+	F4(x1, x5, x3, x2, x0, x4, x6)
+#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \
+	F5(x2, x5, x0, x6, x4, x3, x1)
+
+
+#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \
+		uint32_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
+		(x7) = SPH_T32(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \
+			+ (w) + (c)); \
+	}
+
+
+__global__
+void x17_haval256_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
+		int hashPosition = nounce - startNounce;
+		uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition];
+		union {
+			uint8_t h1[64];
+			uint32_t h4[16];
+			uint64_t h8[8];
+		} hash;
+
+		uint32_t u0, u1, u2, u3, u4, u5, u6, u7;
+		uint32_t s0,s1,s2,s3,s4,s5,s6,s7;
+		uint32_t buf[32];
+
+		s0 = initVector[0];
+		s1 = initVector[1];
+		s2 = initVector[2];
+		s3 = initVector[3];
+		s4 = initVector[4];
+		s5 = initVector[5];
+		s6 = initVector[6];
+		s7 = initVector[7];
+
+		u0 = s0;
+		u1 = s1;
+		u2 = s2;
+		u3 = s3;
+		u4 = s4;
+		u5 = s5;
+		u6 = s6;
+		u7 = s7;
+
+		#pragma unroll
+		for (int i=0; i<16; i++) {
+			hash.h4[i]= inpHash[i];
+		}
+
+///////// input big /////////////////////
+
+		#pragma unroll
+		for (int i=0; i<32; i++) {
+			if (i<16) {
+				buf[i]=hash.h4[i];
+			} else {
+				buf[i]=0;
+			}
+		}
+
+		buf[16]=0x00000001;
+		buf[29]=0x40290000;
+		buf[30]=0x00000200;
+
+		PASS1(5, buf);
+		PASS2(5, buf);
+		PASS3(5, buf);
+		PASS4(5, buf);
+		PASS5(5, buf);
+
+		s0 = SPH_T32(s0 + u0);
+		s1 = SPH_T32(s1 + u1);
+		s2 = SPH_T32(s2 + u2);
+		s3 = SPH_T32(s3 + u3);
+		s4 = SPH_T32(s4 + u4);
+		s5 = SPH_T32(s5 + u5);
+		s6 = SPH_T32(s6 + u6);
+		s7 = SPH_T32(s7 + u7);
+
+		hash.h4[0]=s0;
+		hash.h4[1]=s1;
+			  hash.h4[2]=s2;
+		hash.h4[3]=s3;
+			  hash.h4[4]=s4;
+		hash.h4[5]=s5;
+		hash.h4[6]=s6;
+		hash.h4[7]=s7;
+
+		#pragma unroll 16
+		for (int u = 0; u < 16; u ++)
+			inpHash[u] = hash.h4[u];
+	} // threads
+}
+
+__host__
+void x17_haval256_cpu_init(int thr_id, int threads)
+{
+	cudaMemcpyToSymbol(initVector,c_initVector,sizeof(c_initVector),0, cudaMemcpyHostToDevice);
+}
+
+__host__
+void x17_haval256_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+{
+	const int threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
+
+	// berechne wie viele Thread Blocks wir brauchen
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	size_t shared_size = 0;
+
+	x17_haval256_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+
+	MyStreamSynchronize(NULL, order, thr_id);
+}
--- a/x17/cuda_x17_sha512.cu
+++ b/x17/cuda_x17_sha512.cu
@ -0,0 +1,240 @@
+/**
+ * sha512 djm34
+ * (cleaned by tpruvot)
+ */
+
+/*
+ * sha-512 kernel implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2014 djm34
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author phm <phm@inbox.com>
+ */
+#include <stdio.h>
+
+#define USE_SHARED 1
+
+#include "cuda_helper.h"
+
+#define SWAP64(u64) cuda_swab64(u64)
+
+#define SPH_ROTL32(x, n)  SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_ROTR32(x, n)  SPH_ROTL32(x, (32 - (n)))
+
+#define SPH_T32(x)  ((x) & SPH_C32(0xFFFFFFFF))
+#define SPH_T64(x)  ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
+
+// in heavy.cu
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
+static __constant__ uint64_t H_512[8];
+
+static const uint64_t H512[8] = {
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+static __constant__ uint64_t K_512[80];
+
+static const uint64_t K512[80] = {
+	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
+	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
+	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
+	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
+	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
+	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
+	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
+	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
+	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
+	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
+	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
+	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
+	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
+	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
+	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
+	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
+	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
+	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
+	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
+	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
+	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
+	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
+	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
+	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
+	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
+	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
+	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
+	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
+	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
+	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
+	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
+	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
+	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
+	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
+	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
+	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
+	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
+	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
+	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
+	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
+};
+
+
+#define SHA3_STEP(ord,r,i) { \
+		uint64_t T1, T2; \
+		int a = 8-ord; \
+		T1 = SPH_T64(r[(7+a)%8] + BSG5_1(r[(4+a)%8]) + CH(r[(4+a)%8], r[(5+a)%8], r[(6+a)%8]) + K_512[i] + W[i]); \
+		T2 = SPH_T64(BSG5_0(r[(0+a)%8]) + MAJ(r[(0+a)%8], r[(1+a)%8], r[(2+a)%8])); \
+		r[(3+a)%8] = SPH_T64(r[(3+a)%8] + T1); \
+		r[(7+a)%8] = SPH_T64(T1 + T2); \
+	}
+
+#define SHA3_STEP2(truc,ord,r,i) { \
+		uint64_t T1, T2; \
+		int a = 8-ord; \
+		T1 = Tone(truc,r,W,a,i); \
+		T2 = SPH_T64(BSG5_0(r[(0+a)%8]) + MAJ(r[(0+a)%8], r[(1+a)%8], r[(2+a)%8])); \
+		r[(3+a)%8] = SPH_T64(r[(3+a)%8] + T1); \
+		r[(7+a)%8] = SPH_T64(T1 + T2); \
+	}
+//#define BSG5_0(x)      (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
+#define BSG5_0(x)        xor3(ROTR64(x, 28),ROTR64(x, 34),ROTR64(x, 39))
+
+//#define BSG5_1(x)      (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
+#define BSG5_1(x)      xor3(ROTR64(x, 14),ROTR64(x, 18),ROTR64(x, 41))
+
+//#define SSG5_0(x)      (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7))
+#define SSG5_0(x)      xor3(ROTR64(x, 1),ROTR64(x, 8),shr_t64(x,7))
+
+//#define SSG5_1(x)      (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6))
+#define SSG5_1(x)      xor3(ROTR64(x, 19),ROTR64(x, 61),shr_t64(x,6))
+
+//#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
+#define CH(x, y, z)    xandx(x,y,z)
+//#define MAJ(X, Y, Z)   (((X) & (Y)) | (((X) | (Y)) & (Z)))
+#define MAJ(x, y, z)   andor(x,y,z)
+
+__device__ __forceinline__
+uint64_t Tone(const uint64_t* sharedMemory, uint64_t r[8], uint64_t W[80], uint32_t a, uint32_t i)
+{
+	uint64_t h = r[(7+a)%8];
+	uint64_t e = r[(4+a)%8];
+	uint64_t f = r[(5+a)%8];
+	uint64_t g = r[(6+a)%8];
+	//uint64_t BSG51 = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41);
+	uint64_t BSG51 = xor3(ROTR64(e, 14),ROTR64(e, 18),ROTR64(e, 41));
+	//uint64_t CHl     = (((f) ^ (g)) & (e)) ^ (g);
+	uint64_t CHl = xandx(e,f,g);
+	uint64_t result = SPH_T64(h+BSG51+CHl+sharedMemory[i]+W[i]);
+	return result;
+}
+
+__global__
+void x17_sha512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+		int hashPosition = nounce - startNounce;
+		uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition];
+		union {
+			uint8_t h1[64];
+			uint32_t h4[16];
+			uint64_t h8[8];
+		} hash;
+
+		#pragma unroll
+		for (int i=0;i<16;i++) {
+			hash.h4[i]= inpHash[i];
+		}
+		uint64_t W[80];
+		uint64_t r[8];
+
+		#pragma unroll 71
+		for (int i=9;i<80;i++) {
+			W[i]=0;
+		}
+
+		#pragma unroll
+		for (int i = 0; i < 8; i ++) {
+			W[i] = SWAP64(hash.h8[i]);
+			r[i] = H_512[i];
+		}
+
+		W[8] = 0x8000000000000000;
+		W[15]= 0x0000000000000200;
+
+		#pragma unroll 64
+		for (int i = 16; i < 80; i ++)
+			W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7]
+			     + SSG5_0(W[i - 15]) + W[i - 16]);
+
+		#pragma unroll 10
+		for (int i = 0; i < 80; i += 8) {
+			#pragma unroll 8
+			for (int ord=0;ord<8;ord++) {
+				SHA3_STEP2(K_512,ord,r,i+ord);
+			}
+		}
+
+		#pragma unroll 8
+		for (int i = 0; i < 8; i++) {
+			r[i] = SPH_T64(r[i] + H_512[i]);
+		}
+
+		#pragma unroll 8
+		for(int i=0;i<8;i++) {
+			hash.h8[i] = SWAP64(r[i]);
+		}
+
+		#pragma unroll 16
+		for (int u = 0; u < 16; u ++) {
+			inpHash[u] = hash.h4[u];
+		}
+	}
+}
+
+__host__
+void x17_sha512_cpu_init(int thr_id, int threads)
+{
+	cudaMemcpyToSymbol(K_512,K512,80*sizeof(uint64_t),0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(H_512,H512,sizeof(H512),0, cudaMemcpyHostToDevice);
+}
+
+__host__
+void x17_sha512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+{
+	const int threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size =0;
+	x17_sha512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+
+	MyStreamSynchronize(NULL, order, thr_id);
+}
--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -0,0 +1,306 @@
+/*
+ * X17 algorithm built on cbuchner1's original X11
+ *
+ */
+
+extern "C"
+{
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+
+#include "sph/sph_shabal.h"
+#include "sph/sph_whirlpool.h"
+
+#include "sph/sph_sha2.h"
+#include "sph/sph_haval.h"
+
+#include "miner.h"
+}
+
+static uint32_t *d_hash[8];
+
+
+// cpu-miner.c
+extern int device_map[8];
+extern bool opt_benchmark;
+
+
+extern void quark_blake512_cpu_init(int thr_id, int threads);
+extern void quark_blake512_cpu_setBlock_80(void *pdata);
+extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_bmw512_cpu_init(int thr_id, int threads);
+extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_groestl512_cpu_init(int thr_id, int threads);
+extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_skein512_cpu_init(int thr_id, int threads);
+extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_keccak512_cpu_init(int thr_id, int threads);
+extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_jh512_cpu_init(int thr_id, int threads);
+extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x11_luffa512_cpu_init(int thr_id, int threads);
+extern void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x11_cubehash512_cpu_init(int thr_id, int threads);
+extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x11_shavite512_cpu_init(int thr_id, int threads);
+extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x11_echo512_cpu_init(int thr_id, int threads);
+extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x13_hamsi512_cpu_init(int thr_id, int threads);
+extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x13_fugue512_cpu_init(int thr_id, int threads);
+extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x14_shabal512_cpu_init(int thr_id, int threads);
+extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x15_whirlpool_cpu_init(int thr_id, int threads, int flag);
+extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x17_sha512_cpu_init(int thr_id, int threads);
+extern void x17_sha512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x17_haval256_cpu_init(int thr_id, int threads);
+extern void x17_haval256_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
+extern void quark_compactTest_cpu_init(int thr_id, int threads);
+extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
+											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
+											int order);
+
+// X17 Hashfunktion
+extern "C" void x17hash(void *output, const void *input)
+{
+	// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13-shabal14-whirlpool15-sha512-haval17
+
+	sph_blake512_context ctx_blake;
+	sph_bmw512_context ctx_bmw;
+	sph_groestl512_context ctx_groestl;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+	sph_luffa512_context ctx_luffa;
+	sph_cubehash512_context ctx_cubehash;
+	sph_shavite512_context ctx_shavite;
+	sph_simd512_context ctx_simd;
+	sph_echo512_context ctx_echo;
+	sph_hamsi512_context ctx_hamsi;
+	sph_fugue512_context ctx_fugue;
+	sph_shabal512_context ctx_shabal;
+	sph_whirlpool_context ctx_whirlpool;
+	sph_sha512_context ctx_sha512;
+	sph_haval256_5_context ctx_haval;
+
+	unsigned char hash[128]; // uint32_t hashA[16], hashB[16];
+	#define hashB hash+64
+
+	sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, input, 80);
+	sph_blake512_close(&ctx_blake, hash);
+
+	sph_bmw512_init(&ctx_bmw);
+	sph_bmw512(&ctx_bmw, (const void*) hash, 64);
+	sph_bmw512_close(&ctx_bmw, hash);
+
+	sph_groestl512_init(&ctx_groestl);
+	sph_groestl512(&ctx_groestl, (const void*) hash, 64);
+	sph_groestl512_close(&ctx_groestl, hash);
+
+	sph_skein512_init(&ctx_skein);
+	sph_skein512(&ctx_skein, (const void*) hash, 64);
+	sph_skein512_close(&ctx_skein, hash);
+
+	sph_jh512_init(&ctx_jh);
+	sph_jh512(&ctx_jh, (const void*) hash, 64);
+	sph_jh512_close(&ctx_jh, hash);
+
+	sph_keccak512_init(&ctx_keccak);
+	sph_keccak512(&ctx_keccak, (const void*) hash, 64);
+	sph_keccak512_close(&ctx_keccak, hash);
+
+	sph_luffa512_init(&ctx_luffa);
+	sph_luffa512(&ctx_luffa, (const void*) hash, 64);
+	sph_luffa512_close (&ctx_luffa, hash);
+
+	sph_cubehash512_init(&ctx_cubehash);
+	sph_cubehash512(&ctx_cubehash, (const void*) hash, 64);
+	sph_cubehash512_close(&ctx_cubehash, hash);
+
+	sph_shavite512_init(&ctx_shavite);
+	sph_shavite512(&ctx_shavite, (const void*) hash, 64);
+	sph_shavite512_close(&ctx_shavite, hash);
+
+	sph_simd512_init(&ctx_simd);
+	sph_simd512(&ctx_simd, (const void*) hash, 64);
+	sph_simd512_close(&ctx_simd, hash);
+
+	sph_echo512_init(&ctx_echo);
+	sph_echo512(&ctx_echo, (const void*) hash, 64);
+	sph_echo512_close(&ctx_echo, hash);
+
+	sph_hamsi512_init(&ctx_hamsi);
+	sph_hamsi512(&ctx_hamsi, (const void*) hash, 64);
+	sph_hamsi512_close(&ctx_hamsi, hash);
+
+	sph_fugue512_init(&ctx_fugue);
+	sph_fugue512(&ctx_fugue, (const void*) hash, 64);
+	sph_fugue512_close(&ctx_fugue, hash);
+
+	sph_shabal512_init(&ctx_shabal);
+	sph_shabal512(&ctx_shabal, (const void*) hash, 64);
+	sph_shabal512_close(&ctx_shabal, hash);
+
+	sph_whirlpool_init(&ctx_whirlpool);
+	sph_whirlpool (&ctx_whirlpool, (const void*) hash, 64);
+	sph_whirlpool_close(&ctx_whirlpool, hash);
+
+	sph_sha512_init(&ctx_sha512);
+	sph_sha512(&ctx_sha512,(const void*) hash, 64);
+	sph_sha512_close(&ctx_sha512,(void*) hash);
+
+	sph_haval256_5_init(&ctx_haval);
+	sph_haval256_5(&ctx_haval,(const void*) hash, 64);
+	sph_haval256_5_close(&ctx_haval,hash);
+
+	memcpy(output, hash, 32);
+}
+
+
+extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done)
+{
+	const uint32_t first_nonce = pdata[19];
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x0000ff;
+
+	const int throughput = 256*256*8;
+
+		if (opt_benchmark)
+				((uint32_t*)ptarget)[7] = 0x0000ff;
+
+	static bool init[8] = {0,0,0,0,0,0,0,0};
+	uint32_t Htarg = ptarget[7];
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
+
+		quark_blake512_cpu_init(thr_id, throughput);
+		quark_groestl512_cpu_init(thr_id, throughput);
+		quark_skein512_cpu_init(thr_id, throughput);
+		quark_bmw512_cpu_init(thr_id, throughput);
+		quark_keccak512_cpu_init(thr_id, throughput);
+		quark_jh512_cpu_init(thr_id, throughput);
+		x11_luffa512_cpu_init(thr_id, throughput);
+		x11_cubehash512_cpu_init(thr_id, throughput);
+		x11_shavite512_cpu_init(thr_id, throughput);
+		x11_simd512_cpu_init(thr_id, throughput);
+		x11_echo512_cpu_init(thr_id, throughput);
+		x13_hamsi512_cpu_init(thr_id, throughput);
+		x13_fugue512_cpu_init(thr_id, throughput);
+		x14_shabal512_cpu_init(thr_id, throughput);
+		x15_whirlpool_cpu_init(thr_id, throughput, 0);
+		x17_sha512_cpu_init(thr_id, throughput);
+		x17_haval256_cpu_init(thr_id, throughput);
+
+		cuda_check_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	uint32_t endiandata[20];
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+
+	quark_blake512_cpu_setBlock_80((void*)endiandata);
+	cuda_check_cpu_setTarget(ptarget);
+
+	do {
+		int order = 0;
+
+		// Hash with CUDA
+		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		if (foundNonce != 0xffffffff)
+		{
+			uint32_t vhash64[8];
+			be32enc(&endiandata[19], foundNonce);
+			x17hash(vhash64, endiandata);
+
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget))
+			{
+				pdata[19] = foundNonce;
+				*hashes_done = foundNonce - first_nonce + 1;
+				return 1;
+			}
+			else if (vhash64[7] > Htarg) {
+				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
+			}
+			else {
+				applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
+			}
+		}
+
+		pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}