Merge pull request #307 from sgminer-dev/v5_0-x15

V5 0 x15
11 years ago · f3a773f279
29 changed files with 13094 additions and 363 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -62,6 +62,8 @@ sgminer_SOURCES += algorithm/twecoin.c algorithm/twecoin.h
				@@ -62,6 +62,8 @@ sgminer_SOURCES += algorithm/twecoin.c algorithm/twecoin.h
 sgminer_SOURCES += algorithm/marucoin.c algorithm/marucoin.h
 sgminer_SOURCES += algorithm/maxcoin.c algorithm/maxcoin.h
 sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
+sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h
+sgminer_SOURCES += algorithm/x14.c algorithm/x14.h

 bin_SCRIPTS	= $(top_srcdir)/kernel/*.cl

--- a/algorithm.c
+++ b/algorithm.c
@ -26,6 +26,8 @@
				@@ -26,6 +26,8 @@
 #include "algorithm/marucoin.h"
 #include "algorithm/maxcoin.h"
 #include "algorithm/talkcoin.h"
+#include "algorithm/bitblock.h"
+#include "algorithm/x14.h"

 #include "compat.h"

@ -39,6 +41,8 @@ const char *algorithm_type_str[] = {
				@@ -39,6 +41,8 @@ const char *algorithm_type_str[] = {
  "NScrypt",
  "X11",
  "X13",
+  "X14",
+  "X15",
  "Keccak",
  "Quarkcoin",
  "Twecoin",
@ -90,11 +94,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
				@@ -90,11 +94,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
 static void append_hamsi_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
 {
  char buf[255];
-  sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d",
-          opt_hamsi_expand_big);
+  sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d -D SPH_HAMSI_SHORT=%d ",
+          opt_hamsi_expand_big, ((opt_hamsi_short)?1:0));
  strcat(data->compiler_options, buf);

-  sprintf(buf, "big%u", (unsigned int)opt_hamsi_expand_big);
+  sprintf(buf, "big%u%s", (unsigned int)opt_hamsi_expand_big, ((opt_hamsi_short)?"hs":""));
  strcat(data->binary_filename, buf);
 }

@ -197,6 +201,103 @@ static cl_int queue_darkcoin_mod_kernel(struct __clState *clState, struct _dev_b
				@@ -197,6 +201,103 @@ static cl_int queue_darkcoin_mod_kernel(struct __clState *clState, struct _dev_b
  return status;
 }

+static cl_int queue_bitblock_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num;
+  cl_ulong le_target;
+  cl_int status = 0;
+
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  // bmw - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->padbuffer8);
+  // groestl - search2
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // skein - search3
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // jh - search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // keccak - search5
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // luffa - search6
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // cubehash - search7
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shavite - search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // simd - search9
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // echo - search10
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // hamsi - search11
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // fugue - search12
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shabal - search13
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // whirlpool - search14 (final stage: also takes the output buffer and target)
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
+static cl_int queue_bitblockold_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num;
+  cl_ulong le_target;
+  cl_int status = 0;
+
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  // bmw - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->padbuffer8);
+  // groestl - search2
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // skein - search3
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // jh - search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // keccak - search5
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // luffa - search6
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // cubehash - search7
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shavite - search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // simd - search9
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // combined echo, hamsi, fugue - shabal - whirlpool - search10
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
+
 static cl_int queue_marucoin_mod_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
 {
  cl_kernel *kernel;
@ -321,6 +422,100 @@ static cl_int queue_talkcoin_mod_kernel(struct __clState *clState, struct _dev_b
				@@ -321,6 +422,100 @@ static cl_int queue_talkcoin_mod_kernel(struct __clState *clState, struct _dev_b
  return status;
 }

+static cl_int queue_x14_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num;
+  cl_ulong le_target;
+  cl_int status = 0;
+
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  // bmw - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->padbuffer8);
+  // groestl - search2
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // skein - search3
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // jh - search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // keccak - search5
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // luffa - search6
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // cubehash - search7
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shavite - search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // simd - search9
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // echo - search10
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // hamsi - search11
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // fugue - search12
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shabal - search13
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
+static cl_int queue_x14_old_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num;
+  cl_ulong le_target;
+  cl_int status = 0;
+
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  // bmw - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->padbuffer8);
+  // groestl - search2
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // skein - search3
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // jh - search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // keccak - search5
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // luffa - search6
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // cubehash - search7
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shavite - search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // simd - search9
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // combined echo, hamsi, fugue - shabal - search10
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
 typedef struct _algorithm_settings_t {
  const char *name; /* Human-readable identifier */
  algorithm_type_t type; //common algorithm type
@ -379,8 +574,13 @@ static algorithm_settings_t algos[] = {
				@@ -379,8 +574,13 @@ static algorithm_settings_t algos[] = {
  { "marucoin-mod", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_hamsi_compiler_options},
  { "marucoin-modold", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_hamsi_compiler_options},

-  { "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4,  8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL},
+  { "x14", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_hamsi_compiler_options},
+  { "x14old", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_hamsi_compiler_options},

+  { "bitblock", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_hamsi_compiler_options},
+  { "bitblockold", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_hamsi_compiler_options},
+
+  { "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4,  8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL},
  // kernels starting from this will have difficulty calculated by using fuguecoin algorithm
 #define A_FUGUE(a, b) \
    { a, ALGO_FUGUE, 1, 256, 256, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL}
--- a/algorithm.h
+++ b/algorithm.h
@ -16,6 +16,8 @@ typedef enum {
				@@ -16,6 +16,8 @@ typedef enum {
  ALGO_NSCRYPT,
  ALGO_X11,
  ALGO_X13,
+  ALGO_X14,
+  ALGO_X15,
  ALGO_KECCAK,
  ALGO_QUARK,
  ALGO_TWE,
--- a/algorithm/bitblock.c
+++ b/algorithm/bitblock.c
@ -0,0 +1,253 @@
				@@ -0,0 +1,253 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
+#include "sph/sph_whirlpool.h"
+
+/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
+typedef struct {
+    sph_blake512_context    blake1;
+    sph_bmw512_context      bmw1;
+    sph_groestl512_context  groestl1;
+    sph_skein512_context    skein1;
+    sph_jh512_context       jh1;
+    sph_keccak512_context   keccak1;
+    sph_luffa512_context    luffa1;
+    sph_cubehash512_context cubehash1;
+    sph_shavite512_context  shavite1;
+    sph_simd512_context     simd1;
+    sph_echo512_context     echo1;
+    sph_hamsi512_context    hamsi1;
+    sph_fugue512_context    fugue1;
+	sph_shabal512_context	shabal1;
+	sph_whirlpool_context	whilpool1;
+} Xhash_context_holder;
+
+static Xhash_context_holder base_contexts;
+
+
+void init_Bhash_contexts()
+{
+    sph_blake512_init(&base_contexts.blake1);   
+    sph_bmw512_init(&base_contexts.bmw1);   
+    sph_groestl512_init(&base_contexts.groestl1);   
+    sph_skein512_init(&base_contexts.skein1);   
+    sph_jh512_init(&base_contexts.jh1);     
+    sph_keccak512_init(&base_contexts.keccak1); 
+    sph_luffa512_init(&base_contexts.luffa1);
+    sph_cubehash512_init(&base_contexts.cubehash1);
+    sph_shavite512_init(&base_contexts.shavite1);
+    sph_simd512_init(&base_contexts.simd1);
+    sph_echo512_init(&base_contexts.echo1);
+    sph_hamsi512_init(&base_contexts.hamsi1);
+    sph_fugue512_init(&base_contexts.fugue1);
+	sph_shabal512_init(&base_contexts.shabal1);
+	sph_whirlpool_init(&base_contexts.whilpool1);
+}
+
+/*
+ * Convert len 32-bit words from src to big-endian byte order and store them
+ * in dst.  len is the number of uint32_t elements, not a byte count.
+ */
+static inline void
+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+	uint32_t i;
+
+	for (i = 0; i < len; i++)
+		dst[i] = htobe32(src[i]);
+}
+
+
+/*
+ * Hash an 80-byte block header through the chained stages blake, bmw,
+ * groestl, skein, jh, keccak, luffa, cubehash, shavite, simd, echo, hamsi,
+ * fugue, shabal and whirlpool.  Every stage after the first is fed 64 bytes
+ * of the previous stage's output.  The first 32 bytes of the final digest
+ * are copied to state.
+ *
+ * Deliberately not declared `inline`: a plain `inline` definition at file
+ * scope provides no external definition under C99 rules, so any call the
+ * compiler decides not to inline (e.g. at -O0) would fail to link.
+ *
+ * NOTE(review): init_Bhash_contexts() rewrites the file-scope base_contexts
+ * on every call; presumably unsafe if several threads hash concurrently --
+ * confirm against callers.
+ */
+void bitblockhash(void *state, const void *input)
+{
+    Xhash_context_holder ctx;
+    uint32_t hashA[16], hashB[16];
+
+    /* Start every hash from freshly initialised per-stage contexts. */
+    init_Bhash_contexts();
+    memcpy(&ctx, &base_contexts, sizeof(base_contexts));
+
+    /* The two buffers alternate as input and output between stages. */
+    sph_blake512(&ctx.blake1, input, 80);
+    sph_blake512_close(&ctx.blake1, hashA);
+
+    sph_bmw512(&ctx.bmw1, hashA, 64);
+    sph_bmw512_close(&ctx.bmw1, hashB);
+
+    sph_groestl512(&ctx.groestl1, hashB, 64);
+    sph_groestl512_close(&ctx.groestl1, hashA);
+
+    sph_skein512(&ctx.skein1, hashA, 64);
+    sph_skein512_close(&ctx.skein1, hashB);
+
+    sph_jh512(&ctx.jh1, hashB, 64);
+    sph_jh512_close(&ctx.jh1, hashA);
+
+    sph_keccak512(&ctx.keccak1, hashA, 64);
+    sph_keccak512_close(&ctx.keccak1, hashB);
+
+    sph_luffa512(&ctx.luffa1, hashB, 64);
+    sph_luffa512_close(&ctx.luffa1, hashA);
+
+    sph_cubehash512(&ctx.cubehash1, hashA, 64);
+    sph_cubehash512_close(&ctx.cubehash1, hashB);
+
+    sph_shavite512(&ctx.shavite1, hashB, 64);
+    sph_shavite512_close(&ctx.shavite1, hashA);
+
+    sph_simd512(&ctx.simd1, hashA, 64);
+    sph_simd512_close(&ctx.simd1, hashB);
+
+    sph_echo512(&ctx.echo1, hashB, 64);
+    sph_echo512_close(&ctx.echo1, hashA);
+
+    sph_hamsi512(&ctx.hamsi1, hashA, 64);
+    sph_hamsi512_close(&ctx.hamsi1, hashB);
+
+    sph_fugue512(&ctx.fugue1, hashB, 64);
+    sph_fugue512_close(&ctx.fugue1, hashA);
+
+    sph_shabal512(&ctx.shabal1, (const unsigned char*)hashA, 64);
+    sph_shabal512_close(&ctx.shabal1, hashB);
+
+    sph_whirlpool(&ctx.whilpool1, hashB, 64);
+    sph_whirlpool_close(&ctx.whilpool1, hashA);
+
+    /* The whirlpool stage wrote into hashA; keep its first 32 bytes. */
+    memcpy(state, hashA, 32);
+}
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+
+/* Used externally as confirmation of correct OCL code */
+int bitblock_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+	uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+	uint32_t data[20], ohash[8];
+
+	be32enc_vect(data, (const uint32_t *)pdata, 19);
+	data[19] = htobe32(nonce);
+	bitblockhash(ohash, data);
+	tmp_hash7 = be32toh(ohash[7]);
+
+	applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+				(long unsigned int)Htarg,
+				(long unsigned int)diff1targ,
+				(long unsigned int)tmp_hash7);
+	if (tmp_hash7 > diff1targ)
+		return -1;
+	if (tmp_hash7 > Htarg)
+		return 0;
+	return 1;
+}
+
+void bitblock_regenhash(struct work *work)
+{
+        uint32_t data[20];
+        uint32_t *nonce = (uint32_t *)(work->data + 76);
+        uint32_t *ohash = (uint32_t *)(work->hash);
+
+        be32enc_vect(data, (const uint32_t *)work->data, 19);
+        data[19] = htobe32(*nonce);
+        bitblockhash(ohash, data);
+}
+
+static inline void be32enc(void *pp, uint32_t x)
+{
+	uint8_t *p = (uint8_t *)pp;
+	p[3] = x & 0xff;
+	p[2] = (x >> 8) & 0xff;
+	p[1] = (x >> 16) & 0xff;
+	p[0] = (x >> 24) & 0xff;
+}
+
+bool scanhash_bitblock(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+		     unsigned char *pdata, unsigned char __maybe_unused *phash1,
+		     unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+		     uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+	uint32_t *nonce = (uint32_t *)(pdata + 76);
+	uint32_t data[20];
+	uint32_t tmp_hash7;
+	uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+	bool ret = false;
+
+	be32enc_vect(data, (const uint32_t *)pdata, 19);
+
+	while(1) {
+		uint32_t ostate[8];
+		*nonce = ++n;
+		data[19] = (n);
+		bitblockhash(ostate, data);
+		tmp_hash7 = (ostate[7]);
+
+		applog(LOG_INFO, "data7 %08lx",
+					(long unsigned int)data[7]);
+
+		if (unlikely(tmp_hash7 <= Htarg)) {
+			((uint32_t *)pdata)[19] = htobe32(n);
+			*last_nonce = n;
+			ret = true;
+			break;
+		}
+
+		if (unlikely((n >= max_nonce) || thr->work_restart)) {
+			*last_nonce = n;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+
+
--- a/algorithm/bitblock.h
+++ b/algorithm/bitblock.h
@ -0,0 +1,10 @@
				@@ -0,0 +1,10 @@
+#ifndef BITBLOCK_H
+#define BITBLOCK_H
+
+#include "miner.h"
+
+extern int bitblock_test(unsigned char *pdata, const unsigned char *ptarget,
+			uint32_t nonce);
+extern void bitblock_regenhash(struct work *work);
+
+#endif /* BITBLOCK_H */
--- a/algorithm/x14.c
+++ b/algorithm/x14.c
@ -0,0 +1,247 @@
				@@ -0,0 +1,247 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
+
+/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
+typedef struct {
+  sph_blake512_context    blake1;
+  sph_bmw512_context      bmw1;
+  sph_groestl512_context  groestl1;
+  sph_skein512_context    skein1;
+  sph_jh512_context       jh1;
+  sph_keccak512_context   keccak1;
+  sph_luffa512_context    luffa1;
+  sph_cubehash512_context cubehash1;
+  sph_shavite512_context  shavite1;
+  sph_simd512_context     simd1;
+  sph_echo512_context     echo1;
+  sph_hamsi512_context    hamsi1;
+  sph_fugue512_context    fugue1;
+  sph_shabal512_context	shabal1;
+} Xhash_context_holder;
+
+static Xhash_context_holder base_contexts;
+
+void init_X14hash_contexts()
+{
+  sph_blake512_init(&base_contexts.blake1);   
+  sph_bmw512_init(&base_contexts.bmw1);   
+  sph_groestl512_init(&base_contexts.groestl1);   
+  sph_skein512_init(&base_contexts.skein1);   
+  sph_jh512_init(&base_contexts.jh1);     
+  sph_keccak512_init(&base_contexts.keccak1); 
+  sph_luffa512_init(&base_contexts.luffa1);
+  sph_cubehash512_init(&base_contexts.cubehash1);
+  sph_shavite512_init(&base_contexts.shavite1);
+  sph_simd512_init(&base_contexts.simd1);
+  sph_echo512_init(&base_contexts.echo1);
+  sph_hamsi512_init(&base_contexts.hamsi1);
+  sph_fugue512_init(&base_contexts.fugue1);
+  sph_shabal512_init(&base_contexts.shabal1);
+}
+
+/*
+ * Convert len 32-bit words from src to big-endian byte order and store them
+ * in dst.  len is the number of uint32_t elements, not a byte count.
+ */
+static inline void be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+	uint32_t i;
+
+  for (i = 0; i < len; i++)
+		dst[i] = htobe32(src[i]);
+}
+
+
+/*
+ * Hash an 80-byte block header through the chained stages blake, bmw,
+ * groestl, skein, jh, keccak, luffa, cubehash, shavite, simd, echo, hamsi,
+ * fugue and shabal.  Every stage after the first is fed 64 bytes of the
+ * previous stage's output.  The first 32 bytes of the final digest are
+ * copied to state.
+ *
+ * Deliberately not declared `inline`: a plain `inline` definition at file
+ * scope provides no external definition under C99 rules, so any call the
+ * compiler decides not to inline (e.g. at -O0) would fail to link.
+ *
+ * NOTE(review): init_X14hash_contexts() rewrites the file-scope
+ * base_contexts on every call; presumably unsafe if several threads hash
+ * concurrently -- confirm against callers.
+ */
+void x14hash(void *state, const void *input)
+{
+  Xhash_context_holder ctx;
+  uint32_t hashA[16], hashB[16];
+
+  /* Start every hash from freshly initialised per-stage contexts. */
+  init_X14hash_contexts();
+  memcpy(&ctx, &base_contexts, sizeof(base_contexts));
+
+  /* The two buffers alternate as input and output between stages. */
+  sph_blake512(&ctx.blake1, input, 80);
+  sph_blake512_close(&ctx.blake1, hashA);
+
+  sph_bmw512(&ctx.bmw1, hashA, 64);
+  sph_bmw512_close(&ctx.bmw1, hashB);
+
+  sph_groestl512(&ctx.groestl1, hashB, 64);
+  sph_groestl512_close(&ctx.groestl1, hashA);
+
+  sph_skein512(&ctx.skein1, hashA, 64);
+  sph_skein512_close(&ctx.skein1, hashB);
+
+  sph_jh512(&ctx.jh1, hashB, 64);
+  sph_jh512_close(&ctx.jh1, hashA);
+
+  sph_keccak512(&ctx.keccak1, hashA, 64);
+  sph_keccak512_close(&ctx.keccak1, hashB);
+
+  sph_luffa512(&ctx.luffa1, hashB, 64);
+  sph_luffa512_close(&ctx.luffa1, hashA);
+
+  sph_cubehash512(&ctx.cubehash1, hashA, 64);
+  sph_cubehash512_close(&ctx.cubehash1, hashB);
+
+  sph_shavite512(&ctx.shavite1, hashB, 64);
+  sph_shavite512_close(&ctx.shavite1, hashA);
+
+  sph_simd512(&ctx.simd1, hashA, 64);
+  sph_simd512_close(&ctx.simd1, hashB);
+
+  sph_echo512(&ctx.echo1, hashB, 64);
+  sph_echo512_close(&ctx.echo1, hashA);
+
+  sph_hamsi512(&ctx.hamsi1, hashA, 64);
+  sph_hamsi512_close(&ctx.hamsi1, hashB);
+
+  sph_fugue512(&ctx.fugue1, hashB, 64);
+  sph_fugue512_close(&ctx.fugue1, hashA);
+
+  sph_shabal512(&ctx.shabal1, (const unsigned char*)hashA, 64);
+  sph_shabal512_close(&ctx.shabal1, hashB);
+
+  /* The shabal stage wrote into hashB; keep its first 32 bytes. */
+  memcpy(state, hashB, 32);
+}
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+/* Used externally as confirmation of correct OCL code */
+int x14_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+	uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+	uint32_t data[20], ohash[8];
+
+	be32enc_vect(data, (const uint32_t *)pdata, 19);
+	data[19] = htobe32(nonce);
+	x14hash(ohash, data);
+	tmp_hash7 = be32toh(ohash[7]);
+
+	applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+				(long unsigned int)Htarg,
+				(long unsigned int)diff1targ,
+				(long unsigned int)tmp_hash7);
+        
+	if (tmp_hash7 > diff1targ)
+		return -1;
+    
+	if (tmp_hash7 > Htarg)
+		return 0;
+    
+	return 1;
+}
+
+void x14_regenhash(struct work *work)
+{
+  uint32_t data[20];
+  uint32_t *nonce = (uint32_t *)(work->data + 76);
+  uint32_t *ohash = (uint32_t *)(work->hash);
+
+  be32enc_vect(data, (const uint32_t *)work->data, 19);
+  data[19] = htobe32(*nonce);
+  x14hash(ohash, data);
+}
+
+static inline void be32enc(void *pp, uint32_t x)
+{
+	uint8_t *p = (uint8_t *)pp;
+	p[3] = x & 0xff;
+	p[2] = (x >> 8) & 0xff;
+	p[1] = (x >> 16) & 0xff;
+	p[0] = (x >> 24) & 0xff;
+}
+
+bool scanhash_x14(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+		     unsigned char *pdata, unsigned char __maybe_unused *phash1,
+		     unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+		     uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+	uint32_t *nonce = (uint32_t *)(pdata + 76);
+	uint32_t data[20];
+	uint32_t tmp_hash7;
+	uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+	bool ret = false;
+
+	be32enc_vect(data, (const uint32_t *)pdata, 19);
+
+	while(1) 
+  {
+		uint32_t ostate[8];
+		*nonce = ++n;
+		data[19] = (n);
+		x14hash(ostate, data);
+		tmp_hash7 = (ostate[7]);
+
+		applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]);
+
+		if(unlikely(tmp_hash7 <= Htarg)) 
+    {
+			((uint32_t *)pdata)[19] = htobe32(n);
+			*last_nonce = n;
+			ret = true;
+			break;
+		}
+
+		if (unlikely((n >= max_nonce) || thr->work_restart)) 
+    {
+			*last_nonce = n;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+
--- a/algorithm/x14.h
+++ b/algorithm/x14.h
@ -0,0 +1,10 @@
				@@ -0,0 +1,10 @@
+#ifndef X14_H
+#define X14_H
+
+#include "miner.h"
+
+extern int x14_test(unsigned char *pdata, const unsigned char *ptarget,
+			uint32_t nonce);
+extern void x14_regenhash(struct work *work);
+
+#endif /* X14_H */
--- a/kernel/bitblock.cl
+++ b/kernel/bitblock.cl
--- a/kernel/bitblockold.cl
+++ b/kernel/bitblockold.cl
--- a/kernel/darkcoin-mod.cl
+++ b/kernel/darkcoin-mod.cl
@ -95,8 +95,6 @@
				@@ -95,8 +95,6 @@
 #include "shavite.cl"
 #include "simd.cl"
 #include "echo.cl"
-#include "hamsi.cl"
-#include "fugue.cl"

 #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
 #define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
@ -181,7 +179,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
				@@ -181,7 +179,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search1(__global hash_t* hashes)
 {
-  uint gid = get_global_id(0);
+ uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

  // bmw
@ -456,93 +454,69 @@ __kernel void search2(__global hash_t* hashes)
				@@ -456,93 +454,69 @@ __kernel void search2(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  #if !SPH_SMALL_FOOTPRINT_GROESTL
-    __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
-    __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
-  #else
-    __local sph_u64 T0_C[256], T4_C[256];
-  #endif
-  
+  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
+
  int init = get_local_id(0);
  int step = get_local_size(0);
-  
+
  for (int i = init; i < 256; i += step)
  {
-    T0_C[i] = T0[i];
-    T4_C[i] = T4[i];
-    #if !SPH_SMALL_FOOTPRINT_GROESTL
-      T1_C[i] = T1[i];
-      T2_C[i] = T2[i];
-      T3_C[i] = T3[i];
-      T5_C[i] = T5[i];
-      T6_C[i] = T6[i];
-      T7_C[i] = T7[i];
-    #endif
+    T0_L[i] = T0[i];
+    T4_L[i] = T4[i];
+    T1_L[i] = T1[i];
+    T2_L[i] = T2[i];
+    T3_L[i] = T3[i];
+    T5_L[i] = T5[i];
+    T6_L[i] = T6[i];
+    T7_L[i] = T7[i];
  }
-  
-  barrier(CLK_LOCAL_MEM_FENCE);    // groestl
-
-  #define T0 T0_C
-  #define T1 T1_C
-  #define T2 T2_C
-  #define T3 T3_C
-  #define T4 T4_C
-  #define T5 T5_C
-  #define T6 T6_C
-  #define T7 T7_C
-
-  sph_u64 H[16];
-  
-  for (unsigned int u = 0; u < 15; u ++)
-    H[u] = 0;
-    
-  #if USE_LE
-    H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
-  #else
-    H[15] = (sph_u64)512;
-  #endif
+ 
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  #define T0 T0_L
+  #define T1 T1_L
+  #define T2 T2_L
+  #define T3 T3_L
+  #define T4 T4_L
+  #define T5 T5_L
+  #define T6 T6_L
+  #define T7 T7_L
+ 
+  // groestl
+  sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};

  sph_u64 g[16], m[16];
-  m[0] = DEC64E(hash->h8[0]);
-  m[1] = DEC64E(hash->h8[1]);
-  m[2] = DEC64E(hash->h8[2]);
-  m[3] = DEC64E(hash->h8[3]);
-  m[4] = DEC64E(hash->h8[4]);
-  m[5] = DEC64E(hash->h8[5]);
-  m[6] = DEC64E(hash->h8[6]);
-  m[7] = DEC64E(hash->h8[7]);
-  
-  for (unsigned int u = 0; u < 16; u ++)
-      g[u] = m[u] ^ H[u];
-      
-  m[8] = 0x80; g[8] = m[8] ^ H[8];
-  m[9] = 0; g[9] = m[9] ^ H[9];
-  m[10] = 0; g[10] = m[10] ^ H[10];
-  m[11] = 0; g[11] = m[11] ^ H[11];
-  m[12] = 0; g[12] = m[12] ^ H[12];
-  m[13] = 0; g[13] = m[13] ^ H[13];
-  m[14] = 0; g[14] = m[14] ^ H[14];
-  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
+  g[0] = m[0] = DEC64E(hash->h8[0]);
+  g[1] = m[1] = DEC64E(hash->h8[1]);
+  g[2] = m[2] = DEC64E(hash->h8[2]);
+  g[3] = m[3] = DEC64E(hash->h8[3]);
+  g[4] = m[4] = DEC64E(hash->h8[4]);
+  g[5] = m[5] = DEC64E(hash->h8[5]);
+  g[6] = m[6] = DEC64E(hash->h8[6]);
+  g[7] = m[7] = DEC64E(hash->h8[7]);
+  g[8] = m[8] = 0x80;
+  g[9] = m[9] = 0;
+  g[10] = m[10] = 0;
+  g[11] = m[11] = 0;
+  g[12] = m[12] = 0;
+  g[13] = m[13] = 0;
+  g[14] = m[14] = 0;
+  g[15] = 0x102000000000000;
+  m[15] = 0x100000000000000;
  
  PERM_BIG_P(g);
  PERM_BIG_Q(m);
  
-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= g[u] ^ m[u];
-    
  sph_u64 xH[16];
-  
  for (unsigned int u = 0; u < 16; u ++)
-    xH[u] = H[u];
+    xH[u] = H[u] ^= g[u] ^ m[u];
+      
  PERM_BIG_P(xH);
  
-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= xH[u];
-    
-  for (unsigned int u = 0; u < 8; u ++)
-    hash->h8[u] = DEC64E(H[u + 8]);
-    
-  barrier(CLK_GLOBAL_MEM_FENCE);
+  for (unsigned int u = 8; u < 16; u ++)
+    hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
+
+  barrier(CLK_GLOBAL_MEM_FENCE); 
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -863,7 +837,7 @@ __kernel void search8(__global hash_t* hashes)
				@@ -863,7 +837,7 @@ __kernel void search8(__global hash_t* hashes)
  sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
  sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;

-  sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
+  sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;

  rk00 = hash->h4[0];
  rk01 = hash->h4[1];
--- a/kernel/fugue.cl
+++ b/kernel/fugue.cl
@ -30,26 +30,26 @@
				@@ -30,26 +30,26 @@
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */

-__constant static const sph_u32 IV224[] = {
+__constant const sph_u32 IV224[] = {
 	SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
 	SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),
 	SPH_C32(0xbd8d679a)
 };

-__constant static const sph_u32 IV256[] = {
+__constant const sph_u32 IV256[] = {
 	SPH_C32(0xe952bdde), SPH_C32(0x6671135f), SPH_C32(0xe0d4f668),
 	SPH_C32(0xd2b0b594), SPH_C32(0xf96c621d), SPH_C32(0xfbf929de),
 	SPH_C32(0x9149e899), SPH_C32(0x34f8c248)
 };

-__constant static const sph_u32 IV384[] = {
+__constant const sph_u32 IV384[] = {
 	SPH_C32(0xaa61ec0d), SPH_C32(0x31252e1f), SPH_C32(0xa01db4c7),
 	SPH_C32(0x00600985), SPH_C32(0x215ef44a), SPH_C32(0x741b5e9c),
 	SPH_C32(0xfa693e9a), SPH_C32(0x473eb040), SPH_C32(0xe502ae8a),
 	SPH_C32(0xa99c25e0), SPH_C32(0xbc95517c), SPH_C32(0x5c1095a1)
 };

-__constant static const sph_u32 IV512[] = {
+__constant const sph_u32 IV512[] = {
 	SPH_C32(0x8807a57e), SPH_C32(0xe616af75), SPH_C32(0xc5d3e4db),
 	SPH_C32(0xac9ab027), SPH_C32(0xd915f117), SPH_C32(0xb6eecc54),
 	SPH_C32(0x06e8020b), SPH_C32(0x4a92efd1), SPH_C32(0xaac6e2c9),
@ -58,7 +58,7 @@ __constant static const sph_u32 IV512[] = {
				@@ -58,7 +58,7 @@ __constant static const sph_u32 IV512[] = {
 	SPH_C32(0xe13e3567)
 };

-__constant static const sph_u32 mixtab0[] = {
+__constant const sph_u32 mixtab0_c[] = {
 	SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7),
 	SPH_C32(0x7b7b7af7), SPH_C32(0xf2f2e8e5), SPH_C32(0x6b6b0ab7),
 	SPH_C32(0x6f6f16a7), SPH_C32(0xc5c56d39), SPH_C32(0x303090c0),
@ -147,7 +147,7 @@ __constant static const sph_u32 mixtab0[] = {
				@@ -147,7 +147,7 @@ __constant static const sph_u32 mixtab0[] = {
 	SPH_C32(0x16166258)
 };

-__constant static const sph_u32 mixtab1[] = {
+__constant const sph_u32 mixtab1_c[] = {
 	SPH_C32(0x97636332), SPH_C32(0xeb7c7c6f), SPH_C32(0xc777775e),
 	SPH_C32(0xf77b7b7a), SPH_C32(0xe5f2f2e8), SPH_C32(0xb76b6b0a),
 	SPH_C32(0xa76f6f16), SPH_C32(0x39c5c56d), SPH_C32(0xc0303090),
@ -236,7 +236,7 @@ __constant static const sph_u32 mixtab1[] = {
				@@ -236,7 +236,7 @@ __constant static const sph_u32 mixtab1[] = {
 	SPH_C32(0x58161662)
 };

-__constant static const sph_u32 mixtab2[] = {
+__constant const sph_u32 mixtab2_c[] = {
 	SPH_C32(0x32976363), SPH_C32(0x6feb7c7c), SPH_C32(0x5ec77777),
 	SPH_C32(0x7af77b7b), SPH_C32(0xe8e5f2f2), SPH_C32(0x0ab76b6b),
 	SPH_C32(0x16a76f6f), SPH_C32(0x6d39c5c5), SPH_C32(0x90c03030),
@ -325,7 +325,7 @@ __constant static const sph_u32 mixtab2[] = {
				@@ -325,7 +325,7 @@ __constant static const sph_u32 mixtab2[] = {
 	SPH_C32(0x62581616)
 };

-__constant static const sph_u32 mixtab3[] = {
+__constant const sph_u32 mixtab3_c[] = {
 	SPH_C32(0x63329763), SPH_C32(0x7c6feb7c), SPH_C32(0x775ec777),
 	SPH_C32(0x7b7af77b), SPH_C32(0xf2e8e5f2), SPH_C32(0x6b0ab76b),
 	SPH_C32(0x6f16a76f), SPH_C32(0xc56d39c5), SPH_C32(0x3090c030),
--- a/kernel/fuguecoin.cl
+++ b/kernel/fuguecoin.cl
@ -79,6 +79,19 @@ __kernel void search(__global unsigned char* input, volatile __global uint* outp
				@@ -79,6 +79,19 @@ __kernel void search(__global unsigned char* input, volatile __global uint* outp
 {
    uint gid = get_global_id(0);

+    // Stage the four Fugue mix tables from __constant into __local memory.
+    // Each work-item copies a strided slice: it starts at its local id and
+    // advances by the work-group size until all 256 entries are covered.
+    __local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256];
+    int init = get_local_id(0);
+    int step = get_local_size(0);
+    for (int i = init; i < 256; i += step)
+    {
+      mixtab0[i] = mixtab0_c[i];
+      mixtab1[i] = mixtab1_c[i];
+      mixtab2[i] = mixtab2_c[i];
+      mixtab3[i] = mixtab3_c[i];
+    }
+    // The tables were written to __local memory, so the barrier must fence
+    // local memory; a global-memory fence does not order these writes for
+    // the other work-items in the group.
+    barrier(CLK_LOCAL_MEM_FENCE);
+
    sph_u32 S00 = 0, S01 = 0, S02 = 0, S03 = 0, S04 = 0, S05 = 0, S06 = 0, S07 = 0, S08 = 0, S09 = 0; \
    sph_u32 S10 = 0, S11 = 0, S12 = 0, S13 = 0, S14 = 0, S15 = 0, S16 = 0, S17 = 0, S18 = 0, S19 = 0; \
    sph_u32 S20 = 0, S21 = 0, S22 = IV256[0], S23 = IV256[1], S24 = IV256[2], S25 = IV256[3], S26 = IV256[4], S27 = IV256[5], S28 = IV256[6], S29 = IV256[7];
--- a/kernel/hamsi.cl
+++ b/kernel/hamsi.cl
@ -88,22 +88,31 @@
				@@ -88,22 +88,31 @@
 */

 #if !defined SPH_HAMSI_EXPAND_SMALL
-#if SPH_SMALL_FOOTPRINT_HAMSI
-#define SPH_HAMSI_EXPAND_SMALL  4
-#else
-#define SPH_HAMSI_EXPAND_SMALL  8
-#endif
+  #if SPH_SMALL_FOOTPRINT_HAMSI
+    #define SPH_HAMSI_EXPAND_SMALL  4
+  #else
+    #define SPH_HAMSI_EXPAND_SMALL  8
+  #endif
 #endif

 #if !defined SPH_HAMSI_EXPAND_BIG
-#define SPH_HAMSI_EXPAND_BIG    8
+  #define SPH_HAMSI_EXPAND_BIG    8
 #endif

 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif

-#include "hamsi_helper.cl"
+//temp fix for shortened implementation of X15
+#ifdef SPH_HAMSI_SHORT
+  #if SPH_HAMSI_SHORT == 1 && SPH_HAMSI_EXPAND_BIG == 1
+    #include "hamsi_helper_big.cl"
+  #else
+    #include "hamsi_helper.cl"
+  #endif
+#else
+  #include "hamsi_helper.cl"
+#endif

 __constant static const sph_u32 HAMSI_IV224[] = {
 	SPH_C32(0xc3967a67), SPH_C32(0xc3bc6c20), SPH_C32(0x4bc3bcc3),
--- a/kernel/hamsi_helper_big.cl
+++ b/kernel/hamsi_helper_big.cl
@ -0,0 +1,515 @@
				@@ -0,0 +1,515 @@
+/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */
+/*
+ * Helper code for Hamsi (input block expansion). This code is
+ * automatically generated and includes the precomputed table for
+ * expansion code which handles 1 bit at a time.
+ *
+ * This file is included from hamsi.cl, and is not meant to be compiled
+ * independently.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#if SPH_HAMSI_EXPAND_BIG == 1
+
+/* Note: this table lists bits within each byte from least
+   significant to most significant. */
+__constant const sph_u32 T512[64][16] = {
+	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
+	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
+	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
+	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
+	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
+	  SPH_C32(0x9e69af68) },
+	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
+	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
+	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
+	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
+	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
+	  SPH_C32(0x0c26f262) },
+	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
+	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
+	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
+	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
+	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
+	  SPH_C32(0xdc24e61f) },
+	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
+	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
+	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
+	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
+	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
+	  SPH_C32(0x3daac2da) },
+	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
+	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
+	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
+	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
+	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
+	  SPH_C32(0x78cace29) },
+	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
+	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
+	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
+	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
+	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
+	  SPH_C32(0x2dd1f9ab) },
+	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
+	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
+	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
+	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
+	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
+	  SPH_C32(0xbf2c0be2) },
+	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
+	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
+	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
+	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
+	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
+	  SPH_C32(0x32219526) },
+	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
+	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
+	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
+	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
+	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
+	  SPH_C32(0xac8e6c88) },
+	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
+	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
+	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
+	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
+	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
+	  SPH_C32(0x7b1bd6b9) },
+	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
+	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
+	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
+	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
+	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
+	  SPH_C32(0xf746c320) },
+	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
+	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
+	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
+	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
+	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
+	  SPH_C32(0x69505b3a) },
+	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
+	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
+	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
+	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
+	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
+	  SPH_C32(0x8a341574) },
+	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
+	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
+	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
+	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
+	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
+	  SPH_C32(0x450360bf) },
+	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
+	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
+	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
+	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
+	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
+	  SPH_C32(0xf3d45758) },
+	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
+	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
+	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
+	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
+	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
+	  SPH_C32(0x925c44e9) },
+	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
+	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
+	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
+	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
+	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
+	  SPH_C32(0xa123ff9f) },
+	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
+	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
+	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
+	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
+	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
+	  SPH_C32(0x1568ff0f) },
+	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
+	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
+	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
+	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
+	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
+	  SPH_C32(0xc5c1eb3e) },
+	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
+	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
+	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
+	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
+	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
+	  SPH_C32(0x1af21fe1) },
+	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
+	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
+	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
+	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
+	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
+	  SPH_C32(0x857f3c2b) },
+	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
+	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
+	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
+	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
+	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
+	  SPH_C32(0x2ba05a55) },
+	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
+	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
+	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
+	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
+	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
+	  SPH_C32(0xfeabf254) },
+	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
+	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
+	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
+	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
+	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
+	  SPH_C32(0xfe1cdc7f) },
+	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
+	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
+	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
+	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
+	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
+	  SPH_C32(0xb0a51834) },
+	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
+	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
+	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
+	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
+	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
+	  SPH_C32(0xa6b8c28d) },
+	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
+	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
+	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
+	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
+	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
+	  SPH_C32(0x3a4e99d7) },
+	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
+	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
+	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
+	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
+	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
+	  SPH_C32(0xe1844257) },
+	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
+	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
+	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
+	  SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
+	  SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
+	  SPH_C32(0x2c3b504e) },
+	{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
+	  SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
+	  SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
+	  SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
+	  SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
+	  SPH_C32(0x524a0d59) },
+	{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
+	  SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
+	  SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
+	  SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
+	  SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
+	  SPH_C32(0x378dd173) },
+	{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
+	  SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
+	  SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
+	  SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
+	  SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
+	  SPH_C32(0x8b6c72bd) },
+	{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
+	  SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
+	  SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
+	  SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
+	  SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
+	  SPH_C32(0x8e67b7fa) },
+	{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
+	  SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
+	  SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
+	  SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
+	  SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
+	  SPH_C32(0x443d3004) },
+	{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
+	  SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
+	  SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
+	  SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
+	  SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
+	  SPH_C32(0xf4f6ea7b) },
+	{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
+	  SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
+	  SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
+	  SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
+	  SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
+	  SPH_C32(0x979961d0) },
+	{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
+	  SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
+	  SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
+	  SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
+	  SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
+	  SPH_C32(0x98aa496e) },
+	{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
+	  SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
+	  SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
+	  SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
+	  SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
+	  SPH_C32(0x094e3198) },
+	{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
+	  SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
+	  SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
+	  SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
+	  SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
+	  SPH_C32(0xe86cba2e) },
+	{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
+	  SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
+	  SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
+	  SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
+	  SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
+	  SPH_C32(0x4b7eec55) },
+	{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
+	  SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
+	  SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
+	  SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
+	  SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
+	  SPH_C32(0x1e7536a6) },
+	{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
+	  SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
+	  SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
+	  SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
+	  SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
+	  SPH_C32(0x24314f17) },
+	{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
+	  SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
+	  SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
+	  SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
+	  SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
+	  SPH_C32(0x9075b1ce) },
+	{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
+	  SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
+	  SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
+	  SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
+	  SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
+	  SPH_C32(0x9b6ef888) },
+	{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
+	  SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
+	  SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
+	  SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
+	  SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
+	  SPH_C32(0xd8b61463) },
+	{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
+	  SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
+	  SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
+	  SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
+	  SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
+	  SPH_C32(0x3ea660f7) },
+	{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
+	  SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
+	  SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
+	  SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
+	  SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
+	  SPH_C32(0x7f975691) },
+	{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
+	  SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
+	  SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
+	  SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
+	  SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
+	  SPH_C32(0x2c94459e) },
+	{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
+	  SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
+	  SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
+	  SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
+	  SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
+	  SPH_C32(0x56a7b19f) },
+	{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
+	  SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
+	  SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
+	  SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
+	  SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
+	  SPH_C32(0x81fdf908) },
+	{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
+	  SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
+	  SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
+	  SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
+	  SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
+	  SPH_C32(0x5bd61539) },
+	{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
+	  SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
+	  SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
+	  SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
+	  SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
+	  SPH_C32(0x15b961e7) },
+	{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
+	  SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
+	  SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
+	  SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
+	  SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
+	  SPH_C32(0x2a2c18f0) },
+	{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
+	  SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
+	  SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
+	  SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
+	  SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
+	  SPH_C32(0x551e3d6e) },
+	{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
+	  SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
+	  SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
+	  SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
+	  SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
+	  SPH_C32(0x33c5244f) },
+	{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
+	  SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
+	  SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
+	  SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
+	  SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
+	  SPH_C32(0x8a58e6a4) },
+	{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
+	  SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
+	  SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
+	  SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
+	  SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
+	  SPH_C32(0xda878000) },
+	{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
+	  SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
+	  SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
+	  SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
+	  SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
+	  SPH_C32(0x3c5dfffe) },
+	{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
+	  SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
+	  SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
+	  SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
+	  SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
+	  SPH_C32(0x7b1675d7) },
+	{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
+	  SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
+	  SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
+	  SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
+	  SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
+	  SPH_C32(0x2879ebac) },
+	{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
+	  SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
+	  SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
+	  SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
+	  SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
+	  SPH_C32(0xbe0a679e) },
+	{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
+	  SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
+	  SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
+	  SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
+	  SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
+	  SPH_C32(0x30aebcf7) },
+	{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
+	  SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
+	  SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
+	  SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
+	  SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
+	  SPH_C32(0xc7ff60f0) },
+	{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
+	  SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
+	  SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
+	  SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
+	  SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
+	  SPH_C32(0xe7e00a94) }
+};
+
+/*
+ * INPUT_BIG: expand eight input bytes into the sixteen message words
+ * m0..mF, one bit at a time.
+ *
+ * The words are cleared first. Then, for each byte u = 0..7 (read through
+ * buf(u)) and each bit of that byte from least to most significant, the
+ * next 16-word row of T512 is XORed into m0..mF under the mask dm. dm is 0
+ * when the bit is clear and the negation of 1 (all ones, assuming sph_u32
+ * is a 32-bit unsigned type) when it is set, so the table pointer advances
+ * by 16 words per bit regardless of the input and no branch depends on it.
+ *
+ * The expansion site must provide buf() and the variables m0..mF; neither
+ * is defined in this file.
+ */
+#define INPUT_BIG   do { \
+		__constant const sph_u32 *tp = &T512[0][0]; \
+		unsigned u, v; \
+		m0 = 0; \
+		m1 = 0; \
+		m2 = 0; \
+		m3 = 0; \
+		m4 = 0; \
+		m5 = 0; \
+		m6 = 0; \
+		m7 = 0; \
+		m8 = 0; \
+		m9 = 0; \
+		mA = 0; \
+		mB = 0; \
+		mC = 0; \
+		mD = 0; \
+		mE = 0; \
+		mF = 0; \
+		for (u = 0; u < 8; u ++) { \
+			unsigned db = buf(u); \
+			for (v = 0; v < 8; v ++, db >>= 1) { \
+				sph_u32 dm = SPH_T32(-(sph_u32)(db & 1)); \
+				m0 ^= dm & *tp ++; \
+				m1 ^= dm & *tp ++; \
+				m2 ^= dm & *tp ++; \
+				m3 ^= dm & *tp ++; \
+				m4 ^= dm & *tp ++; \
+				m5 ^= dm & *tp ++; \
+				m6 ^= dm & *tp ++; \
+				m7 ^= dm & *tp ++; \
+				m8 ^= dm & *tp ++; \
+				m9 ^= dm & *tp ++; \
+				mA ^= dm & *tp ++; \
+				mB ^= dm & *tp ++; \
+				mC ^= dm & *tp ++; \
+				mD ^= dm & *tp ++; \
+				mE ^= dm & *tp ++; \
+				mF ^= dm & *tp ++; \
+			} \
+		} \
+	} while (0)
+
+/*
+ * INPUT_BIG_LOCAL: expand eight input bytes into the message words m0..mF
+ * one bit at a time, reading the expansion table through a __local pointer
+ * that starts at T512_L[0] rather than from the __constant T512.
+ *
+ * The words are cleared, then for each byte u = 0..7 (read through buf(u))
+ * and each bit from least to most significant, the next 16 table words are
+ * XORed into m0..mF under the mask dm (0 for a clear bit, the negation of 1
+ * for a set bit).
+ *
+ * T512_L is not defined in this file. It is walked here as a flat run of
+ * 64 * 16 sph_u32 words, so it is presumably a __local copy of T512 that
+ * the calling kernel fills before expanding this macro -- TODO confirm at
+ * the call site. buf() and m0..mF must also be in scope there.
+ */
+#define INPUT_BIG_LOCAL   do { \
+		__local sph_u32 *tp = &(T512_L[0]); \
+		unsigned u, v; \
+		m0 = 0; \
+		m1 = 0; \
+		m2 = 0; \
+		m3 = 0; \
+		m4 = 0; \
+		m5 = 0; \
+		m6 = 0; \
+		m7 = 0; \
+		m8 = 0; \
+		m9 = 0; \
+		mA = 0; \
+		mB = 0; \
+		mC = 0; \
+		mD = 0; \
+		mE = 0; \
+		mF = 0; \
+		for (u = 0; u < 8; u ++) { \
+			unsigned db = buf(u); \
+			for (v = 0; v < 8; v ++, db >>= 1) { \
+				sph_u32 dm = SPH_T32(-(sph_u32)(db & 1)); \
+				m0 ^= dm & *tp ++; \
+				m1 ^= dm & *tp ++; \
+				m2 ^= dm & *tp ++; \
+				m3 ^= dm & *tp ++; \
+				m4 ^= dm & *tp ++; \
+				m5 ^= dm & *tp ++; \
+				m6 ^= dm & *tp ++; \
+				m7 ^= dm & *tp ++; \
+				m8 ^= dm & *tp ++; \
+				m9 ^= dm & *tp ++; \
+				mA ^= dm & *tp ++; \
+				mB ^= dm & *tp ++; \
+				mC ^= dm & *tp ++; \
+				mD ^= dm & *tp ++; \
+				mE ^= dm & *tp ++; \
+				mF ^= dm & *tp ++; \
+			} \
+		} \
+	} while (0)
+
+#endif
--- a/kernel/marucoin-mod.cl
+++ b/kernel/marucoin-mod.cl
@ -33,6 +33,8 @@
				@@ -33,6 +33,8 @@
 #ifndef X13MOD_CL
 #define X13MOD_CL

+#define DEBUG(x)
+
 #if __ENDIAN_LITTLE__
  #define SPH_LITTLE_ENDIAN 1
 #else
@ -54,15 +56,15 @@ typedef int sph_s32;
				@@ -54,15 +56,15 @@ typedef int sph_s32;
 #define SPH_64 1
 #define SPH_64_TRUE 1

-#define SPH_C32(x) ((sph_u32)(x ## U))
-#define SPH_T32(x) ((as_uint(x)) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
+#define SPH_C32(x)    ((sph_u32)(x ## U))
+#define SPH_T32(x) (as_uint(x))
+#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
+#define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))

-#define SPH_C64(x) ((sph_u64)(x ## UL))
-#define SPH_T64(x) ((as_ulong(x)) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
-#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
+#define SPH_C64(x)    ((sph_u64)(x ## UL))
+#define SPH_T64(x) (as_ulong(x))
+#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
+#define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))

 #define SPH_ECHO_64 1
 #define SPH_KECCAK_64 1
@ -74,9 +76,9 @@ typedef int sph_s32;
				@@ -74,9 +76,9 @@ typedef int sph_s32;
 #define SPH_SMALL_FOOTPRINT_GROESTL 0
 #define SPH_GROESTL_BIG_ENDIAN 0
 #define SPH_CUBEHASH_UNROLL 0
-#define SPH_KECCAK_UNROLL   0
+#define SPH_KECCAK_UNROLL   1
 #if !defined SPH_HAMSI_EXPAND_BIG
-  #define SPH_HAMSI_EXPAND_BIG 4
+  #define SPH_HAMSI_EXPAND_BIG 1
 #endif

 #include "blake.cl"
@ -121,10 +123,10 @@ typedef union {
				@@ -121,10 +123,10 @@ typedef union {
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search(__global unsigned char* block, __global hash_t* hashes)
 {
-  uint gid = get_global_id(0);
-  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
-  // blake
+    uint gid = get_global_id(0);
+    __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

+  // blake
  sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
  sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
  sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -134,11 +136,12 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
				@@ -134,11 +136,12 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)

  if ((T0 = SPH_T64(T0 + 1024)) < 1024)
    T1 = SPH_T64(T1 + 1);
-    
+
  sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
  sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
  sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
  sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
+  
  M0 = DEC64BE(block + 0);
  M1 = DEC64BE(block + 8);
  M2 = DEC64BE(block + 16);
@ -169,25 +172,25 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
				@@ -169,25 +172,25 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
  hash->h8[6] = H6;
  hash->h8[7] = H7;

-  barrier(CLK_GLOBAL_MEM_FENCE); 
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search1(__global hash_t* hashes)
 {
-  uint gid = get_global_id(0);
+ uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

  // bmw
  sph_u64 BMW_H[16];
-
-  #pragma unroll 16    
+  
+#pragma unroll 16  
  for(unsigned u = 0; u < 16; u++)
    BMW_H[u] = BMW_IV512[u];

  sph_u64 mv[16],q[32];
- 	sph_u64 tmp;
- 
+	sph_u64 tmp;
+  
  mv[0] = SWAP8(hash->h8[0]);
  mv[1] = SWAP8(hash->h8[1]);
  mv[2] = SWAP8(hash->h8[2]);
@ -206,7 +209,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -206,7 +209,7 @@ __kernel void search1(__global hash_t* hashes)
  mv[15] = SPH_C64(512);
  
  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
-  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
@ -216,7 +219,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -216,7 +219,7 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
-  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
@ -226,7 +229,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -226,7 +229,7 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
-  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
@ -237,63 +240,63 @@ __kernel void search1(__global hash_t* hashes)
				@@ -237,63 +240,63 @@ __kernel void search1(__global hash_t* hashes)
  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
- 
-  #pragma unroll 2
+  
+#pragma unroll 2
  for(int i=0;i<2;i++)
  {
    q[i+16] =
-    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
-    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
-    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
-    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3],  4) ^ SPH_ROTL64(q[i+3], 37)) +
-    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
-    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
-    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
-    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7],  4) ^ SPH_ROTL64(q[i+7], 37)) +
-    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
-    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
-    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
-    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11],  4) ^ SPH_ROTL64(q[i+11], 37)) +
-    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
-    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
-    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
-    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15],  4) ^ SPH_ROTL64(q[i+15], 37)) +
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+      (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+      (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+      (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+      (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+      (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+      (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+      (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+      (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+      (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+      (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+      (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+      (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+      (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+      (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+      (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+      (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }
-
-  #pragma unroll 4
+  
+#pragma unroll 4
  for(int i=2;i<6;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }
  
-  #pragma unroll 3
+#pragma unroll 3
  for(int i=6;i<9;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
  }
-  
-  #pragma unroll 4
+
+#pragma unroll 4
  for(int i=9;i<13;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }
-  
-  #pragma unroll 3
+
+#pragma unroll 3
  for(int i=13;i<16;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }
- 
+
  sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
  sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];

@ -315,7 +318,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -315,7 +318,7 @@ __kernel void search1(__global hash_t* hashes)
  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);

-  #pragma unroll 16
+#pragma unroll 16
  for(int i=0;i<16;i++) 
  {
    mv[i] = BMW_H[i];
@ -323,7 +326,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -323,7 +326,7 @@ __kernel void search1(__global hash_t* hashes)
  }

  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
-  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
@ -333,7 +336,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -333,7 +336,7 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
-  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
@ -343,7 +346,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -343,7 +346,7 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
-  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
@ -354,65 +357,66 @@ __kernel void search1(__global hash_t* hashes)
				@@ -354,65 +357,66 @@ __kernel void search1(__global hash_t* hashes)
  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
-   
-  #pragma unroll 2
+ 
+#pragma unroll 2
  for(int i=0;i<2;i++)
  {
    q[i+16] =
-    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
-    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
-    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
-    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3],  4) ^ SPH_ROTL64(q[i+3], 37)) +
-    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
-    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
-    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
-    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7],  4) ^ SPH_ROTL64(q[i+7], 37)) +
-    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
-    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
-    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
-    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11],  4) ^ SPH_ROTL64(q[i+11], 37)) +
-    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
-    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
-    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
-    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15],  4) ^ SPH_ROTL64(q[i+15], 37)) +
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+      (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+      (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+      (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+      (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+      (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+      (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+      (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+      (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+      (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+      (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+      (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+      (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+      (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+      (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+      (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+      (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }
- 
-  #pragma unroll 4
+
+#pragma unroll 4
  for(int i=2;i<6;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }
  
-  #pragma unroll 3
+#pragma unroll 3
  for(int i=6;i<9;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
  }
  
-  #pragma unroll 4
+#pragma unroll 4
  for(int i=9;i<13;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }
  
-  #pragma unroll 3
+#pragma unroll 3
  for(int i=13;i<16;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }
- 
+
  XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
-  XH64 =  XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+  XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+  
  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
@ -439,8 +443,8 @@ __kernel void search1(__global hash_t* hashes)
				@@ -439,8 +443,8 @@ __kernel void search1(__global hash_t* hashes)
  hash->h8[5] = SWAP8(BMW_H[13]);
  hash->h8[6] = SWAP8(BMW_H[14]);
  hash->h8[7] = SWAP8(BMW_H[15]);
- 
-  barrier(CLK_GLOBAL_MEM_FENCE); 
+  
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -449,94 +453,67 @@ __kernel void search2(__global hash_t* hashes)
				@@ -449,94 +453,67 @@ __kernel void search2(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  #if !SPH_SMALL_FOOTPRINT_GROESTL
-    __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
-    __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
-  #else
-    __local sph_u64 T0_C[256], T4_C[256];
-  #endif
+  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];

  int init = get_local_id(0);
  int step = get_local_size(0);

  for (int i = init; i < 256; i += step)
  {
-    T0_C[i] = T0[i];
-    T4_C[i] = T4[i];
-    #if !SPH_SMALL_FOOTPRINT_GROESTL
-      T1_C[i] = T1[i];
-      T2_C[i] = T2[i];
-      T3_C[i] = T3[i];
-      T5_C[i] = T5[i];
-      T6_C[i] = T6[i];
-      T7_C[i] = T7[i];
-    #endif
+    T0_L[i] = T0[i];
+    T4_L[i] = T4[i];
+    T1_L[i] = T1[i];
+    T2_L[i] = T2[i];
+    T3_L[i] = T3[i];
+    T5_L[i] = T5[i];
+    T6_L[i] = T6[i];
+    T7_L[i] = T7[i];
  }
 
-  barrier(CLK_LOCAL_MEM_FENCE); // groestl
-
-  #define T0 T0_C
-  #define T1 T1_C
-  #define T2 T2_C
-  #define T3 T3_C
-  #define T4 T4_C
-  #define T5 T5_C
-  #define T6 T6_C
-  #define T7 T7_C
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  #define T0 T0_L
+  #define T1 T1_L
+  #define T2 T2_L
+  #define T3 T3_L
+  #define T4 T4_L
+  #define T5 T5_L
+  #define T6 T6_L
+  #define T7 T7_L
 
  // groestl
-
-  sph_u64 H[16];
-  
-  for (unsigned int u = 0; u < 15; u ++)
-    H[u] = 0;
-      
-  #if USE_LE
-    H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
-  #else
-    H[15] = (sph_u64)512;
-  #endif
+  sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};

  sph_u64 g[16], m[16];
-  m[0] = DEC64E(hash->h8[0]);
-  m[1] = DEC64E(hash->h8[1]);
-  m[2] = DEC64E(hash->h8[2]);
-  m[3] = DEC64E(hash->h8[3]);
-  m[4] = DEC64E(hash->h8[4]);
-  m[5] = DEC64E(hash->h8[5]);
-  m[6] = DEC64E(hash->h8[6]);
-  m[7] = DEC64E(hash->h8[7]);
-  
-  for (unsigned int u = 0; u < 16; u ++)
-    g[u] = m[u] ^ H[u];
-    
-  m[8] = 0x80; g[8] = m[8] ^ H[8];
-  m[9] = 0; g[9] = m[9] ^ H[9];
-  m[10] = 0; g[10] = m[10] ^ H[10];
-  m[11] = 0; g[11] = m[11] ^ H[11];
-  m[12] = 0; g[12] = m[12] ^ H[12];
-  m[13] = 0; g[13] = m[13] ^ H[13];
-  m[14] = 0; g[14] = m[14] ^ H[14];
-  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
+  g[0] = m[0] = DEC64E(hash->h8[0]);
+  g[1] = m[1] = DEC64E(hash->h8[1]);
+  g[2] = m[2] = DEC64E(hash->h8[2]);
+  g[3] = m[3] = DEC64E(hash->h8[3]);
+  g[4] = m[4] = DEC64E(hash->h8[4]);
+  g[5] = m[5] = DEC64E(hash->h8[5]);
+  g[6] = m[6] = DEC64E(hash->h8[6]);
+  g[7] = m[7] = DEC64E(hash->h8[7]);
+  g[8] = m[8] = 0x80;
+  g[9] = m[9] = 0;
+  g[10] = m[10] = 0;
+  g[11] = m[11] = 0;
+  g[12] = m[12] = 0;
+  g[13] = m[13] = 0;
+  g[14] = m[14] = 0;
+  g[15] = 0x102000000000000;
+  m[15] = 0x100000000000000;
  
  PERM_BIG_P(g);
  PERM_BIG_Q(m);
  
-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= g[u] ^ m[u];
-    
  sph_u64 xH[16];
-  
  for (unsigned int u = 0; u < 16; u ++)
-    xH[u] = H[u];
-    
+    xH[u] = H[u] ^= g[u] ^ m[u];
+      
  PERM_BIG_P(xH);
  
-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= xH[u];
-    
-  for (unsigned int u = 0; u < 8; u ++)
-    hash->h8[u] = DEC64E(H[u + 8]);
+  for (unsigned int u = 8; u < 16; u ++)
+    hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);

  barrier(CLK_GLOBAL_MEM_FENCE); 
 }
@ -561,10 +538,14 @@ __kernel void search3(__global hash_t* hashes)
				@@ -561,10 +538,14 @@ __kernel void search3(__global hash_t* hashes)
  m5 = SWAP8(hash->h8[5]);
  m6 = SWAP8(hash->h8[6]);
  m7 = SWAP8(hash->h8[7]);
+  
  UBI_BIG(480, 64);
+  
  bcount = 0;
  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
+  
  UBI_BIG(510, 8);
+  
  hash->h8[0] = SWAP8(h0);
  hash->h8[1] = SWAP8(h1);
  hash->h8[2] = SWAP8(h2);
@ -574,7 +555,7 @@ __kernel void search3(__global hash_t* hashes)
				@@ -574,7 +555,7 @@ __kernel void search3(__global hash_t* hashes)
  hash->h8[6] = SWAP8(h6);
  hash->h8[7] = SWAP8(h7);

-  barrier(CLK_GLOBAL_MEM_FENCE); 
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -583,7 +564,7 @@ __kernel void search4(__global hash_t* hashes)
				@@ -583,7 +564,7 @@ __kernel void search4(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

- // jh
+  // jh

  sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7);
  sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b);
@ -612,7 +593,7 @@ __kernel void search4(__global hash_t* hashes)
				@@ -612,7 +593,7 @@ __kernel void search4(__global hash_t* hashes)
      h6l ^= DEC64E(hash->h8[5]);
      h7h ^= DEC64E(hash->h8[6]);
      h7l ^= DEC64E(hash->h8[7]);
-  
+
      h0h ^= 0x80;
      h3l ^= 0x2000000000000;
    }
@ -630,7 +611,7 @@ __kernel void search4(__global hash_t* hashes)
				@@ -630,7 +611,7 @@ __kernel void search4(__global hash_t* hashes)
  hash->h8[6] = DEC64E(h7h);
  hash->h8[7] = DEC64E(h7l);

-  barrier(CLK_GLOBAL_MEM_FENCE); 
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -642,7 +623,7 @@ __kernel void search5(__global hash_t* hashes)
				@@ -642,7 +623,7 @@ __kernel void search5(__global hash_t* hashes)
  // keccak

  sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0;
-  sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; 
+  sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0;
  sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0;
  sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0;
  sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0;
@ -664,6 +645,7 @@ __kernel void search5(__global hash_t* hashes)
				@@ -664,6 +645,7 @@ __kernel void search5(__global hash_t* hashes)
  a21 ^= SWAP8(hash->h8[7]);
  a31 ^= 0x8000000000000001;
  KECCAK_F_1600;
+  
  // Finalize the "lane complement"
  a10 = ~a10;
  a20 = ~a20;
@ -677,7 +659,7 @@ __kernel void search5(__global hash_t* hashes)
				@@ -677,7 +659,7 @@ __kernel void search5(__global hash_t* hashes)
  hash->h8[6] = SWAP8(a11);
  hash->h8[7] = SWAP8(a21);

-  barrier(CLK_GLOBAL_MEM_FENCE); 
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -740,7 +722,7 @@ __kernel void search6(__global hash_t* hashes)
				@@ -740,7 +722,7 @@ __kernel void search6(__global hash_t* hashes)
      hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47;
    }
  }
-
+  
  hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40;
  hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41;
  hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42;
@ -750,7 +732,7 @@ __kernel void search6(__global hash_t* hashes)
				@@ -750,7 +732,7 @@ __kernel void search6(__global hash_t* hashes)
  hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46;
  hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47;

-  barrier(CLK_GLOBAL_MEM_FENCE); 
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -794,7 +776,7 @@ __kernel void search7(__global hash_t* hashes)
				@@ -794,7 +776,7 @@ __kernel void search7(__global hash_t* hashes)
      x6 ^= SWAP4(hash->h4[15]);
      x7 ^= SWAP4(hash->h4[14]);
    } 
-    else if(i == 1) 
+    else if(i == 1)
      x0 ^= 0x80;
    else if (i == 2) 
      xv ^= SPH_C32(1);
@ -817,7 +799,7 @@ __kernel void search7(__global hash_t* hashes)
				@@ -817,7 +799,7 @@ __kernel void search7(__global hash_t* hashes)
  hash->h4[14] = xe;
  hash->h4[15] = xf;

-  barrier(CLK_GLOBAL_MEM_FENCE); 
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -825,11 +807,12 @@ __kernel void search8(__global hash_t* hashes)
				@@ -825,11 +807,12 @@ __kernel void search8(__global hash_t* hashes)
 {
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+  
  __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
-
+  
  int init = get_local_id(0);
  int step = get_local_size(0);
-
+  
  for (int i = init; i < 256; i += step)
  {
    AES0[i] = AES0_C[i];
@ -837,7 +820,7 @@ __kernel void search8(__global hash_t* hashes)
				@@ -837,7 +820,7 @@ __kernel void search8(__global hash_t* hashes)
    AES2[i] = AES2_C[i];
    AES3[i] = AES3_C[i];
  }
-
+  
  barrier(CLK_LOCAL_MEM_FENCE);

  // shavite
@ -853,7 +836,7 @@ __kernel void search8(__global hash_t* hashes)
				@@ -853,7 +836,7 @@ __kernel void search8(__global hash_t* hashes)
  sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
  sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;

-  sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
+  sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;

  rk00 = hash->h4[0];
  rk01 = hash->h4[1];
@ -896,7 +879,7 @@ __kernel void search8(__global hash_t* hashes)
				@@ -896,7 +879,7 @@ __kernel void search8(__global hash_t* hashes)
  hash->h4[14] = hE;
  hash->h4[15] = hF;

-  barrier(CLK_GLOBAL_MEM_FENCE); 
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -908,10 +891,8 @@ __kernel void search9(__global hash_t* hashes)
				@@ -908,10 +891,8 @@ __kernel void search9(__global hash_t* hashes)
  // simd
  s32 q[256];
  unsigned char x[128];
-  
  for(unsigned int i = 0; i < 64; i++)
    x[i] = hash->h1[i];
-    
  for(unsigned int i = 64; i < 128; i++)
    x[i] = 0;

@ -921,14 +902,15 @@ __kernel void search9(__global hash_t* hashes)
				@@ -921,14 +902,15 @@ __kernel void search9(__global hash_t* hashes)
  u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);

  FFT256(0, 1, 0, ll1);
-  for (int i = 0; i < 256; i ++) {
-      s32 tq;
-
-      tq = q[i] + yoff_b_n[i];
-      tq = REDS2(tq);
-      tq = REDS1(tq);
-      tq = REDS1(tq);
-      q[i] = (tq <= 128 ? tq : tq - 257);
+  for (int i = 0; i < 256; i ++) 
+  {
+    s32 tq;
+
+    tq = q[i] + yoff_b_n[i];
+    tq = REDS2(tq);
+    tq = REDS1(tq);
+    tq = REDS1(tq);
+    q[i] = (tq <= 128 ? tq : tq - 257);
  }

  A0 ^= hash->h4[0];
@ -954,21 +936,24 @@ __kernel void search9(__global hash_t* hashes)
				@@ -954,21 +936,24 @@ __kernel void search9(__global hash_t* hashes)
  ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);

  STEP_BIG(
-      C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
-      C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
-      IF,  4, 13, PP8_4_);
+    C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
+    C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
+    IF,  4, 13, PP8_4_);
+    
  STEP_BIG(
-      C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
-      C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
-      IF, 13, 10, PP8_5_);
+    C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
+    C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
+    IF, 13, 10, PP8_5_);
+    
  STEP_BIG(
-      C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
-      C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
-      IF, 10, 25, PP8_6_);
+    C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
+    C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
+    IF, 10, 25, PP8_6_);
+    
  STEP_BIG(
-      C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
-      C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
-      IF, 25,  4, PP8_0_);
+    C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
+    C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
+    IF, 25,  4, PP8_0_);

  u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7;
  u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7;
@ -983,22 +968,27 @@ __kernel void search9(__global hash_t* hashes)
				@@ -983,22 +968,27 @@ __kernel void search9(__global hash_t* hashes)
  ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
  ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
  ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
+  
  STEP_BIG(
-      COPY_A0, COPY_A1, COPY_A2, COPY_A3,
-      COPY_A4, COPY_A5, COPY_A6, COPY_A7,
-      IF,  4, 13, PP8_4_);
+    COPY_A0, COPY_A1, COPY_A2, COPY_A3,
+    COPY_A4, COPY_A5, COPY_A6, COPY_A7,
+    IF,  4, 13, PP8_4_);
+    
  STEP_BIG(
-      COPY_B0, COPY_B1, COPY_B2, COPY_B3,
-      COPY_B4, COPY_B5, COPY_B6, COPY_B7,
-      IF, 13, 10, PP8_5_);
+    COPY_B0, COPY_B1, COPY_B2, COPY_B3,
+    COPY_B4, COPY_B5, COPY_B6, COPY_B7,
+    IF, 13, 10, PP8_5_);
+    
  STEP_BIG(
-      COPY_C0, COPY_C1, COPY_C2, COPY_C3,
-      COPY_C4, COPY_C5, COPY_C6, COPY_C7,
-      IF, 10, 25, PP8_6_);
+    COPY_C0, COPY_C1, COPY_C2, COPY_C3,
+    COPY_C4, COPY_C5, COPY_C6, COPY_C7,
+    IF, 10, 25, PP8_6_);
+    
  STEP_BIG(
-      COPY_D0, COPY_D1, COPY_D2, COPY_D3,
-      COPY_D4, COPY_D5, COPY_D6, COPY_D7,
-      IF, 25,  4, PP8_0_);
+    COPY_D0, COPY_D1, COPY_D2, COPY_D3,
+    COPY_D4, COPY_D5, COPY_D6, COPY_D7,
+    IF, 25,  4, PP8_0_);
+    
  #undef q

  hash->h4[0] = A0;
@ -1018,7 +1008,7 @@ __kernel void search9(__global hash_t* hashes)
				@@ -1018,7 +1008,7 @@ __kernel void search9(__global hash_t* hashes)
  hash->h4[14] = B6;
  hash->h4[15] = B7;

-  barrier(CLK_GLOBAL_MEM_FENCE); 
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -1026,8 +1016,7 @@ __kernel void search10(__global hash_t* hashes)
				@@ -1026,8 +1016,7 @@ __kernel void search10(__global hash_t* hashes)
 {
  uint gid = get_global_id(0);
  uint offset = get_global_offset(0);
-  hash_t hash;
-  __global hash_t *hashp = &(hashes[gid-offset]);
+  __global hash_t *hash = &(hashes[gid-offset]);

  __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];

@ -1045,7 +1034,7 @@ __kernel void search10(__global hash_t* hashes)
				@@ -1045,7 +1034,7 @@ __kernel void search10(__global hash_t* hashes)
  barrier(CLK_LOCAL_MEM_FENCE);

  for (int i = 0; i < 8; i++)
-    hash.h8[i] = hashes[gid-offset].h8[i];
+    hash->h8[i] = hashes[gid-offset].h8[i];

  // echo
  sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1;
@ -1074,14 +1063,14 @@ __kernel void search10(__global hash_t* hashes)
				@@ -1074,14 +1063,14 @@ __kernel void search10(__global hash_t* hashes)
  W61 = Vb61;
  W70 = Vb70;
  W71 = Vb71;
-  W80 = hash.h8[0];
-  W81 = hash.h8[1];
-  W90 = hash.h8[2];
-  W91 = hash.h8[3];
-  WA0 = hash.h8[4];
-  WA1 = hash.h8[5];
-  WB0 = hash.h8[6];
-  WB1 = hash.h8[7];
+  W80 = hash->h8[0];
+  W81 = hash->h8[1];
+  W90 = hash->h8[2];
+  W91 = hash->h8[3];
+  WA0 = hash->h8[4];
+  WA1 = hash->h8[5];
+  WB0 = hash->h8[6];
+  WB1 = hash->h8[7];
  WC0 = 0x80;
  WC1 = 0;
  WD0 = 0;
@ -1094,14 +1083,14 @@ __kernel void search10(__global hash_t* hashes)
				@@ -1094,14 +1083,14 @@ __kernel void search10(__global hash_t* hashes)
  for (unsigned u = 0; u < 10; u ++)
    BIG_ROUND;

-  hashp->h8[0] = hash.h8[0] ^ Vb00 ^ W00 ^ W80;
-  hashp->h8[1] = hash.h8[1] ^ Vb01 ^ W01 ^ W81;
-  hashp->h8[2] = hash.h8[2] ^ Vb10 ^ W10 ^ W90;
-  hashp->h8[3] = hash.h8[3] ^ Vb11 ^ W11 ^ W91;
-  hashp->h8[4] = hash.h8[4] ^ Vb20 ^ W20 ^ WA0;
-  hashp->h8[5] = hash.h8[5] ^ Vb21 ^ W21 ^ WA1;
-  hashp->h8[6] = hash.h8[6] ^ Vb30 ^ W30 ^ WB0;
-  hashp->h8[7] = hash.h8[7] ^ Vb31 ^ W31 ^ WB1;
+  hash->h8[0] ^= Vb00 ^ W00 ^ W80;
+  hash->h8[1] ^= Vb01 ^ W01 ^ W81;
+  hash->h8[2] ^= Vb10 ^ W10 ^ W90;
+  hash->h8[3] ^= Vb11 ^ W11 ^ W91;
+  hash->h8[4] ^= Vb20 ^ W20 ^ WA0;
+  hash->h8[5] ^= Vb21 ^ W21 ^ WA1;
+  hash->h8[6] ^= Vb30 ^ W30 ^ WB0;
+  hash->h8[7] ^= Vb31 ^ W31 ^ WB1;

  barrier(CLK_GLOBAL_MEM_FENCE); 
 }
@ -1110,9 +1099,22 @@ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
				@@ -1110,9 +1099,22 @@ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search11(__global hash_t* hashes)
 {
  uint gid = get_global_id(0);
-  uint offset = get_global_offset(0);
-  __global hash_t *hash = &(hashes[gid-offset]);
+  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+  
+  #ifdef INPUT_BIG_LOCAL
+    __local sph_u32 T512_L[1024];
+    __constant const sph_u32 *T512_C = &T512[0][0];
+    
+    int init = get_local_id(0);
+    int step = get_local_size(0);
+    for (int i = init; i < 1024; i += step)
+      T512_L[i] = T512_C[i];

+    barrier(CLK_LOCAL_MEM_FENCE);
+  #else
+    #define INPUT_BIG_LOCAL INPUT_BIG
+  #endif
+  
  sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
  sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
  sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11];
@ -1122,28 +1124,31 @@ __kernel void search11(__global hash_t* hashes)
				@@ -1122,28 +1124,31 @@ __kernel void search11(__global hash_t* hashes)
  sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };

  #define buf(u) hash->h1[i + u]
+  
  for(int i = 0; i < 64; i += 8) 
  {
-    INPUT_BIG;
+    INPUT_BIG_LOCAL;
    P_BIG;
    T_BIG;
  }
-  #undef buf
  
+  #undef buf
  #define buf(u) (u == 0 ? 0x80 : 0)
-  INPUT_BIG;
+  
+  INPUT_BIG_LOCAL;
  P_BIG;
  T_BIG;
-  #undef buf
  
+  #undef buf
  #define buf(u) (u == 6 ? 2 : 0)
-  INPUT_BIG;
+  
+  INPUT_BIG_LOCAL;
  PF_BIG;
  T_BIG;

-  for(unsigned u = 0; u < 16; u ++)
-    hash->h4[u] = h[u];
-
+  for (unsigned u = 0; u < 16; u ++)
+      hash->h4[u] = h[u];
+      
  barrier(CLK_GLOBAL_MEM_FENCE); 
 }

@ -1153,14 +1158,27 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo
				@@ -1153,14 +1158,27 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo
  uint gid = get_global_id(0);
  uint offset = get_global_offset(0);
  __global hash_t *hash = &(hashes[gid-offset]);
-  
+
+  // mixtab: every work-item copies a strided share of the four 256-entry constant tables into __local memory
+  __local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256];
+  int init = get_local_id(0);
+  int step = get_local_size(0);
+  for (int i = init; i < 256; i += step)
+  {
+    mixtab0[i] = mixtab0_c[i];
+    mixtab1[i] = mixtab1_c[i];
+    mixtab2[i] = mixtab2_c[i];
+    mixtab3[i] = mixtab3_c[i];
+  }
+  barrier(CLK_LOCAL_MEM_FENCE); // the tables are __local: the fence must order local-memory writes before any work-item reads them
+    
  // fugue
  sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
  sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
  sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
  sph_u32 S30, S31, S32, S33, S34, S35;
  
-  ulong fc_bit_count = (sph_u64) 64 << 3;
+  ulong fc_bit_count = (sph_u64) 0x200;
  
  S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
  S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
--- a/kernel/marucoin-modold.cl
+++ b/kernel/marucoin-modold.cl
@ -30,8 +30,8 @@
				@@ -30,8 +30,8 @@
 * @author   phm <phm@inbox.com>
 */

-#ifndef X13MOD_CL
-#define X13MOD_CL
+#ifndef MARUCOIN_MOD_CL
+#define MARUCOIN_MOD_CL

 #if __ENDIAN_LITTLE__
 #define SPH_LITTLE_ENDIAN 1
@ -55,12 +55,12 @@ typedef long sph_s64;
				@@ -55,12 +55,12 @@ typedef long sph_s64;
 #define SPH_64_TRUE 1

 #define SPH_C32(x)    ((sph_u32)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_T32(x) (as_uint(x)) /* bit-reinterpret as uint; unlike the removed mask form this does not truncate, so x must already be 32 bits wide */
+#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) /* left-rotate through the OpenCL rotate() built-in; as_uint() requires x and n to be 32-bit values */
 #define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))

 #define SPH_C64(x)    ((sph_u64)(x ## UL))
-#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
+#define SPH_T64(x) (as_ulong(x)) /* bit-reinterpret as ulong; no masking is done, so x must already be 64 bits wide */
 #define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
 #define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))

@ -74,9 +74,9 @@ typedef long sph_s64;
				@@ -74,9 +74,9 @@ typedef long sph_s64;
 #define SPH_SMALL_FOOTPRINT_GROESTL 0
 #define SPH_GROESTL_BIG_ENDIAN 0
 #define SPH_CUBEHASH_UNROLL 0
-#define SPH_KECCAK_UNROLL   0
-#if !defined SPH_HAMSI_EXPAND_BIG
-#define SPH_HAMSI_EXPAND_BIG 4
+#define SPH_KECCAK_UNROLL   1
+#ifndef SPH_HAMSI_EXPAND_BIG /* the host normally supplies this through -D SPH_HAMSI_EXPAND_BIG=<n> */
+  #define SPH_HAMSI_EXPAND_BIG 4 /* fallback used only when the build options do not define it */
 #endif

 #include "blake.cl"
@ -794,6 +794,31 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
				@@ -794,6 +794,31 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
    }

    barrier(CLK_LOCAL_MEM_FENCE);
+	
+    #ifdef INPUT_BIG_LOCAL
+      __local sph_u32 T512_L[1024];
+      __constant const sph_u32 *T512_C = &T512[0][0];
+      
+      for (int i = init; i < 1024; i += step)
+        T512_L[i] = T512_C[i];
+
+      barrier(CLK_LOCAL_MEM_FENCE);
+    #else
+      #define INPUT_BIG_LOCAL INPUT_BIG
+    #endif
+    
+  
+    // mixtab
+    __local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256];
+    for (int i = init; i < 256; i += step)
+    {
+    	mixtab0[i] = mixtab0_c[i];
+    	mixtab1[i] = mixtab1_c[i];
+    	mixtab2[i] = mixtab2_c[i];
+    	mixtab3[i] = mixtab3_c[i];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+	

    for (int i = 0; i < 8; i++) {
        hash.h8[i] = hashes[gid-offset].h8[i];
@ -875,18 +900,18 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
				@@ -875,18 +900,18 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo

 #define buf(u) hash.h1[i + u]
    for(int i = 0; i < 64; i += 8) {
-        INPUT_BIG;
+        INPUT_BIG_LOCAL;
        P_BIG;
        T_BIG;
    }
 #undef buf
 #define buf(u) (u == 0 ? 0x80 : 0)
-    INPUT_BIG;
+    INPUT_BIG_LOCAL;
    P_BIG;
    T_BIG;
 #undef buf
 #define buf(u) (u == 6 ? 2 : 0)
-    INPUT_BIG;
+    INPUT_BIG_LOCAL;
    PF_BIG;
    T_BIG;

@ -976,7 +1001,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
				@@ -976,7 +1001,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
    hash.h4[15] = SWAP4(S30);

    }
-
+	
    bool result = (hash.h8[3] <= target);
    if (result)
      output[atomic_inc(output+0xFF)] = SWAP4(gid);
@ -984,4 +1009,4 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
				@@ -984,4 +1009,4 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
    barrier(CLK_GLOBAL_MEM_FENCE); 
 }

-#endif // X13MOD_CL
+#endif // MARUCOIN_MOD_CL
--- a/kernel/shabal.cl
+++ b/kernel/shabal.cl
@ -0,0 +1,350 @@
				@@ -0,0 +1,350 @@
+/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
+/*
+ * Shabal implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+/*
+ * Part of this code was automatically generated (the part between
+ * the "BEGIN" and "END" markers).
+ */
+
+#define sM    16 /* NOTE(review): presumably the number of message words M0..MF; not referenced by the macros in this file -- confirm */
+
+#define C32   SPH_C32 /* short alias used by the constant tables below */
+#define T32   SPH_T32 /* short alias used by the state-update macros below */
+
+#define O1   13 /* O1/O2/O3 equal the B index offsets hard-coded in the PERM_STEP_* rows (B[i+13], B[i+9], B[i+6]); the names themselves are not referenced below */
+#define O2    9
+#define O3    6
+
+/*
+ * We copy the state into local variables, so that the compiler knows
+ * that it can optimize them at will.
+ */
+
+/* BEGIN -- automatically generated code. */
+
+#define INPUT_BLOCK_ADD   do { /* Bi = T32(Bi + Mi) for each of the sixteen words 0..F; expects B0..BF and M0..MF in scope */ \
+		B0 = T32(B0 + M0); \
+		B1 = T32(B1 + M1); \
+		B2 = T32(B2 + M2); \
+		B3 = T32(B3 + M3); \
+		B4 = T32(B4 + M4); \
+		B5 = T32(B5 + M5); \
+		B6 = T32(B6 + M6); \
+		B7 = T32(B7 + M7); \
+		B8 = T32(B8 + M8); \
+		B9 = T32(B9 + M9); \
+		BA = T32(BA + MA); \
+		BB = T32(BB + MB); \
+		BC = T32(BC + MC); \
+		BD = T32(BD + MD); \
+		BE = T32(BE + ME); \
+		BF = T32(BF + MF); \
+	} while (0)
+
+#define INPUT_BLOCK_SUB   do { /* Ci = T32(Ci - Mi) for each of the sixteen words 0..F; expects C0..CF and M0..MF in scope */ \
+		C0 = T32(C0 - M0); \
+		C1 = T32(C1 - M1); \
+		C2 = T32(C2 - M2); \
+		C3 = T32(C3 - M3); \
+		C4 = T32(C4 - M4); \
+		C5 = T32(C5 - M5); \
+		C6 = T32(C6 - M6); \
+		C7 = T32(C7 - M7); \
+		C8 = T32(C8 - M8); \
+		C9 = T32(C9 - M9); \
+		CA = T32(CA - MA); \
+		CB = T32(CB - MB); \
+		CC = T32(CC - MC); \
+		CD = T32(CD - MD); \
+		CE = T32(CE - ME); \
+		CF = T32(CF - MF); \
+	} while (0)
+
+#define XOR_W   do { /* fold the two counter words into the first two A words */ \
+		A00 ^= Wlow; \
+		A01 ^= Whigh; \
+	} while (0)
+
+#define SWAP(v1, v2)   do { /* exchange two sph_u32 lvalues through a temporary; each argument is evaluated twice */ \
+		sph_u32 tmp = (v1); \
+		(v1) = (v2); \
+		(v2) = tmp; \
+	} while (0)
+
+#define SWAP_BC   do { /* exchange the whole B register set with the C register set, word by word */ \
+		SWAP(B0, C0); \
+		SWAP(B1, C1); \
+		SWAP(B2, C2); \
+		SWAP(B3, C3); \
+		SWAP(B4, C4); \
+		SWAP(B5, C5); \
+		SWAP(B6, C6); \
+		SWAP(B7, C7); \
+		SWAP(B8, C8); \
+		SWAP(B9, C9); \
+		SWAP(BA, CA); \
+		SWAP(BB, CB); \
+		SWAP(BC, CC); \
+		SWAP(BD, CD); \
+		SWAP(BE, CE); \
+		SWAP(BF, CF); \
+	} while (0)
+
+#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm)   do { /* one permutation element: rewrites xa0, then rewrites xb0 from the new xa0; arguments are unparenthesized, so pass plain variables only */ \
+		xa0 = T32((xa0 \
+			^ (((xa1 << 15) | (xa1 >> 17)) * 5U) /* xa1 rotated left by 15 (for 32-bit unsigned operands), times 5 */ \
+			^ xc) * 3U) \
+			^ xb1 ^ (xb2 & ~xb3) ^ xm; \
+		xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); /* xb0 rotated left by 1, XORed with the new xa0, then complemented */ \
+	} while (0)
+
+#define PERM_STEP_0   do { /* 16 PERM_ELT calls: B and M indices run 0..F, the C index runs downward from C8 with wrap-around, the A index starts at A00 and cycles through the 12 A words */ \
+		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define PERM_STEP_1   do { /* same B/C/M index pattern as PERM_STEP_0; the A index continues the 12-word cycle from A04, where PERM_STEP_0 stopped */ \
+		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define PERM_STEP_2   do { /* same B/C/M index pattern as PERM_STEP_0; the A index continues from A08 and ends on A0B, completing the fourth pass over the 12 A words */ \
+		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define APPLY_P   do { /* full permutation: rotate every B word left by 17, run the three PERM_STEP_* passes, then add C words into the A words */ \
+		B0 = T32(B0 << 17) | (B0 >> 15); \
+		B1 = T32(B1 << 17) | (B1 >> 15); \
+		B2 = T32(B2 << 17) | (B2 >> 15); \
+		B3 = T32(B3 << 17) | (B3 >> 15); \
+		B4 = T32(B4 << 17) | (B4 >> 15); \
+		B5 = T32(B5 << 17) | (B5 >> 15); \
+		B6 = T32(B6 << 17) | (B6 >> 15); \
+		B7 = T32(B7 << 17) | (B7 >> 15); \
+		B8 = T32(B8 << 17) | (B8 >> 15); \
+		B9 = T32(B9 << 17) | (B9 >> 15); \
+		BA = T32(BA << 17) | (BA >> 15); \
+		BB = T32(BB << 17) | (BB >> 15); \
+		BC = T32(BC << 17) | (BC >> 15); \
+		BD = T32(BD << 17) | (BD >> 15); \
+		BE = T32(BE << 17) | (BE >> 15); \
+		BF = T32(BF << 17) | (BF >> 15); \
+		PERM_STEP_0; \
+		PERM_STEP_1; \
+		PERM_STEP_2; \
+		A0B = T32(A0B + C6); /* 36 additions follow: each of the 12 A words receives three C words, with the C index stepping downward and wrapping */ \
+		A0A = T32(A0A + C5); \
+		A09 = T32(A09 + C4); \
+		A08 = T32(A08 + C3); \
+		A07 = T32(A07 + C2); \
+		A06 = T32(A06 + C1); \
+		A05 = T32(A05 + C0); \
+		A04 = T32(A04 + CF); \
+		A03 = T32(A03 + CE); \
+		A02 = T32(A02 + CD); \
+		A01 = T32(A01 + CC); \
+		A00 = T32(A00 + CB); \
+		A0B = T32(A0B + CA); \
+		A0A = T32(A0A + C9); \
+		A09 = T32(A09 + C8); \
+		A08 = T32(A08 + C7); \
+		A07 = T32(A07 + C6); \
+		A06 = T32(A06 + C5); \
+		A05 = T32(A05 + C4); \
+		A04 = T32(A04 + C3); \
+		A03 = T32(A03 + C2); \
+		A02 = T32(A02 + C1); \
+		A01 = T32(A01 + C0); \
+		A00 = T32(A00 + CF); \
+		A0B = T32(A0B + CE); \
+		A0A = T32(A0A + CD); \
+		A09 = T32(A09 + CC); \
+		A08 = T32(A08 + CB); \
+		A07 = T32(A07 + CA); \
+		A06 = T32(A06 + C9); \
+		A05 = T32(A05 + C8); \
+		A04 = T32(A04 + C7); \
+		A03 = T32(A03 + C6); \
+		A02 = T32(A02 + C5); \
+		A01 = T32(A01 + C4); \
+		A00 = T32(A00 + C3); \
+	} while (0)
+
+#define INCR_W   do { /* advance the two-word counter: bump Wlow and carry into Whigh when Wlow wraps to 0 */ \
+		if ((Wlow = T32(Wlow + 1)) == 0) \
+			Whigh = T32(Whigh + 1); \
+	} while (0)
+
+__constant static const sph_u32 A_init_192[] = { /* 12 words, one per A00..A0B; NOTE(review): presumably the initial state for the 192-bit variant -- the consumer is not visible here, confirm */
+	C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E),
+	C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465),
+	C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9)
+};
+
+__constant static const sph_u32 B_init_192[] = { /* 16 words, one per B0..BF */
+	C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824),
+	C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7),
+	C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319),
+	C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C)
+};
+
+__constant static const sph_u32 C_init_192[] = { /* 16 words, one per C0..CF */
+	C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B),
+	C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640),
+	C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3),
+	C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669)
+};
+
+__constant static const sph_u32 A_init_224[] = { /* 12 words, one per A00..A0B; NOTE(review): presumably the initial state for the 224-bit variant -- the consumer is not visible here, confirm */
+	C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B),
+	C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F),
+	C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061)
+};
+
+__constant static const sph_u32 B_init_224[] = { /* 16 words, one per B0..BF */
+	C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498),
+	C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5),
+	C32(0xFEEE44B6), C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0),
+	C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C)
+};
+
+__constant static const sph_u32 C_init_224[] = { /* 16 words, one per C0..CF */
+	C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD),
+	C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18),
+	C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2),
+	C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83)
+};
+
+__constant static const sph_u32 A_init_256[] = { /* 12 words, one per A00..A0B; NOTE(review): presumably the initial state for the 256-bit variant -- the consumer is not visible here, confirm */
+	C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
+	C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
+	C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
+};
+
+__constant static const sph_u32 B_init_256[] = { /* 16 words, one per B0..BF */
+	C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
+	C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
+	C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
+	C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
+};
+
+__constant static const sph_u32 C_init_256[] = { /* 16 words, one per C0..CF */
+	C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
+	C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
+	C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
+	C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
+};
+
+__constant static const sph_u32 A_init_384[] = { /* 12 words, one per A00..A0B; NOTE(review): presumably the initial state for the 384-bit variant -- the consumer is not visible here, confirm */
+	C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83),
+	C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF),
+	C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D)
+};
+
+__constant static const sph_u32 B_init_384[] = { /* 16 words, one per B0..BF */
+	C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F),
+	C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641),
+	C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8),
+	C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36)
+};
+
+__constant static const sph_u32 C_init_384[] = { /* 16 words, one per C0..CF */
+	C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399),
+	C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261),
+	C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C),
+	C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70)
+};
+
+__constant static const sph_u32 A_init_512[] = { /* 12 words, one per A00..A0B; NOTE(review): presumably the initial state for the 512-bit variant -- the consumer is not visible here, confirm */
+	C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
+	C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
+	C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
+};
+
+__constant static const sph_u32 B_init_512[] = { /* 16 words, one per B0..BF */
+	C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
+	C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
+	C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
+	C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
+};
+
+__constant static const sph_u32 C_init_512[] = { /* 16 words, one per C0..CF */
+	C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
+	C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
+	C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
+	C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
+};
+
+/* END -- automatically generated code. */
--- a/kernel/whirlpool.cl
+++ b/kernel/whirlpool.cl
--- a/kernel/x14.cl
+++ b/kernel/x14.cl
--- a/kernel/x14old.cl
+++ b/kernel/x14old.cl
--- a/miner.h
+++ b/miner.h
@ -1033,6 +1033,7 @@ extern int swork_id;
				@@ -1033,6 +1033,7 @@ extern int swork_id;
 extern int opt_tcp_keepalive;
 extern bool opt_incognito;
 extern int opt_hamsi_expand_big;
+extern bool opt_hamsi_short;

 #if LOCK_TRACKING
 extern pthread_mutex_t lockstat_lock;
--- a/sgminer.c
+++ b/sgminer.c
@ -192,6 +192,7 @@ int nDevs;
				@@ -192,6 +192,7 @@ int nDevs;
 int opt_dynamic_interval = 7;
 int opt_g_threads = -1;
 int opt_hamsi_expand_big = 4;
+bool opt_hamsi_short = false; /* set by --hamsi-short; forwarded to the kernel build as -D SPH_HAMSI_SHORT=0/1 */
 bool opt_restart = true;

 struct list_head scan_devices;
@ -1459,7 +1460,10 @@ struct opt_table opt_config_table[] = {
				@@ -1459,7 +1460,10 @@ struct opt_table opt_config_table[] = {
      "Set GPU lookup gap for scrypt mining, comma separated"),
  OPT_WITH_ARG("--hamsi-expand-big",
      set_int_1_to_10, opt_show_intval, &opt_hamsi_expand_big,
-      "Set SPH_HAMSI_EXPAND_BIG for X13 algorithms (1 or 4 are common)"),
+      "Set SPH_HAMSI_EXPAND_BIG for X13 derived algorithms (1 or 4 are common)"),
+  OPT_WITHOUT_ARG("--hamsi-short",
+      opt_set_bool, &opt_hamsi_short,
+      "Set SPH_HAMSI_SHORT for X13 derived algorithms (Can give better hashrate for some GPUs)"),
 #ifdef HAVE_CURSES
  OPT_WITHOUT_ARG("--incognito",
      opt_set_bool, &opt_incognito,
--- a/sph/Makefile.am
+++ b/sph/Makefile.am
@ -1,3 +1,3 @@
				@@ -1,3 +1,3 @@
 noinst_LIBRARIES	= libsph.a

-libsph_a_SOURCES	= bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c
+libsph_a_SOURCES	= bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c shabal.c whirlpool.c
--- a/sph/shabal.c
+++ b/sph/shabal.c
@ -0,0 +1,806 @@
				@@ -0,0 +1,806 @@
+/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
+/*
+ * Shabal implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_shabal.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * Part of this code was automatically generated (the part between
+ * the "BEGIN" and "END" markers).
+ */
+
+#define sM    16 /* NOTE(review): presumably the number of message words M0..MF -- its use is outside the visible part of this file, confirm */
+
+#define C32   SPH_C32 /* short alias for SPH_C32 */
+#define T32   SPH_T32 /* short alias for SPH_T32, used by the state-update macros below */
+
+#define O1   13 /* NOTE(review): O1/O2/O3 are presumably the B index offsets of the permutation -- their use is outside the visible part of this file, confirm */
+#define O2    9
+#define O3    6
+
+/*
+ * We copy the state into local variables, so that the compiler knows
+ * that it can optimize them at will.
+ */
+
+/* BEGIN -- automatically generated code. */
+
+#define DECL_STATE   /* declares the working copy of the state: 12 A words, 16 B, 16 C, 16 message words and the two counter words; expands with a trailing semicolon */ \
+	sph_u32 A00, A01, A02, A03, A04, A05, A06, A07, \
+	        A08, A09, A0A, A0B; \
+	sph_u32 B0, B1, B2, B3, B4, B5, B6, B7, \
+	        B8, B9, BA, BB, BC, BD, BE, BF; \
+	sph_u32 C0, C1, C2, C3, C4, C5, C6, C7, \
+	        C8, C9, CA, CB, CC, CD, CE, CF; \
+	sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, \
+	        M8, M9, MA, MB, MC, MD, ME, MF; \
+	sph_u32 Wlow, Whigh;
+
+#define READ_STATE(state)   do { /* load (state)->A[0..11], B[0..15], C[0..15], Wlow and Whigh into the locals from DECL_STATE; the M words are not touched */ \
+		A00 = (state)->A[0]; \
+		A01 = (state)->A[1]; \
+		A02 = (state)->A[2]; \
+		A03 = (state)->A[3]; \
+		A04 = (state)->A[4]; \
+		A05 = (state)->A[5]; \
+		A06 = (state)->A[6]; \
+		A07 = (state)->A[7]; \
+		A08 = (state)->A[8]; \
+		A09 = (state)->A[9]; \
+		A0A = (state)->A[10]; \
+		A0B = (state)->A[11]; \
+		B0 = (state)->B[0]; \
+		B1 = (state)->B[1]; \
+		B2 = (state)->B[2]; \
+		B3 = (state)->B[3]; \
+		B4 = (state)->B[4]; \
+		B5 = (state)->B[5]; \
+		B6 = (state)->B[6]; \
+		B7 = (state)->B[7]; \
+		B8 = (state)->B[8]; \
+		B9 = (state)->B[9]; \
+		BA = (state)->B[10]; \
+		BB = (state)->B[11]; \
+		BC = (state)->B[12]; \
+		BD = (state)->B[13]; \
+		BE = (state)->B[14]; \
+		BF = (state)->B[15]; \
+		C0 = (state)->C[0]; \
+		C1 = (state)->C[1]; \
+		C2 = (state)->C[2]; \
+		C3 = (state)->C[3]; \
+		C4 = (state)->C[4]; \
+		C5 = (state)->C[5]; \
+		C6 = (state)->C[6]; \
+		C7 = (state)->C[7]; \
+		C8 = (state)->C[8]; \
+		C9 = (state)->C[9]; \
+		CA = (state)->C[10]; \
+		CB = (state)->C[11]; \
+		CC = (state)->C[12]; \
+		CD = (state)->C[13]; \
+		CE = (state)->C[14]; \
+		CF = (state)->C[15]; \
+		Wlow = (state)->Wlow; \
+		Whigh = (state)->Whigh; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { /* inverse of READ_STATE: store the local A, B, C words and the Wlow/Whigh counter back into *(state) */ \
+		(state)->A[0] = A00; \
+		(state)->A[1] = A01; \
+		(state)->A[2] = A02; \
+		(state)->A[3] = A03; \
+		(state)->A[4] = A04; \
+		(state)->A[5] = A05; \
+		(state)->A[6] = A06; \
+		(state)->A[7] = A07; \
+		(state)->A[8] = A08; \
+		(state)->A[9] = A09; \
+		(state)->A[10] = A0A; \
+		(state)->A[11] = A0B; \
+		(state)->B[0] = B0; \
+		(state)->B[1] = B1; \
+		(state)->B[2] = B2; \
+		(state)->B[3] = B3; \
+		(state)->B[4] = B4; \
+		(state)->B[5] = B5; \
+		(state)->B[6] = B6; \
+		(state)->B[7] = B7; \
+		(state)->B[8] = B8; \
+		(state)->B[9] = B9; \
+		(state)->B[10] = BA; \
+		(state)->B[11] = BB; \
+		(state)->B[12] = BC; \
+		(state)->B[13] = BD; \
+		(state)->B[14] = BE; \
+		(state)->B[15] = BF; \
+		(state)->C[0] = C0; \
+		(state)->C[1] = C1; \
+		(state)->C[2] = C2; \
+		(state)->C[3] = C3; \
+		(state)->C[4] = C4; \
+		(state)->C[5] = C5; \
+		(state)->C[6] = C6; \
+		(state)->C[7] = C7; \
+		(state)->C[8] = C8; \
+		(state)->C[9] = C9; \
+		(state)->C[10] = CA; \
+		(state)->C[11] = CB; \
+		(state)->C[12] = CC; \
+		(state)->C[13] = CD; \
+		(state)->C[14] = CE; \
+		(state)->C[15] = CF; \
+		(state)->Wlow = Wlow; \
+		(state)->Whigh = Whigh; \
+	} while (0)
+
+#define DECODE_BLOCK   do { /* fill M0..MF from the 64 bytes at `buf` via sph_dec32le_aligned, one word per 4-byte offset; `buf` must be in scope at the expansion site */ \
+		M0 = sph_dec32le_aligned(buf + 0); \
+		M1 = sph_dec32le_aligned(buf + 4); \
+		M2 = sph_dec32le_aligned(buf + 8); \
+		M3 = sph_dec32le_aligned(buf + 12); \
+		M4 = sph_dec32le_aligned(buf + 16); \
+		M5 = sph_dec32le_aligned(buf + 20); \
+		M6 = sph_dec32le_aligned(buf + 24); \
+		M7 = sph_dec32le_aligned(buf + 28); \
+		M8 = sph_dec32le_aligned(buf + 32); \
+		M9 = sph_dec32le_aligned(buf + 36); \
+		MA = sph_dec32le_aligned(buf + 40); \
+		MB = sph_dec32le_aligned(buf + 44); \
+		MC = sph_dec32le_aligned(buf + 48); \
+		MD = sph_dec32le_aligned(buf + 52); \
+		ME = sph_dec32le_aligned(buf + 56); \
+		MF = sph_dec32le_aligned(buf + 60); \
+	} while (0)
+/*
+ * Add each message word to the B word of the same index
+ * (B0 += M0 ... BF += MF). Every sum goes through T32(), which is
+ * defined outside this excerpt (presumably truncation to 32 bits).
+ */
+#define INPUT_BLOCK_ADD   do { \
+		B0 = T32(B0 + M0); \
+		B1 = T32(B1 + M1); \
+		B2 = T32(B2 + M2); \
+		B3 = T32(B3 + M3); \
+		B4 = T32(B4 + M4); \
+		B5 = T32(B5 + M5); \
+		B6 = T32(B6 + M6); \
+		B7 = T32(B7 + M7); \
+		B8 = T32(B8 + M8); \
+		B9 = T32(B9 + M9); \
+		BA = T32(BA + MA); \
+		BB = T32(BB + MB); \
+		BC = T32(BC + MC); \
+		BD = T32(BD + MD); \
+		BE = T32(BE + ME); \
+		BF = T32(BF + MF); \
+	} while (0)
+/*
+ * Subtract each message word from the C word of the same index
+ * (C0 -= M0 ... CF -= MF). Every difference goes through T32(), which
+ * is defined outside this excerpt (presumably truncation to 32 bits).
+ */
+#define INPUT_BLOCK_SUB   do { \
+		C0 = T32(C0 - M0); \
+		C1 = T32(C1 - M1); \
+		C2 = T32(C2 - M2); \
+		C3 = T32(C3 - M3); \
+		C4 = T32(C4 - M4); \
+		C5 = T32(C5 - M5); \
+		C6 = T32(C6 - M6); \
+		C7 = T32(C7 - M7); \
+		C8 = T32(C8 - M8); \
+		C9 = T32(C9 - M9); \
+		CA = T32(CA - MA); \
+		CB = T32(CB - MB); \
+		CC = T32(CC - MC); \
+		CD = T32(CD - MD); \
+		CE = T32(CE - ME); \
+		CF = T32(CF - MF); \
+	} while (0)
+/*
+ * XOR the two halves of the block counter into the first two A words:
+ * Wlow into A00 and Whigh into A01.
+ */
+#define XOR_W   do { \
+		A00 ^= Wlow; \
+		A01 ^= Whigh; \
+	} while (0)
+/*
+ * Exchange the values of two sph_u32 lvalues. The macro declares a
+ * block-local temporary called "tmp", so neither argument may itself
+ * be named "tmp".
+ */
+#define SWAP(v1, v2)   do { \
+		sph_u32 tmp = (v1); \
+		(v1) = (v2); \
+		(v2) = tmp; \
+	} while (0)
+/*
+ * Exchange the B and C working words pairwise (B0 with C0, ...,
+ * BF with CF).
+ */
+#define SWAP_BC   do { \
+		SWAP(B0, C0); \
+		SWAP(B1, C1); \
+		SWAP(B2, C2); \
+		SWAP(B3, C3); \
+		SWAP(B4, C4); \
+		SWAP(B5, C5); \
+		SWAP(B6, C6); \
+		SWAP(B7, C7); \
+		SWAP(B8, C8); \
+		SWAP(B9, C9); \
+		SWAP(BA, CA); \
+		SWAP(BB, CB); \
+		SWAP(BC, CC); \
+		SWAP(BD, CD); \
+		SWAP(BE, CE); \
+		SWAP(BF, CF); \
+	} while (0)
+/*
+ * One elementary step of the permutation. It updates two words:
+ *
+ *   xa0 = T32((xa0 ^ (R15(xa1) * 5) ^ xc) * 3)
+ *             ^ xb1 ^ (xb2 & ~xb3) ^ xm
+ *   xb0 = T32(~(R1(xb0) ^ xa0))        (uses the new xa0)
+ *
+ * where Rn(x) is written as (x << n) | (x >> (32 - n)), i.e. a left
+ * rotation by n bits for 32-bit operands. The arguments are neither
+ * parenthesized nor evaluated only once, so they must be plain
+ * variable names; xa0 and xb0 must be lvalues.
+ */
+#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm)   do { \
+		xa0 = T32((xa0 \
+			^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \
+			^ xc) * 3U) \
+			^ xb1 ^ (xb2 & ~xb3) ^ xm; \
+		xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \
+	} while (0)
+/*
+ * First of three passes over the B words. Step i (0..15) updates B[i]
+ * with message word M[i], reading B[(i+13)%16], B[(i+9)%16],
+ * B[(i+6)%16] and C[(8-i)%16]. The A word updated cycles through the
+ * twelve A words starting at A00 (so A00..A03 are updated twice), and
+ * the second argument is always the A word just before it in that
+ * cycle.
+ */
+#define PERM_STEP_0   do { \
+		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+/*
+ * Second of three passes over the B words. The B, C and M operands
+ * are the same as in PERM_STEP_0; only the A words differ: the cycle
+ * through the twelve A words starts at A04, which is where
+ * PERM_STEP_0 stopped (so A04..A07 are updated twice).
+ */
+#define PERM_STEP_1   do { \
+		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+/*
+ * Third of three passes over the B words. The B, C and M operands
+ * are the same as in PERM_STEP_0; only the A words differ: the cycle
+ * through the twelve A words starts at A08, which is where
+ * PERM_STEP_1 stopped (so A08..A0B are updated twice). Over the three
+ * passes every A word is updated four times.
+ */
+#define PERM_STEP_2   do { \
+		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+/*
+ * The permutation applied to the working state for each block, in
+ * three phases:
+ *  - every B word is rotated left by 17 bits;
+ *  - PERM_STEP_0, PERM_STEP_1 and PERM_STEP_2 are run in that order;
+ *  - 36 additions fold C words into the A words, three per A word.
+ * The message words M0..MF must already be set (see DECODE_BLOCK).
+ */
+#define APPLY_P   do { \
+		B0 = T32(B0 << 17) | (B0 >> 15); \
+		B1 = T32(B1 << 17) | (B1 >> 15); \
+		B2 = T32(B2 << 17) | (B2 >> 15); \
+		B3 = T32(B3 << 17) | (B3 >> 15); \
+		B4 = T32(B4 << 17) | (B4 >> 15); \
+		B5 = T32(B5 << 17) | (B5 >> 15); \
+		B6 = T32(B6 << 17) | (B6 >> 15); \
+		B7 = T32(B7 << 17) | (B7 >> 15); \
+		B8 = T32(B8 << 17) | (B8 >> 15); \
+		B9 = T32(B9 << 17) | (B9 >> 15); \
+		BA = T32(BA << 17) | (BA >> 15); \
+		BB = T32(BB << 17) | (BB >> 15); \
+		BC = T32(BC << 17) | (BC >> 15); \
+		BD = T32(BD << 17) | (BD >> 15); \
+		BE = T32(BE << 17) | (BE >> 15); \
+		BF = T32(BF << 17) | (BF >> 15); \
+		PERM_STEP_0; \
+		PERM_STEP_1; \
+		PERM_STEP_2; \
+		A0B = T32(A0B + C6); \
+		A0A = T32(A0A + C5); \
+		A09 = T32(A09 + C4); \
+		A08 = T32(A08 + C3); \
+		A07 = T32(A07 + C2); \
+		A06 = T32(A06 + C1); \
+		A05 = T32(A05 + C0); \
+		A04 = T32(A04 + CF); \
+		A03 = T32(A03 + CE); \
+		A02 = T32(A02 + CD); \
+		A01 = T32(A01 + CC); \
+		A00 = T32(A00 + CB); \
+		A0B = T32(A0B + CA); \
+		A0A = T32(A0A + C9); \
+		A09 = T32(A09 + C8); \
+		A08 = T32(A08 + C7); \
+		A07 = T32(A07 + C6); \
+		A06 = T32(A06 + C5); \
+		A05 = T32(A05 + C4); \
+		A04 = T32(A04 + C3); \
+		A03 = T32(A03 + C2); \
+		A02 = T32(A02 + C1); \
+		A01 = T32(A01 + C0); \
+		A00 = T32(A00 + CF); \
+		A0B = T32(A0B + CE); \
+		A0A = T32(A0A + CD); \
+		A09 = T32(A09 + CC); \
+		A08 = T32(A08 + CB); \
+		A07 = T32(A07 + CA); \
+		A06 = T32(A06 + C9); \
+		A05 = T32(A05 + C8); \
+		A04 = T32(A04 + C7); \
+		A03 = T32(A03 + C6); \
+		A02 = T32(A02 + C5); \
+		A01 = T32(A01 + C4); \
+		A00 = T32(A00 + C3); \
+	} while (0)
+/*
+ * Increment the block counter, which is kept as two words: Wlow is
+ * incremented, and when it wraps around to zero the carry is added
+ * to Whigh.
+ */
+#define INCR_W   do { \
+		if ((Wlow = T32(Wlow + 1)) == 0) \
+			Whigh = T32(Whigh + 1); \
+	} while (0)
+/*
+ * Precomputed initial state for the 192-bit output size: 12 words
+ * for A, 16 for B and 16 for C. shabal_init() copies these tables
+ * into the context when called with size 192.
+ */
+static const sph_u32 A_init_192[] = {
+	C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E),
+	C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465),
+	C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9)
+};
+
+static const sph_u32 B_init_192[] = {
+	C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824),
+	C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7),
+	C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319),
+	C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C)
+};
+
+static const sph_u32 C_init_192[] = {
+	C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B),
+	C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640),
+	C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3),
+	C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669)
+};
+/*
+ * Precomputed initial state for the 224-bit output size: 12 words
+ * for A, 16 for B and 16 for C. shabal_init() copies these tables
+ * into the context when called with size 224.
+ */
+static const sph_u32 A_init_224[] = {
+	C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B),
+	C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F),
+	C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061)
+};
+
+static const sph_u32 B_init_224[] = {
+	C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498),
+	C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5),
+	C32(0xFEEE44B6), C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0),
+	C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C)
+};
+
+static const sph_u32 C_init_224[] = {
+	C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD),
+	C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18),
+	C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2),
+	C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83)
+};
+/*
+ * Precomputed initial state for the 256-bit output size: 12 words
+ * for A, 16 for B and 16 for C. shabal_init() copies these tables
+ * into the context when called with size 256.
+ */
+static const sph_u32 A_init_256[] = {
+	C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
+	C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
+	C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
+};
+
+static const sph_u32 B_init_256[] = {
+	C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
+	C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
+	C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
+	C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
+};
+
+static const sph_u32 C_init_256[] = {
+	C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
+	C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
+	C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
+	C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
+};
+/*
+ * Precomputed initial state for the 384-bit output size: 12 words
+ * for A, 16 for B and 16 for C. shabal_init() copies these tables
+ * into the context when called with size 384.
+ */
+static const sph_u32 A_init_384[] = {
+	C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83),
+	C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF),
+	C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D)
+};
+
+static const sph_u32 B_init_384[] = {
+	C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F),
+	C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641),
+	C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8),
+	C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36)
+};
+
+static const sph_u32 C_init_384[] = {
+	C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399),
+	C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261),
+	C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C),
+	C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70)
+};
+/*
+ * Precomputed initial state for the 512-bit output size: 12 words
+ * for A, 16 for B and 16 for C. shabal_init() copies these tables
+ * into the context when called with size 512.
+ */
+static const sph_u32 A_init_512[] = {
+	C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
+	C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
+	C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
+};
+
+static const sph_u32 B_init_512[] = {
+	C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
+	C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
+	C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
+	C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
+};
+
+static const sph_u32 C_init_512[] = {
+	C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
+	C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
+	C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
+	C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
+};
+
+/* END -- automatically generated code. */
+/*
+ * Reset a context for a new computation of the given output size.
+ *
+ * cc     pointer to a sph_shabal_context
+ * size   output size in bits: 192, 224, 256, 384 or 512
+ *
+ * The A, B and C arrays are loaded from the matching tables, the
+ * block counter is set to 1 and the buffer is marked empty. Any
+ * other value of "size" makes the function return without touching
+ * the context.
+ */
+static void
+shabal_init(void *cc, unsigned size)
+{
+	/*
+	 * We have precomputed initial states for all the supported
+	 * output bit lengths.
+	 */
+	const sph_u32 *A_init, *B_init, *C_init;
+	sph_shabal_context *sc;
+
+	switch (size) {
+	case 192:
+		A_init = A_init_192;
+		B_init = B_init_192;
+		C_init = C_init_192;
+		break;
+	case 224:
+		A_init = A_init_224;
+		B_init = B_init_224;
+		C_init = C_init_224;
+		break;
+	case 256:
+		A_init = A_init_256;
+		B_init = B_init_256;
+		C_init = C_init_256;
+		break;
+	case 384:
+		A_init = A_init_384;
+		B_init = B_init_384;
+		C_init = C_init_384;
+		break;
+	case 512:
+		A_init = A_init_512;
+		B_init = B_init_512;
+		C_init = C_init_512;
+		break;
+	default:
+		return;	/* unsupported size: the context is left as it was */
+	}
+	sc = (sph_shabal_context *)cc;
+	memcpy(sc->A, A_init, sizeof sc->A);
+	memcpy(sc->B, B_init, sizeof sc->B);
+	memcpy(sc->C, C_init, sizeof sc->C);
+	sc->Wlow = 1;	/* the block counter (Whigh:Wlow) starts at 1 */
+	sc->Whigh = 0;
+	sc->ptr = 0;	/* no buffered input yet */
+}
+/*
+ * Absorb "len" bytes of input into the context.
+ *
+ * cc     pointer to a sph_shabal_context
+ * data   the input bytes
+ * len    number of input bytes (may be zero)
+ *
+ * Input is accumulated in sc->buf. Each time the buffer is full, the
+ * block is decoded and mixed into the state, and the block counter
+ * is incremented. Bytes that do not fill a block stay in the buffer;
+ * sc->ptr records how many there are, and it is always less than
+ * sizeof sc->buf when the function returns.
+ */
+static void
+shabal_core(void *cc, const unsigned char *data, size_t len)
+{
+	sph_shabal_context *sc;
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE
+
+	sc = (sph_shabal_context *)cc;
+	buf = sc->buf;
+	ptr = sc->ptr;
+
+	/*
+	 * We do not want to copy the state to local variables if the
+	 * amount of data is less than what is needed to complete the
+	 * current block. Note that it is anyway suboptimal to call
+	 * this method many times for small chunks of data.
+	 */
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;	/* room left in the buffer */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data += clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {	/* buffer full: process it */
+			DECODE_BLOCK;
+			INPUT_BLOCK_ADD;
+			XOR_W;
+			APPLY_P;
+			INPUT_BLOCK_SUB;
+			SWAP_BC;
+			INCR_W;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE(sc);
+	sc->ptr = ptr;
+}
+/*
+ * Pad and process the final block, write the digest and reset the
+ * context.
+ *
+ * cc           pointer to a sph_shabal_context
+ * ub           extra input bits, taken from the top of the byte
+ * n            number of extra bits (sph_shabal.h documents 0 to 7)
+ * dst          receives size_words * 4 bytes
+ * size_words   digest length in 32-bit words: 6, 7, 8, 12 or 16
+ *
+ * The byte after the buffered input receives the extra bits followed
+ * by a single 1 bit, and the rest of the block is zero-filled. That
+ * block is mixed in, then the permutation is applied three more
+ * times (each preceded by SWAP_BC and XOR_W) without incrementing
+ * the block counter. The digest is the last size_words words of B.
+ *
+ * For any other size_words the function returns after the final
+ * rounds without writing to dst or resetting the context; sc->buf
+ * has already been padded at that point.
+ *
+ * This relies on sc->ptr being less than sizeof sc->buf, which
+ * shabal_init() and shabal_core() both ensure.
+ */
+static void
+shabal_close(void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words)
+{
+	sph_shabal_context *sc;
+	unsigned char *buf;
+	size_t ptr;
+	int i;
+	unsigned z;
+	union {
+		unsigned char tmp_out[64];
+		sph_u32 dummy;	/* never accessed; aligns tmp_out like a sph_u32 */
+	} u;
+	size_t out_len;
+	DECL_STATE
+
+	sc = (sph_shabal_context *)cc;
+	buf = sc->buf;
+	ptr = sc->ptr;
+	z = 0x80 >> n;	/* the bit just below the n extra bits */
+	buf[ptr] = ((ub & -z) | z) & 0xFF;	/* top n bits of ub, then a 1 bit */
+	memset(buf + ptr + 1, 0, (sizeof sc->buf) - (ptr + 1));
+	READ_STATE(sc);
+	DECODE_BLOCK;
+	INPUT_BLOCK_ADD;
+	XOR_W;
+	APPLY_P;
+	for (i = 0; i < 3; i ++) {	/* three extra rounds, same counter */
+		SWAP_BC;
+		XOR_W;
+		APPLY_P;
+	}
+
+	/*
+	 * We just use our local variables; no need to go through
+	 * the state structure. In order to share some code, we
+	 * emit the relevant words into a temporary buffer, which
+	 * we finally copy into the destination array.
+	 */
+	switch (size_words) {
+	case 16:
+		sph_enc32le_aligned(u.tmp_out +  0, B0);
+		sph_enc32le_aligned(u.tmp_out +  4, B1);
+		sph_enc32le_aligned(u.tmp_out +  8, B2);
+		sph_enc32le_aligned(u.tmp_out + 12, B3);
+		/* fall through */
+	case 12:
+		sph_enc32le_aligned(u.tmp_out + 16, B4);
+		sph_enc32le_aligned(u.tmp_out + 20, B5);
+		sph_enc32le_aligned(u.tmp_out + 24, B6);
+		sph_enc32le_aligned(u.tmp_out + 28, B7);
+		/* fall through */
+	case 8:
+		sph_enc32le_aligned(u.tmp_out + 32, B8);
+		/* fall through */
+	case 7:
+		sph_enc32le_aligned(u.tmp_out + 36, B9);
+		/* fall through */
+	case 6:
+		sph_enc32le_aligned(u.tmp_out + 40, BA);
+		sph_enc32le_aligned(u.tmp_out + 44, BB);
+		sph_enc32le_aligned(u.tmp_out + 48, BC);
+		sph_enc32le_aligned(u.tmp_out + 52, BD);
+		sph_enc32le_aligned(u.tmp_out + 56, BE);
+		sph_enc32le_aligned(u.tmp_out + 60, BF);
+		break;
+	default:
+		return;
+	}
+	out_len = size_words << 2;	/* 4 bytes per word */
+	memcpy(dst, u.tmp_out + (sizeof u.tmp_out) - out_len, out_len);	/* tail of tmp_out */
+	shabal_init(sc, size_words << 5);	/* 32 bits per word: reset for reuse */
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Resets the context through shabal_init() with an output size of
+ * 192 bits.
+ */
+void
+sph_shabal192_init(void *cc)
+{
+	shabal_init(cc, 192);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Forwards to shabal_core(); input processing does not depend on the
+ * output size.
+ */
+void
+sph_shabal192(void *cc, const void *data, size_t len)
+{
+	shabal_core(cc, (const unsigned char*)data, len);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close() with no extra bits (ub = 0, n = 0)
+ * and a digest of 6 words (24 bytes).
+ */
+void
+sph_shabal192_close(void *cc, void *dst)
+{
+	shabal_close(cc, 0, 0, dst, 6);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close(), passing the extra bits "ub" and
+ * their count "n" unchanged, with a digest of 6 words (24 bytes).
+ */
+void
+sph_shabal192_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shabal_close(cc, ub, n, dst, 6);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Resets the context through shabal_init() with an output size of
+ * 224 bits.
+ */
+void
+sph_shabal224_init(void *cc)
+{
+	shabal_init(cc, 224);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Forwards to shabal_core(); input processing does not depend on the
+ * output size.
+ */
+void
+sph_shabal224(void *cc, const void *data, size_t len)
+{
+	shabal_core(cc, (const unsigned char*)data, len);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close() with no extra bits (ub = 0, n = 0)
+ * and a digest of 7 words (28 bytes).
+ */
+void
+sph_shabal224_close(void *cc, void *dst)
+{
+	shabal_close(cc, 0, 0, dst, 7);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close(), passing the extra bits "ub" and
+ * their count "n" unchanged, with a digest of 7 words (28 bytes).
+ */
+void
+sph_shabal224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shabal_close(cc, ub, n, dst, 7);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Resets the context through shabal_init() with an output size of
+ * 256 bits.
+ */
+void
+sph_shabal256_init(void *cc)
+{
+	shabal_init(cc, 256);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Forwards to shabal_core(); input processing does not depend on the
+ * output size.
+ */
+void
+sph_shabal256(void *cc, const void *data, size_t len)
+{
+	shabal_core(cc, (const unsigned char*)data, len);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close() with no extra bits (ub = 0, n = 0)
+ * and a digest of 8 words (32 bytes).
+ */
+void
+sph_shabal256_close(void *cc, void *dst)
+{
+	shabal_close(cc, 0, 0, dst, 8);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close(), passing the extra bits "ub" and
+ * their count "n" unchanged, with a digest of 8 words (32 bytes).
+ */
+void
+sph_shabal256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shabal_close(cc, ub, n, dst, 8);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Resets the context through shabal_init() with an output size of
+ * 384 bits.
+ */
+void
+sph_shabal384_init(void *cc)
+{
+	shabal_init(cc, 384);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Forwards to shabal_core(); input processing does not depend on the
+ * output size.
+ */
+void
+sph_shabal384(void *cc, const void *data, size_t len)
+{
+	shabal_core(cc, (const unsigned char*)data, len);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close() with no extra bits (ub = 0, n = 0)
+ * and a digest of 12 words (48 bytes).
+ */
+void
+sph_shabal384_close(void *cc, void *dst)
+{
+	shabal_close(cc, 0, 0, dst, 12);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close(), passing the extra bits "ub" and
+ * their count "n" unchanged, with a digest of 12 words (48 bytes).
+ */
+void
+sph_shabal384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shabal_close(cc, ub, n, dst, 12);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Resets the context through shabal_init() with an output size of
+ * 512 bits.
+ */
+void
+sph_shabal512_init(void *cc)
+{
+	shabal_init(cc, 512);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Forwards to shabal_core(); input processing does not depend on the
+ * output size.
+ */
+void
+sph_shabal512(void *cc, const void *data, size_t len)
+{
+	shabal_core(cc, (const unsigned char*)data, len);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close() with no extra bits (ub = 0, n = 0)
+ * and a digest of 16 words (64 bytes).
+ */
+void
+sph_shabal512_close(void *cc, void *dst)
+{
+	shabal_close(cc, 0, 0, dst, 16);
+}
+
+/*
+ * see sph_shabal.h
+ *
+ * Finishes through shabal_close(), passing the extra bits "ub" and
+ * their count "n" unchanged, with a digest of 16 words (64 bytes).
+ */
+void
+sph_shabal512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shabal_close(cc, ub, n, dst, 16);
+}
+#ifdef __cplusplus
+}
+#endif
--- a/sph/sph_shabal.h
+++ b/sph/sph_shabal.h
@ -0,0 +1,344 @@
				@@ -0,0 +1,344 @@
+/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
+/**
+ * Shabal interface. Shabal is a family of functions which differ by
+ * their output size; this implementation defines Shabal for output
+ * sizes 192, 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_shabal.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_SHABAL_H__
+#define SPH_SHABAL_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/**
+ * Output size (in bits) for Shabal-192.
+ */
+#define SPH_SIZE_shabal192   192
+
+/**
+ * Output size (in bits) for Shabal-224.
+ */
+#define SPH_SIZE_shabal224   224
+
+/**
+ * Output size (in bits) for Shabal-256.
+ */
+#define SPH_SIZE_shabal256   256
+
+/**
+ * Output size (in bits) for Shabal-384.
+ */
+#define SPH_SIZE_shabal384   384
+
+/**
+ * Output size (in bits) for Shabal-512.
+ */
+#define SPH_SIZE_shabal512   512
+
+/**
+ * This structure is a context for Shabal computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a Shabal computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running Shabal computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* pending input block; first field, for alignment */
+	size_t ptr;               /* number of input bytes currently held in buf */
+	sph_u32 A[12], B[16], C[16];  /* the three arrays of state words */
+	sph_u32 Whigh, Wlow;      /* high and low words of the block counter */
+#endif
+} sph_shabal_context;
+
+/**
+ * Type for a Shabal-192 context (identical to the common context).
+ */
+typedef sph_shabal_context sph_shabal192_context;
+
+/**
+ * Type for a Shabal-224 context (identical to the common context).
+ */
+typedef sph_shabal_context sph_shabal224_context;
+
+/**
+ * Type for a Shabal-256 context (identical to the common context).
+ */
+typedef sph_shabal_context sph_shabal256_context;
+
+/**
+ * Type for a Shabal-384 context (identical to the common context).
+ */
+typedef sph_shabal_context sph_shabal384_context;
+
+/**
+ * Type for a Shabal-512 context (identical to the common context).
+ */
+typedef sph_shabal_context sph_shabal512_context;
+
+/**
+ * Initialize a Shabal-192 context. This process performs no memory allocation.
+ *
+ * @param cc   the Shabal-192 context (pointer to a
+ *             <code>sph_shabal192_context</code>)
+ */
+void sph_shabal192_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Shabal-192 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shabal192(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Shabal-192 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (24 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Shabal-192 context
+ * @param dst   the destination buffer
+ */
+void sph_shabal192_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (24 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Shabal-192 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shabal192_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Shabal-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Shabal-224 context (pointer to a
+ *             <code>sph_shabal224_context</code>)
+ */
+void sph_shabal224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Shabal-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shabal224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Shabal-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Shabal-224 context
+ * @param dst   the destination buffer
+ */
+void sph_shabal224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Shabal-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shabal224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Shabal-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the Shabal-256 context (pointer to a
+ *             <code>sph_shabal256_context</code>)
+ */
+void sph_shabal256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Shabal-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shabal256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Shabal-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Shabal-256 context
+ * @param dst   the destination buffer
+ */
+void sph_shabal256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Shabal-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shabal256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Shabal-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Shabal-384 context (pointer to a
+ *             <code>sph_shabal384_context</code>)
+ */
+void sph_shabal384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Shabal-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shabal384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Shabal-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Shabal-384 context
+ * @param dst   the destination buffer
+ */
+void sph_shabal384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Shabal-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shabal384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Shabal-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Shabal-512 context (pointer to a
+ *             <code>sph_shabal512_context</code>)
+ */
+void sph_shabal512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Shabal-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shabal512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Shabal-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Shabal-512 context
+ * @param dst   the destination buffer
+ */
+void sph_shabal512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Shabal-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shabal512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif	
+	
+#endif
--- a/sph/sph_whirlpool.h
+++ b/sph/sph_whirlpool.h
@ -0,0 +1,218 @@
				@@ -0,0 +1,218 @@
+/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * WHIRLPOOL interface.
+ *
+ * WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
+ * version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
+ * (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
+ * version, 2003, with a new diffusion matrix, also described as "plain
+ * WHIRLPOOL"). All three variants are implemented here.
+ *
+ * The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
+ * M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
+ * NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
+ *
+ * The current WHIRLPOOL specification and a reference implementation
+ * can be found on the WHIRLPOOL web page:
+ * http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_whirlpool.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_WHIRLPOOL_H__
+#define SPH_WHIRLPOOL_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for WHIRLPOOL.
+ */
+#define SPH_SIZE_whirlpool   512
+
+/**
+ * Output size (in bits) for WHIRLPOOL-0.
+ */
+#define SPH_SIZE_whirlpool0   512
+
+/**
+ * Output size (in bits) for WHIRLPOOL-1.
+ */
+#define SPH_SIZE_whirlpool1   512
+
+/**
+ * This structure is a context for WHIRLPOOL computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a WHIRLPOOL computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running WHIRLPOOL computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment; presumably the buffered input block -- confirm in whirlpool.c */
+	sph_u64 state[8]; /* presumably the intermediate hash values -- confirm in whirlpool.c */
+#if SPH_64 /* always true at this point: the typedef is already enclosed by the outer "#if SPH_64" */
+	sph_u64 count; /* presumably the running input length -- confirm in whirlpool.c */
+#else /* never compiled, since the enclosing "#if SPH_64" already excludes the non-64-bit case */
+	sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_whirlpool_context;
+
+/**
+ * Initialize a WHIRLPOOL context. This process performs no memory allocation.
+ *
+ * @param cc   the WHIRLPOOL context (pointer to a
+ *             <code>sph_whirlpool_context</code>)
+ */
+void sph_whirlpool_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing). This function applies the
+ * plain WHIRLPOOL algorithm.
+ *
+ * @param cc     the WHIRLPOOL context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_whirlpool(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current WHIRLPOOL computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the WHIRLPOOL context
+ * @param dst   the destination buffer
+ */
+void sph_whirlpool_close(void *cc, void *dst);
+
+/**
+ * WHIRLPOOL-0 uses the same structure as plain WHIRLPOOL.
+ */
+typedef sph_whirlpool_context sph_whirlpool0_context;
+
+#ifdef DOXYGEN_IGNORE
+/**
+ * Initialize a WHIRLPOOL-0 context. This function is identical to
+ * <code>sph_whirlpool_init()</code>.
+ *
+ * @param cc   the WHIRLPOOL-0 context (pointer to a
+ *             <code>sph_whirlpool0_context</code>)
+ */
+void sph_whirlpool0_init(void *cc);
+#endif
+
+#ifndef DOXYGEN_IGNORE
+#define sph_whirlpool0_init   sph_whirlpool_init
+#endif
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing). This function applies the
+ * WHIRLPOOL-0 algorithm.
+ *
+ * @param cc     the WHIRLPOOL-0 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_whirlpool0(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current WHIRLPOOL-0 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the WHIRLPOOL-0 context
+ * @param dst   the destination buffer
+ */
+void sph_whirlpool0_close(void *cc, void *dst);
+
+/**
+ * WHIRLPOOL-1 uses the same structure as plain WHIRLPOOL.
+ */
+typedef sph_whirlpool_context sph_whirlpool1_context;
+
+#ifdef DOXYGEN_IGNORE
+/**
+ * Initialize a WHIRLPOOL-1 context. This function is identical to
+ * <code>sph_whirlpool_init()</code>.
+ *
+ * @param cc   the WHIRLPOOL-1 context (pointer to a
+ *             <code>sph_whirlpool1_context</code>)
+ */
+void sph_whirlpool1_init(void *cc);
+#endif
+
+#ifndef DOXYGEN_IGNORE
+#define sph_whirlpool1_init   sph_whirlpool_init
+#endif
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing). This function applies the
+ * WHIRLPOOL-1 algorithm.
+ *
+ * @param cc     the WHIRLPOOL-1 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_whirlpool1(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current WHIRLPOOL-1 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the WHIRLPOOL-1 context
+ * @param dst   the destination buffer
+ */
+void sph_whirlpool1_close(void *cc, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
--- a/sph/whirlpool.c
+++ b/sph/whirlpool.c
--- a/winbuild/sgminer.vcxproj
+++ b/winbuild/sgminer.vcxproj
@ -1,4 +1,4 @@
				@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@ -28,23 +28,27 @@
				@@ -28,23 +28,27 @@
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
@ -257,7 +261,9 @@
				@@ -257,7 +261,9 @@
    <ClCompile Include="..\adl.c" />
    <ClCompile Include="..\algorithm.c" />
    <ClCompile Include="..\algorithm\animecoin.c" />
+    <ClCompile Include="..\algorithm\bitblock.c" />
    <ClCompile Include="..\algorithm\talkcoin.c" />
+    <ClCompile Include="..\algorithm\x14.c" />
    <ClCompile Include="..\api.c" />
    <ClCompile Include="..\ccan\opt\helpers.c" />
    <ClCompile Include="..\ccan\opt\opt.c" />
@ -310,17 +316,21 @@
				@@ -310,17 +316,21 @@
    <ClCompile Include="..\sph\panama.c" />
    <ClCompile Include="..\sph\sha2.c" />
    <ClCompile Include="..\sph\sha2big.c" />
+    <ClCompile Include="..\sph\shabal.c" />
    <ClCompile Include="..\sph\shavite.c" />
    <ClCompile Include="..\sph\simd.c" />
    <ClCompile Include="..\sph\skein.c" />
    <ClCompile Include="..\algorithm\twecoin.c" />
+    <ClCompile Include="..\sph\whirlpool.c" />
    <ClCompile Include="..\util.c" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\adl.h" />
    <ClInclude Include="..\algorithm.h" />
    <ClInclude Include="..\algorithm\animecoin.h" />
+    <ClInclude Include="..\algorithm\bitblock.h" />
    <ClInclude Include="..\algorithm\talkcoin.h" />
+    <ClInclude Include="..\algorithm\x14.h" />
    <ClInclude Include="..\api.h" />
    <ClInclude Include="..\arg-nonnull.h" />
    <ClInclude Include="..\bench_block.h" />
@ -365,11 +375,13 @@
				@@ -365,11 +375,13 @@
    <ClInclude Include="..\sph\sph_luffa.h" />
    <ClInclude Include="..\sph\sph_panama.h" />
    <ClInclude Include="..\sph\sph_sha2.h" />
+    <ClInclude Include="..\sph\sph_shabal.h" />
    <ClInclude Include="..\sph\sph_shavite.h" />
    <ClInclude Include="..\sph\sph_simd.h" />
    <ClInclude Include="..\sph\sph_skein.h" />
    <ClInclude Include="..\sph\sph_types.h" />
    <ClInclude Include="..\algorithm\twecoin.h" />
+    <ClInclude Include="..\sph\sph_whirlpool.h" />
    <ClInclude Include="..\uthash.h" />
    <ClInclude Include="..\util.h" />
    <ClInclude Include="..\warn-on-use.h" />
@ -382,4 +394,4 @@
				@@ -382,4 +394,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/winbuild/sgminer.vcxproj.filters
+++ b/winbuild/sgminer.vcxproj.filters
@ -1,4 +1,4 @@
				@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <Filter Include="Source Files">
@ -218,6 +218,18 @@
				@@ -218,6 +218,18 @@
    <ClCompile Include="..\algorithm\talkcoin.c">
      <Filter>Source Files\algorithm</Filter>
    </ClCompile>
+    <ClCompile Include="..\sph\shabal.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\sph\whirlpool.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\algorithm\bitblock.c">
+      <Filter>Source Files\algorithm</Filter>
+    </ClCompile>
+    <ClCompile Include="..\algorithm\x14.c">
+      <Filter>Source Files\algorithm</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\adl.h">
@ -394,8 +406,20 @@
				@@ -394,8 +406,20 @@
    <ClInclude Include="..\algorithm\talkcoin.h">
      <Filter>Header Files\algorithm</Filter>
    </ClInclude>
+    <ClInclude Include="..\sph\sph_shabal.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\sph\sph_whirlpool.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\algorithm\bitblock.h">
+      <Filter>Header Files\algorithm</Filter>
+    </ClInclude>
+    <ClInclude Include="..\algorithm\x14.h">
+      <Filter>Header Files\algorithm</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="README.txt" />
  </ItemGroup>
-</Project>
+</Project>