X14 Implementation

Added X14 and cleaned up the X13/X15 kernels so that all three offshoots are in sync. New option: "--hamsi-short" on the command line, or "hamsi-short":true in the configuration, adds a small boost. It may not work on all GPUs.
11 years ago · 5c9126fd61
16 changed files with 6575 additions and 2890 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -63,6 +63,7 @@ sgminer_SOURCES += algorithm/marucoin.c algorithm/marucoin.h
				@@ -63,6 +63,7 @@ sgminer_SOURCES += algorithm/marucoin.c algorithm/marucoin.h
 sgminer_SOURCES += algorithm/maxcoin.c algorithm/maxcoin.h
 sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
 sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h
+sgminer_SOURCES += algorithm/x14.c algorithm/x14.h

 bin_SCRIPTS	= $(top_srcdir)/kernel/*.cl

--- a/algorithm.c
+++ b/algorithm.c
@ -27,6 +27,7 @@
				@@ -27,6 +27,7 @@
 #include "algorithm/maxcoin.h"
 #include "algorithm/talkcoin.h"
 #include "algorithm/bitblock.h"
+#include "algorithm/x14.h"

 #include "compat.h"

@ -40,6 +41,8 @@ const char *algorithm_type_str[] = {
				@@ -40,6 +41,8 @@ const char *algorithm_type_str[] = {
  "NScrypt",
  "X11",
  "X13",
+  "X14",
+  "X15",
  "Keccak",
  "Quarkcoin",
  "Twecoin",
@ -91,11 +94,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
				@@ -91,11 +94,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
 /*
  * Kernel-build hook used by the Hamsi-based algorithm entries: appends the
  * Hamsi preprocessor defines to data->compiler_options and a matching
  * suffix to data->binary_filename.  The cgpu and algorithm parameters are
  * not referenced in the body.
  *
  * With the lines added by this change, a non-zero opt_hamsi_short also
  * appends " -D SPH_HAMSI_SHORT=1 " to the compiler options and "hs" to the
  * binary name (presumably so that binaries built with and without the
  * option get distinct names -- confirm against the kernel cache code).
  *
  * NOTE(review): strcat() does not bound the destination; the capacity of
  * compiler_options / binary_filename is not visible from here.
  */
 static void append_hamsi_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
 {
  char buf[255]; /* scratch buffer, reused for both formatted fragments */
-  sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d",
-          opt_hamsi_expand_big);
+  sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d%s ",
+          opt_hamsi_expand_big, ((opt_hamsi_short)?" -D SPH_HAMSI_SHORT=1 ":""));
  strcat(data->compiler_options, buf);

-  sprintf(buf, "big%u", (unsigned int)opt_hamsi_expand_big);
+  sprintf(buf, "big%u%s", (unsigned int)opt_hamsi_expand_big, ((opt_hamsi_short)?"hs":""));
  strcat(data->binary_filename, buf);
 }

@ -419,6 +422,100 @@ static cl_int queue_talkcoin_mod_kernel(struct __clState *clState, struct _dev_b
				@@ -419,6 +422,100 @@ static cl_int queue_talkcoin_mod_kernel(struct __clState *clState, struct _dev_b
  return status;
 }

+/*
+ * Stage one unit of X14 work on the device and bind the arguments of the
+ * chained kernels "search" .. "search13" (blake through shabal).
+ *
+ * - The 80 bytes that flip80() produces from blk->work->data are written to
+ *   CLbuffer0 with a blocking clEnqueueWriteBuffer.
+ * - "search" (clState->kernel) receives CLbuffer0 and padbuffer8.
+ * - "search1" .. "search12" each receive padbuffer8 as their only argument.
+ * - "search13" receives padbuffer8, outputBuffer and the 64-bit target word.
+ *
+ * The CL_SET_ARG* / CL_NEXTKERNEL_SET_ARG* macros are defined outside this
+ * function.  They appear to operate on the locals `kernel`, `num` and
+ * `status` (presumably advancing `kernel` through clState->extra_kernels),
+ * so those names and the call order must be kept -- confirm against the
+ * macro definitions.  `threads` is unused.
+ *
+ * Returns `status`: the clEnqueueWriteBuffer result, as possibly updated by
+ * the argument macros.
+ */
+static cl_int queue_x14_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num;
+  cl_ulong le_target;
+  cl_int status = 0;
+
+  /*
+   * Read the target word from device_target + 24 with memcpy rather than
+   * through a cl_ulong pointer cast: same bytes, but no strict-aliasing
+   * violation and no dependence on the buffer's alignment.
+   */
+  memcpy(&le_target, blk->work->device_target + 24, sizeof(le_target));
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  // bmw - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->padbuffer8);
+  // groestl - search2
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // skein - search3
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // jh - search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // keccak - search5
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // luffa - search6
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // cubehash - search7
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shavite - search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // simd - search9
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // echo - search10
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // hamsi - search11
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // fugue - search12
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shabal - search13 (final stage: also gets the output buffer and target)
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
+/*
+ * Variant of the X14 queue routine for the older kernel layout, in which
+ * the last stages are fused: "search" .. "search9" run blake through simd
+ * one algorithm per kernel, and "search10" runs the combined
+ * echo/hamsi/fugue/shabal stage.
+ *
+ * - The 80 bytes that flip80() produces from blk->work->data are written to
+ *   CLbuffer0 with a blocking clEnqueueWriteBuffer.
+ * - "search" (clState->kernel) receives CLbuffer0 and padbuffer8.
+ * - "search1" .. "search9" each receive padbuffer8 as their only argument.
+ * - "search10" receives padbuffer8, outputBuffer and the 64-bit target word.
+ *
+ * The CL_SET_ARG* / CL_NEXTKERNEL_SET_ARG* macros are defined outside this
+ * function.  They appear to operate on the locals `kernel`, `num` and
+ * `status`, so those names and the call order must be kept -- confirm
+ * against the macro definitions.  `threads` is unused.
+ *
+ * Returns `status`: the clEnqueueWriteBuffer result, as possibly updated by
+ * the argument macros.
+ */
+static cl_int queue_x14_old_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  unsigned int num;
+  cl_ulong le_target;
+  cl_int status = 0;
+
+  /*
+   * Read the target word from device_target + 24 with memcpy rather than
+   * through a cl_ulong pointer cast: same bytes, but no strict-aliasing
+   * violation and no dependence on the buffer's alignment.
+   */
+  memcpy(&le_target, blk->work->device_target + 24, sizeof(le_target));
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  // bmw - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->padbuffer8);
+  // groestl - search2
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // skein - search3
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // jh - search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // keccak - search5
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // luffa - search6
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // cubehash - search7
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shavite - search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // simd - search9
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // combined echo, hamsi, fugue - shabal - search10
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
 typedef struct _algorithm_settings_t {
  const char *name; /* Human-readable identifier */
  algorithm_type_t type; //common algorithm type
@ -477,8 +574,13 @@ static algorithm_settings_t algos[] = {
				@@ -477,8 +574,13 @@ static algorithm_settings_t algos[] = {
  { "marucoin-mod", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_hamsi_compiler_options},
  { "marucoin-modold", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_hamsi_compiler_options},

-  { "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4,  8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL},
+  { "x14", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_hamsi_compiler_options},
+  { "x14old", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_hamsi_compiler_options},

+  { "bitblock", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_hamsi_compiler_options},
+  { "bitblockold", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_hamsi_compiler_options},
+
+  { "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4,  8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL},
  // kernels starting from this will have difficulty calculated by using fuguecoin algorithm
 #define A_FUGUE(a, b) \
    { a, ALGO_FUGUE, 1, 256, 256, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL}
--- a/algorithm.h
+++ b/algorithm.h
@ -16,6 +16,8 @@ typedef enum {
				@@ -16,6 +16,8 @@ typedef enum {
  ALGO_NSCRYPT,
  ALGO_X11,
  ALGO_X13,
+  ALGO_X14,
+  ALGO_X15,
  ALGO_KECCAK,
  ALGO_QUARK,
  ALGO_TWE,
--- a/algorithm/x14.c
+++ b/algorithm/x14.c
@ -0,0 +1,247 @@
				@@ -0,0 +1,247 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
+
+/*
+ * One sph context per X14 stage, bundled into a single struct so that a set
+ * of freshly initialised contexts can be duplicated with one memcpy instead
+ * of fourteen separate init calls.
+ * NOTE(review): x14hash() in this file still calls init_X14hash_contexts()
+ * on every invocation, so the intended "init once" saving is not realised
+ * as the code is written.
+ */
+typedef struct {
+  sph_blake512_context    blake1;
+  sph_bmw512_context      bmw1;
+  sph_groestl512_context  groestl1;
+  sph_skein512_context    skein1;
+  sph_jh512_context       jh1;
+  sph_keccak512_context   keccak1;
+  sph_luffa512_context    luffa1;
+  sph_cubehash512_context cubehash1;
+  sph_shavite512_context  shavite1;
+  sph_simd512_context     simd1;
+  sph_echo512_context     echo1;
+  sph_hamsi512_context    hamsi1;
+  sph_fugue512_context    fugue1;
+  sph_shabal512_context	shabal1;
+} Xhash_context_holder;
+
+static Xhash_context_holder base_contexts; /* template: filled by init_X14hash_contexts(), copied by x14hash() */
+
+/*
+ * Put every context in the file-scope template `base_contexts` into its
+ * initial state by calling the matching sph_*512_init() on each member.
+ * Takes no arguments and returns nothing; its only effect is to overwrite
+ * `base_contexts`.
+ *
+ * Declared with (void) so the definition is a proper prototype.
+ */
+void init_X14hash_contexts(void)
+{
+  sph_blake512_init(&base_contexts.blake1);
+  sph_bmw512_init(&base_contexts.bmw1);
+  sph_groestl512_init(&base_contexts.groestl1);
+  sph_skein512_init(&base_contexts.skein1);
+  sph_jh512_init(&base_contexts.jh1);
+  sph_keccak512_init(&base_contexts.keccak1);
+  sph_luffa512_init(&base_contexts.luffa1);
+  sph_cubehash512_init(&base_contexts.cubehash1);
+  sph_shavite512_init(&base_contexts.shavite1);
+  sph_simd512_init(&base_contexts.simd1);
+  sph_echo512_init(&base_contexts.echo1);
+  sph_hamsi512_init(&base_contexts.hamsi1);
+  sph_fugue512_init(&base_contexts.fugue1);
+  sph_shabal512_init(&base_contexts.shabal1);
+}
+
+/*
+ * Convert `len` 32-bit words from host byte order to big-endian: for each
+ * i in [0, len), dst[i] = htobe32(src[i]).
+ *
+ * `len` is a count of uint32_t elements, not bytes (the callers in this
+ * file pass 19 for the first 19 words of an 80-byte header).  Both `dst`
+ * and `src` must therefore hold at least `len` words.
+ */
+static inline void be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+	uint32_t i;
+
+  for (i = 0; i < len; i++)
+		dst[i] = htobe32(src[i]);
+}
+
+
+/*
+ * Compute the X14 chained hash.
+ *
+ * Reads 80 bytes from `input`, runs them through fourteen 512-bit hash
+ * functions in sequence (each stage hashing the 64-byte digest of the
+ * previous one), and copies the first 32 bytes of the final digest to
+ * `state`.
+ *
+ * Defined without `inline`: a non-static `inline` definition with no
+ * `extern` declaration provides no external definition under C99/C11, which
+ * can leave calls unresolved at link time when the compiler chooses not to
+ * inline.  A plain external definition behaves the same under every
+ * standard mode.
+ */
+void x14hash(void *state, const void *input)
+{
+  Xhash_context_holder ctx;
+  uint32_t hashA[16], hashB[16];
+
+  /*
+   * Refresh the shared template, then take a private copy to work on.
+   * NOTE(review): this rewrites the file-scope `base_contexts` on every
+   * call; if x14hash() can run on several threads at once, those writes
+   * are unsynchronised -- confirm against the callers.
+   */
+  init_X14hash_contexts();
+  memcpy(&ctx, &base_contexts, sizeof(base_contexts));
+
+  /* blake-bmw-groestl-skein-jh-keccak-luffa-cubehash-shavite-simd-echo-hamsi-fugue-shabal */
+  sph_blake512(&ctx.blake1, input, 80);
+  sph_blake512_close(&ctx.blake1, hashA);
+
+  sph_bmw512(&ctx.bmw1, hashA, 64);
+  sph_bmw512_close(&ctx.bmw1, hashB);
+
+  sph_groestl512(&ctx.groestl1, hashB, 64);
+  sph_groestl512_close(&ctx.groestl1, hashA);
+
+  sph_skein512(&ctx.skein1, hashA, 64);
+  sph_skein512_close(&ctx.skein1, hashB);
+
+  sph_jh512(&ctx.jh1, hashB, 64);
+  sph_jh512_close(&ctx.jh1, hashA);
+
+  sph_keccak512(&ctx.keccak1, hashA, 64);
+  sph_keccak512_close(&ctx.keccak1, hashB);
+
+  sph_luffa512(&ctx.luffa1, hashB, 64);
+  sph_luffa512_close(&ctx.luffa1, hashA);
+
+  sph_cubehash512(&ctx.cubehash1, hashA, 64);
+  sph_cubehash512_close(&ctx.cubehash1, hashB);
+
+  sph_shavite512(&ctx.shavite1, hashB, 64);
+  sph_shavite512_close(&ctx.shavite1, hashA);
+
+  sph_simd512(&ctx.simd1, hashA, 64);
+  sph_simd512_close(&ctx.simd1, hashB);
+
+  sph_echo512(&ctx.echo1, hashB, 64);
+  sph_echo512_close(&ctx.echo1, hashA);
+
+  sph_hamsi512(&ctx.hamsi1, hashA, 64);
+  sph_hamsi512_close(&ctx.hamsi1, hashB);
+
+  sph_fugue512(&ctx.fugue1, hashB, 64);
+  sph_fugue512_close(&ctx.fugue1, hashA);
+
+  sph_shabal512(&ctx.shabal1, (const unsigned char*)hashA, 64);
+  sph_shabal512_close(&ctx.shabal1, hashB);
+
+  /* Only the first 256 bits of the last 512-bit digest are returned. */
+  memcpy(state, hashB, 32);
+}
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+/*
+ * Host-side re-check of a nonce, used externally to confirm that the OpenCL
+ * code produced a correct result.
+ *
+ * Builds a 20-word header from the first 19 words of `pdata` (converted with
+ * be32enc_vect) plus htobe32(nonce), hashes it with x14hash(), and compares
+ * word 7 of the digest (after be32toh) against two thresholds.
+ *
+ * Returns:
+ *   -1  the hash word is above diff1targ,
+ *    0  the hash word is within diff1targ but above word 7 of `ptarget`,
+ *    1  the hash word is at or below word 7 of `ptarget`.
+ */
+int x14_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+  const uint32_t target_top = le32toh(((const uint32_t *)ptarget)[7]);
+  uint32_t header[20];
+  uint32_t digest[8];
+  uint32_t hash_top;
+  int verdict;
+
+  be32enc_vect(header, (const uint32_t *)pdata, 19);
+  header[19] = htobe32(nonce);
+  x14hash(digest, header);
+  hash_top = be32toh(digest[7]);
+
+  applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+         (long unsigned int)target_top,
+         (long unsigned int)diff1targ,
+         (long unsigned int)hash_top);
+
+  if (hash_top > diff1targ)
+    verdict = -1;
+  else if (hash_top > target_top)
+    verdict = 0;
+  else
+    verdict = 1;
+
+  return verdict;
+}
+
+/*
+ * Recompute the X14 hash of a work item on the host.
+ *
+ * The first 19 words of work->data are converted with be32enc_vect, the
+ * 32-bit value found at work->data + 76 is appended as htobe32(nonce), and
+ * the 32-byte result of x14hash() is written to work->hash.
+ */
+void x14_regenhash(struct work *work)
+{
+  uint32_t header[20];
+  const uint32_t *nonce_ptr = (const uint32_t *)(work->data + 76);
+  uint32_t *hash_out = (uint32_t *)(work->hash);
+
+  be32enc_vect(header, (const uint32_t *)work->data, 19);
+  header[19] = htobe32(*nonce_ptr);
+
+  x14hash(hash_out, header);
+}
+
+/*
+ * Store the 32-bit value `x` into the four bytes at `pp`, most significant
+ * byte first (big-endian), regardless of host byte order.
+ */
+static inline void be32enc(void *pp, uint32_t x)
+{
+  uint8_t *out = (uint8_t *)pp;
+  unsigned int shift = 32;
+  unsigned int idx;
+
+  /* Walk from the top byte (shift 24) down to the bottom byte (shift 0). */
+  for (idx = 0; idx < 4; idx++) {
+    shift -= 8;
+    out[idx] = (uint8_t)((x >> shift) & 0xff);
+  }
+}
+
+/*
+ * CPU nonce scan for X14.
+ *
+ * Starting from n + 1, stores each candidate nonce into the 32-bit slot at
+ * pdata + 76, hashes the 20-word header with x14hash(), and compares word 7
+ * of the digest with word 7 of `ptarget` (read via le32toh).
+ *
+ * Returns true when a hash word at or below the target is found; in that
+ * case word 19 of `pdata` is overwritten with htobe32(n).  Returns false
+ * when n reaches max_nonce or thr->work_restart becomes non-zero.  In every
+ * case *last_nonce receives the last nonce tried.  pmidstate, phash1 and
+ * phash are unused.
+ *
+ * NOTE(review): unlike x14_test(), the nonce is placed in the header and the
+ * digest word is compared without htobe32()/be32toh() -- confirm which
+ * convention is intended.
+ * NOTE(review): the LOG_INFO line is emitted once per nonce and prints
+ * header word 7, which does not change inside the loop -- looks like
+ * leftover debugging output.
+ */
+bool scanhash_x14(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+		     unsigned char *pdata, unsigned char __maybe_unused *phash1,
+		     unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+		     uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+  uint32_t *nonce_slot = (uint32_t *)(pdata + 76);
+  const uint32_t target_top = le32toh(((const uint32_t *)ptarget)[7]);
+  uint32_t header[20];
+  uint32_t digest[8];
+
+  be32enc_vect(header, (const uint32_t *)pdata, 19);
+
+  for (;;) {
+    uint32_t hash_top;
+
+    n++;
+    *nonce_slot = n;
+    header[19] = n;
+    x14hash(digest, header);
+    hash_top = digest[7];
+
+    applog(LOG_INFO, "data7 %08lx", (long unsigned int)header[7]);
+
+    /* Hit: record the winning nonce in the caller's buffer and stop. */
+    if (unlikely(hash_top <= target_top)) {
+      ((uint32_t *)pdata)[19] = htobe32(n);
+      *last_nonce = n;
+      return true;
+    }
+
+    /* Range exhausted or new work arrived: report where we stopped. */
+    if (unlikely((n >= max_nonce) || thr->work_restart)) {
+      *last_nonce = n;
+      return false;
+    }
+  }
+}
+
+
--- a/algorithm/x14.h
+++ b/algorithm/x14.h
@ -0,0 +1,10 @@
				@@ -0,0 +1,10 @@
+#ifndef X14_H
+#define X14_H
+
+#include "miner.h"
+/*
+ * x14_test: host-side check of `nonce` against the header in `pdata` and
+ * the target in `ptarget`.  Returns -1 when the hash word exceeds the
+ * difficulty-1 threshold, 0 when it is within that threshold but above the
+ * target, and 1 when it meets the target.
+ */
+extern int x14_test(unsigned char *pdata, const unsigned char *ptarget,
+			uint32_t nonce);
+extern void x14_regenhash(struct work *work); /* recomputes the X14 hash of work->data into work->hash */
+
+#endif /* X14_H */
--- a/kernel/bitblock.cl
+++ b/kernel/bitblock.cl
@ -72,13 +72,17 @@ typedef int sph_s32;
				@@ -72,13 +72,17 @@ typedef int sph_s32;
 #define SPH_SIMD_NOCOPY 0
 #define SPH_KECCAK_NOCOPY 0
 #define SPH_COMPACT_BLAKE_64 0
-#define SPH_LUFFA_PARALLEL 1
+#define SPH_LUFFA_PARALLEL 0
 #define SPH_SMALL_FOOTPRINT_GROESTL 0
 #define SPH_GROESTL_BIG_ENDIAN 0
 #define SPH_CUBEHASH_UNROLL 0
 #define SPH_KECCAK_UNROLL   1
+#ifndef SPH_HAMSI_EXPAND_BIG
 #define SPH_HAMSI_EXPAND_BIG 1
-#define SPH_HAMSI_SHORT 1
+#endif
+#ifndef SPH_HAMSI_SHORT
+  #define SPH_HAMSI_SHORT 1
+#endif

 #include "blake.cl"
 #include "bmw.cl"
@ -126,10 +130,10 @@ typedef union {
				@@ -126,10 +130,10 @@ typedef union {
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search(__global unsigned char* block, __global hash_t* hashes)
 {
-  uint gid = get_global_id(0);
-  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
-  // blake
+    uint gid = get_global_id(0);
+    __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

+  // blake
  sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
  sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
  sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -144,6 +148,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
				@@ -144,6 +148,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
  sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
  sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
  sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
+  
  M0 = DEC64BE(block + 0);
  M1 = DEC64BE(block + 8);
  M2 = DEC64BE(block + 16);
@ -180,18 +185,18 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
				@@ -180,18 +185,18 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search1(__global hash_t* hashes)
 {
-  uint gid = get_global_id(0);
+ uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

  // bmw
  sph_u64 BMW_H[16];
  
-  #pragma unroll 16    
+#pragma unroll 16  
  for(unsigned u = 0; u < 16; u++)
    BMW_H[u] = BMW_IV512[u];

  sph_u64 mv[16],q[32];
- 	sph_u64 tmp;
+	sph_u64 tmp;
  
  mv[0] = SWAP8(hash->h8[0]);
  mv[1] = SWAP8(hash->h8[1]);
@ -211,7 +216,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -211,7 +216,7 @@ __kernel void search1(__global hash_t* hashes)
  mv[15] = SPH_C64(512);
  
  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
-  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
@ -221,7 +226,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -221,7 +226,7 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
-  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
@ -231,7 +236,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -231,7 +236,7 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
-  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
@ -243,60 +248,60 @@ __kernel void search1(__global hash_t* hashes)
				@@ -243,60 +248,60 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
  
-  #pragma unroll 2
+#pragma unroll 2
  for(int i=0;i<2;i++)
  {
    q[i+16] =
-    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
-    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
-    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
-    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3],  4) ^ SPH_ROTL64(q[i+3], 37)) +
-    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
-    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
-    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
-    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7],  4) ^ SPH_ROTL64(q[i+7], 37)) +
-    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
-    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
-    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
-    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11],  4) ^ SPH_ROTL64(q[i+11], 37)) +
-    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
-    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
-    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
-    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15],  4) ^ SPH_ROTL64(q[i+15], 37)) +
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+      (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+      (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+      (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+      (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+      (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+      (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+      (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+      (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+      (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+      (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+      (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+      (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+      (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+      (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+      (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+      (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }
  
-  #pragma unroll 4
+#pragma unroll 4
  for(int i=2;i<6;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }
  
-  #pragma unroll 3
+#pragma unroll 3
  for(int i=6;i<9;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
  }

-  #pragma unroll 4
+#pragma unroll 4
  for(int i=9;i<13;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }

-  #pragma unroll 3
+#pragma unroll 3
  for(int i=13;i<16;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }

  sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
@ -320,7 +325,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -320,7 +325,7 @@ __kernel void search1(__global hash_t* hashes)
  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);

-  #pragma unroll 16
+#pragma unroll 16
  for(int i=0;i<16;i++) 
  {
    mv[i] = BMW_H[i];
@ -328,7 +333,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -328,7 +333,7 @@ __kernel void search1(__global hash_t* hashes)
  }

  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
-  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
@ -338,7 +343,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -338,7 +343,7 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
-  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
@ -348,7 +353,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -348,7 +353,7 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
-  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp,  4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
@@ -360,64 +365,65 @@ __kernel void search1(__global hash_t* hashes)
  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
 
-  #pragma unroll 2
+#pragma unroll 2
  for(int i=0;i<2;i++)
  {
    q[i+16] =
-    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
-    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
-    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
-    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3],  4) ^ SPH_ROTL64(q[i+3], 37)) +
-    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
-    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
-    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
-    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7],  4) ^ SPH_ROTL64(q[i+7], 37)) +
-    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
-    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
-    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
-    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11],  4) ^ SPH_ROTL64(q[i+11], 37)) +
-    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
-    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
-    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
-    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15],  4) ^ SPH_ROTL64(q[i+15], 37)) +
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+      (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+      (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+      (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+      (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+      (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+      (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+      (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+      (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+      (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+      (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+      (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+      (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+      (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+      (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+      (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+      (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }

-  #pragma unroll 4
+#pragma unroll 4
  for(int i=2;i<6;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }
  
-  #pragma unroll 3
+#pragma unroll 3
  for(int i=6;i<9;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
  }
  
-  #pragma unroll 4
+#pragma unroll 4
  for(int i=9;i<13;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }
  
-  #pragma unroll 3
+#pragma unroll 3
  for(int i=13;i<16;i++) 
  {
    q[i+16] = CONST_EXP2 + 
-    ((    ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
-       SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+      (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+      SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }

  XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
-  XH64 =  XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+  XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+  
  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
@@ -454,94 +460,67 @@ __kernel void search2(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  #if !SPH_SMALL_FOOTPRINT_GROESTL
-    __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
-    __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
-  #else
-    __local sph_u64 T0_C[256], T4_C[256];
-  #endif
+  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];

  int init = get_local_id(0);
  int step = get_local_size(0);

  for (int i = init; i < 256; i += step)
  {
-    T0_C[i] = T0[i];
-    T4_C[i] = T4[i];
-    #if !SPH_SMALL_FOOTPRINT_GROESTL
-      T1_C[i] = T1[i];
-      T2_C[i] = T2[i];
-      T3_C[i] = T3[i];
-      T5_C[i] = T5[i];
-      T6_C[i] = T6[i];
-      T7_C[i] = T7[i];
-    #endif
+    T0_L[i] = T0[i];
+    T4_L[i] = T4[i];
+    T1_L[i] = T1[i];
+    T2_L[i] = T2[i];
+    T3_L[i] = T3[i];
+    T5_L[i] = T5[i];
+    T6_L[i] = T6[i];
+    T7_L[i] = T7[i];
  }
 
-  barrier(CLK_LOCAL_MEM_FENCE); // groestl
+  barrier(CLK_LOCAL_MEM_FENCE);

-  #define T0 T0_C
-  #define T1 T1_C
-  #define T2 T2_C
-  #define T3 T3_C
-  #define T4 T4_C
-  #define T5 T5_C
-  #define T6 T6_C
-  #define T7 T7_C
+  #define T0 T0_L
+  #define T1 T1_L
+  #define T2 T2_L
+  #define T3 T3_L
+  #define T4 T4_L
+  #define T5 T5_L
+  #define T6 T6_L
+  #define T7 T7_L
 
  // groestl
-
-  sph_u64 H[16];
-  
-  for (unsigned int u = 0; u < 15; u ++)
-    H[u] = 0;
-      
-  #if USE_LE
-    H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
-  #else
-    H[15] = (sph_u64)512;
-  #endif
+  sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};

  sph_u64 g[16], m[16];
-  m[0] = DEC64E(hash->h8[0]);
-  m[1] = DEC64E(hash->h8[1]);
-  m[2] = DEC64E(hash->h8[2]);
-  m[3] = DEC64E(hash->h8[3]);
-  m[4] = DEC64E(hash->h8[4]);
-  m[5] = DEC64E(hash->h8[5]);
-  m[6] = DEC64E(hash->h8[6]);
-  m[7] = DEC64E(hash->h8[7]);
-  
-  for (unsigned int u = 0; u < 16; u ++)
-    g[u] = m[u] ^ H[u];
-    
-  m[8] = 0x80; g[8] = m[8] ^ H[8];
-  m[9] = 0; g[9] = m[9] ^ H[9];
-  m[10] = 0; g[10] = m[10] ^ H[10];
-  m[11] = 0; g[11] = m[11] ^ H[11];
-  m[12] = 0; g[12] = m[12] ^ H[12];
-  m[13] = 0; g[13] = m[13] ^ H[13];
-  m[14] = 0; g[14] = m[14] ^ H[14];
-  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
+  g[0] = m[0] = DEC64E(hash->h8[0]);
+  g[1] = m[1] = DEC64E(hash->h8[1]);
+  g[2] = m[2] = DEC64E(hash->h8[2]);
+  g[3] = m[3] = DEC64E(hash->h8[3]);
+  g[4] = m[4] = DEC64E(hash->h8[4]);
+  g[5] = m[5] = DEC64E(hash->h8[5]);
+  g[6] = m[6] = DEC64E(hash->h8[6]);
+  g[7] = m[7] = DEC64E(hash->h8[7]);
+  g[8] = m[8] = 0x80;
+  g[9] = m[9] = 0;
+  g[10] = m[10] = 0;
+  g[11] = m[11] = 0;
+  g[12] = m[12] = 0;
+  g[13] = m[13] = 0;
+  g[14] = m[14] = 0;
+  g[15] = 0x102000000000000;
+  m[15] = 0x100000000000000;
  
  PERM_BIG_P(g);
  PERM_BIG_Q(m);
  
-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= g[u] ^ m[u];
-    
  sph_u64 xH[16];
-  
  for (unsigned int u = 0; u < 16; u ++)
-    xH[u] = H[u];
+    xH[u] = H[u] ^= g[u] ^ m[u];
      
  PERM_BIG_P(xH);
  
-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= xH[u];
-    
-  for (unsigned int u = 0; u < 8; u ++)
-    hash->h8[u] = DEC64E(H[u + 8]);
+  for (unsigned int u = 8; u < 16; u ++)
+    hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);

  barrier(CLK_GLOBAL_MEM_FENCE); 
 }
@@ -566,10 +545,14 @@ __kernel void search3(__global hash_t* hashes)
  m5 = SWAP8(hash->h8[5]);
  m6 = SWAP8(hash->h8[6]);
  m7 = SWAP8(hash->h8[7]);
+  
  UBI_BIG(480, 64);
+  
  bcount = 0;
  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
+  
  UBI_BIG(510, 8);
+  
  hash->h8[0] = SWAP8(h0);
  hash->h8[1] = SWAP8(h1);
  hash->h8[2] = SWAP8(h2);
@@ -588,7 +571,7 @@ __kernel void search4(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

- // jh
+  // jh

  sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7);
  sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b);
@@ -669,6 +652,7 @@ __kernel void search5(__global hash_t* hashes)
  a21 ^= SWAP8(hash->h8[7]);
  a31 ^= 0x8000000000000001;
  KECCAK_F_1600;
+  
  // Finalize the "lane complement"
  a10 = ~a10;
  a20 = ~a20;
@@ -830,6 +814,7 @@ __kernel void search8(__global hash_t* hashes)
 {
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+  
  __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
  
  int init = get_local_id(0);
@@ -858,7 +843,7 @@ __kernel void search8(__global hash_t* hashes)
  sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
  sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;

-  sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
+  sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;

  rk00 = hash->h4[0];
  rk01 = hash->h4[1];
@@ -913,10 +898,8 @@ __kernel void search9(__global hash_t* hashes)
  // simd
  s32 q[256];
  unsigned char x[128];
-  
  for(unsigned int i = 0; i < 64; i++)
    x[i] = hash->h1[i];
-    
  for(unsigned int i = 64; i < 128; i++)
    x[i] = 0;

@@ -926,14 +909,15 @@ __kernel void search9(__global hash_t* hashes)
  u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);

  FFT256(0, 1, 0, ll1);
-  for (int i = 0; i < 256; i ++) {
-      s32 tq;
-
-      tq = q[i] + yoff_b_n[i];
-      tq = REDS2(tq);
-      tq = REDS1(tq);
-      tq = REDS1(tq);
-      q[i] = (tq <= 128 ? tq : tq - 257);
+  for (int i = 0; i < 256; i ++) 
+  {
+    s32 tq;
+
+    tq = q[i] + yoff_b_n[i];
+    tq = REDS2(tq);
+    tq = REDS1(tq);
+    tq = REDS1(tq);
+    q[i] = (tq <= 128 ? tq : tq - 257);
  }

  A0 ^= hash->h4[0];
@@ -959,21 +943,24 @@ __kernel void search9(__global hash_t* hashes)
  ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);

  STEP_BIG(
-      C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
-      C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
-      IF,  4, 13, PP8_4_);
+    C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
+    C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
+    IF,  4, 13, PP8_4_);
+    
  STEP_BIG(
-      C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
-      C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
-      IF, 13, 10, PP8_5_);
+    C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
+    C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
+    IF, 13, 10, PP8_5_);
+    
  STEP_BIG(
-      C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
-      C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
-      IF, 10, 25, PP8_6_);
+    C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
+    C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
+    IF, 10, 25, PP8_6_);
+    
  STEP_BIG(
-      C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
-      C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
-      IF, 25,  4, PP8_0_);
+    C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
+    C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
+    IF, 25,  4, PP8_0_);

  u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7;
  u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7;
@@ -988,22 +975,27 @@ __kernel void search9(__global hash_t* hashes)
  ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
  ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
  ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
+  
  STEP_BIG(
-      COPY_A0, COPY_A1, COPY_A2, COPY_A3,
-      COPY_A4, COPY_A5, COPY_A6, COPY_A7,
-      IF,  4, 13, PP8_4_);
+    COPY_A0, COPY_A1, COPY_A2, COPY_A3,
+    COPY_A4, COPY_A5, COPY_A6, COPY_A7,
+    IF,  4, 13, PP8_4_);
+    
  STEP_BIG(
-      COPY_B0, COPY_B1, COPY_B2, COPY_B3,
-      COPY_B4, COPY_B5, COPY_B6, COPY_B7,
-      IF, 13, 10, PP8_5_);
+    COPY_B0, COPY_B1, COPY_B2, COPY_B3,
+    COPY_B4, COPY_B5, COPY_B6, COPY_B7,
+    IF, 13, 10, PP8_5_);
+    
  STEP_BIG(
-      COPY_C0, COPY_C1, COPY_C2, COPY_C3,
-      COPY_C4, COPY_C5, COPY_C6, COPY_C7,
-      IF, 10, 25, PP8_6_);
+    COPY_C0, COPY_C1, COPY_C2, COPY_C3,
+    COPY_C4, COPY_C5, COPY_C6, COPY_C7,
+    IF, 10, 25, PP8_6_);
+    
  STEP_BIG(
-      COPY_D0, COPY_D1, COPY_D2, COPY_D3,
-      COPY_D4, COPY_D5, COPY_D6, COPY_D7,
-      IF, 25,  4, PP8_0_);
+    COPY_D0, COPY_D1, COPY_D2, COPY_D3,
+    COPY_D4, COPY_D5, COPY_D6, COPY_D7,
+    IF, 25,  4, PP8_0_);
+    
  #undef q

  hash->h4[0] = A0;
@@ -1114,49 +1106,52 @@ __kernel void search10(__global hash_t* hashes)
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search11(__global hash_t* hashes)
 {
-    uint gid = get_global_id(0);
-    __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+  uint gid = get_global_id(0);
+  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+  __local sph_u32 T512_L[1024];
+  __constant const sph_u32 *T512_C = &T512[0][0];
  
-    __local sph_u32 T512_L[1024];
-	__constant const sph_u32 *T512_C = &T512[0][0];
-    int init = get_local_id(0);
-    int step = get_local_size(0);
-    for (int i = init; i < 1024; i += step)
-    {
-		T512_L[i] = T512_C[i];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
+  int init = get_local_id(0);
+  int step = get_local_size(0);
+  for (int i = init; i < 1024; i += step)
+    T512_L[i] = T512_C[i];

-    {
-        sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
-        sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
-        sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11];
-        sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15];
-        sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
-        sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
-        sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
-
-#define buf(u) hash->h1[i + u]
-        for(int i = 0; i < 64; i += 8) {
-            INPUT_BIG_LOCAL;
-            P_BIG;
-            T_BIG;
-        }
-#undef buf
-#define buf(u) (u == 0 ? 0x80 : 0)
-        INPUT_BIG_LOCAL;
-        P_BIG;
-        T_BIG;
-#undef buf
-#define buf(u) (u == 6 ? 2 : 0)
-        INPUT_BIG_LOCAL;
-        PF_BIG;
-        T_BIG;
-
-        for (unsigned u = 0; u < 16; u ++)
-            hash->h4[u] = h[u];
-    }
-    barrier(CLK_GLOBAL_MEM_FENCE); 
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
+  sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
+  sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11];
+  sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15];
+  sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
+  sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
+  sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
+
+  #define buf(u) hash->h1[i + u]
+  
+  for(int i = 0; i < 64; i += 8) {
+      INPUT_BIG_LOCAL;
+      P_BIG;
+      T_BIG;
+  }
+  
+  #undef buf
+  #define buf(u) (u == 0 ? 0x80 : 0)
+  
+  INPUT_BIG_LOCAL;
+  P_BIG;
+  T_BIG;
+  
+  #undef buf
+  #define buf(u) (u == 6 ? 2 : 0)
+  
+  INPUT_BIG_LOCAL;
+  PF_BIG;
+  T_BIG;
+
+  for (unsigned u = 0; u < 16; u ++)
+      hash->h4[u] = h[u];
+      
+  barrier(CLK_GLOBAL_MEM_FENCE); 
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@@ -1346,115 +1341,115 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo
  uint offset = get_global_offset(0);
  __global hash_t *hash = &(hashes[gid-offset]);

-    __local sph_u64 LT0[256], LT1[256], LT2[256], LT3[256], LT4[256], LT5[256], LT6[256], LT7[256];
+  __local sph_u64 LT0[256], LT1[256], LT2[256], LT3[256], LT4[256], LT5[256], LT6[256], LT7[256];

-    int init = get_local_id(0);
-    int step = get_local_size(0);
+  int init = get_local_id(0);
+  int step = get_local_size(0);

-    for (int i = init; i < 256; i += step)
-    {
-        LT0[i] = plain_T0[i];
-        LT1[i] = plain_T1[i];
-        LT2[i] = plain_T2[i];
-        LT3[i] = plain_T3[i];
-        LT4[i] = plain_T4[i];
-        LT5[i] = plain_T5[i];
-        LT6[i] = plain_T6[i];
-        LT7[i] = plain_T7[i];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
+  for (int i = init; i < 256; i += step)
+  {
+    LT0[i] = plain_T0[i];
+    LT1[i] = plain_T1[i];
+    LT2[i] = plain_T2[i];
+    LT3[i] = plain_T3[i];
+    LT4[i] = plain_T4[i];
+    LT5[i] = plain_T5[i];
+    LT6[i] = plain_T6[i];
+    LT7[i] = plain_T7[i];
+  }
  
+  barrier(CLK_LOCAL_MEM_FENCE);

-  // whirlpool
-    sph_u64 n0, n1, n2, n3, n4, n5, n6, n7; 
-    sph_u64 h0, h1, h2, h3, h4, h5, h6, h7;
-    sph_u64 state[8];
-
-    n0 = (hash->h8[0]);
-    n1 = (hash->h8[1]);
-    n2 = (hash->h8[2]);
-    n3 = (hash->h8[3]);
-    n4 = (hash->h8[4]);
-    n5 = (hash->h8[5]);
-    n6 = (hash->h8[6]);
-    n7 = (hash->h8[7]);
-
-    h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0;
-
-    n0 ^= h0;
-    n1 ^= h1;
-    n2 ^= h2;
-    n3 ^= h3;
-    n4 ^= h4;
-    n5 ^= h5;
-    n6 ^= h6;
-    n7 ^= h7;
-
-    for (unsigned r = 0; r < 10; r ++) {
-	sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-	ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
-	TRANSFER(h, tmp);
-	ROUND_WENC(plain_T, n, h, tmp);
-	TRANSFER(n, tmp);
-    }
  
-    state[0] = n0 ^ (hash->h8[0]);
-    state[1] = n1 ^ (hash->h8[1]);
-    state[2] = n2 ^ (hash->h8[2]);
-    state[3] = n3 ^ (hash->h8[3]);
-    state[4] = n4 ^ (hash->h8[4]);
-    state[5] = n5 ^ (hash->h8[5]);
-    state[6] = n6 ^ (hash->h8[6]);
-    state[7] = n7 ^ (hash->h8[7]);
-
-    n0 = 0x80;
-    n1 = n2 = n3 = n4 = n5 = n6 = 0;
-    n7 = 0x2000000000000;
-
-    h0 = state[0];
-    h1 = state[1];
-    h2 = state[2];
-    h3 = state[3];
-    h4 = state[4];
-    h5 = state[5];
-    h6 = state[6];
-    h7 = state[7];
-
-    n0 ^= h0;
-    n1 ^= h1;
-    n2 ^= h2;
-    n3 ^= h3;
-    n4 ^= h4;
-    n5 ^= h5;
-    n6 ^= h6;
-    n7 ^= h7;
-
-    for (unsigned r = 0; r < 10; r ++) {
-	sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-	ROUND_KSCHED(LT, h, tmp, plain_RC[r]);
-	TRANSFER(h, tmp);
-	ROUND_WENC(plain_T, n, h, tmp);
-	TRANSFER(n, tmp);
-    }
+  // whirlpool
+  sph_u64 n0, n1, n2, n3, n4, n5, n6, n7; 
+  sph_u64 h0, h1, h2, h3, h4, h5, h6, h7;
+  sph_u64 state[8];
+
+  n0 = (hash->h8[0]);
+  n1 = (hash->h8[1]);
+  n2 = (hash->h8[2]);
+  n3 = (hash->h8[3]);
+  n4 = (hash->h8[4]);
+  n5 = (hash->h8[5]);
+  n6 = (hash->h8[6]);
+  n7 = (hash->h8[7]);
+
+  h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0;
+
+  n0 ^= h0;
+  n1 ^= h1;
+  n2 ^= h2;
+  n3 ^= h3;
+  n4 ^= h4;
+  n5 ^= h5;
+  n6 ^= h6;
+  n7 ^= h7;
+
+  #pragma unroll 10
+  for (unsigned r = 0; r < 10; r ++) 
+  {
+    sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

-    state[0] ^= n0 ^ 0x80;
-    state[1] ^= n1;
-    state[2] ^= n2;
-    state[3] ^= n3;
-    state[4] ^= n4;
-    state[5] ^= n5;
-    state[6] ^= n6;
-    state[7] ^= n7 ^ 0x2000000000000;
+    ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
+    TRANSFER(h, tmp);
+    ROUND_WENC(plain_T, n, h, tmp);
+    TRANSFER(n, tmp);
+  }

-    for (unsigned i = 0; i < 8; i ++)
-	hash->h8[i] = state[i];
+  state[0] = n0 ^ (hash->h8[0]);
+  state[1] = n1 ^ (hash->h8[1]);
+  state[2] = n2 ^ (hash->h8[2]);
+  state[3] = n3 ^ (hash->h8[3]);
+  state[4] = n4 ^ (hash->h8[4]);
+  state[5] = n5 ^ (hash->h8[5]);
+  state[6] = n6 ^ (hash->h8[6]);
+  state[7] = n7 ^ (hash->h8[7]);
+
+  n0 = 0x80;
+  n1 = n2 = n3 = n4 = n5 = n6 = 0;
+  n7 = 0x2000000000000;
+
+  h0 = state[0];
+  h1 = state[1];
+  h2 = state[2];
+  h3 = state[3];
+  h4 = state[4];
+  h5 = state[5];
+  h6 = state[6];
+  h7 = state[7];
+
+  n0 ^= h0;
+  n1 ^= h1;
+  n2 ^= h2;
+  n3 ^= h3;
+  n4 ^= h4;
+  n5 ^= h5;
+  n6 ^= h6;
+  n7 ^= h7;
+
+  #pragma unroll 10
+  for (unsigned r = 0; r < 10; r ++) 
+  {
+    sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

-//    for(uint i = 0; i < 8; i++)
-//        output[(NUMHASH * 9) * 15 + gid * 9 + i] = hash->h8[i];
+    ROUND_KSCHED(LT, h, tmp, plain_RC[r]);
+    TRANSFER(h, tmp);
+    ROUND_WENC(plain_T, n, h, tmp);
+    TRANSFER(n, tmp);
+  }

-//    output[(NUMHASH * 9) * 15 + gid * 9 + 8] = nonce;
+  state[0] ^= n0 ^ 0x80;
+  state[1] ^= n1;
+  state[2] ^= n2;
+  state[3] ^= n3;
+  state[4] ^= n4;
+  state[5] ^= n5;
+  state[6] ^= n6;
+  state[7] ^= n7 ^ 0x2000000000000;
+
+  for (unsigned i = 0; i < 8; i ++)
+    hash->h8[i] = state[i];

  bool result = (hash->h8[3] <= target);
  if (result)
--- a/kernel/bitblockold.cl
+++ b/kernel/bitblockold.cl
--- a/kernel/darkcoin-mod.cl
+++ b/kernel/darkcoin-mod.cl
@@ -95,8 +95,6 @@
 #include "shavite.cl"
 #include "simd.cl"
 #include "echo.cl"
-#include "hamsi.cl"
-#include "fugue.cl"

 #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
 #define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
@@ -181,7 +179,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search1(__global hash_t* hashes)
 {
-  uint gid = get_global_id(0);
+ uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

  // bmw
@@ -456,91 +454,67 @@ __kernel void search2(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  #if !SPH_SMALL_FOOTPRINT_GROESTL
-    __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
-    __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
-  #else
-    __local sph_u64 T0_C[256], T4_C[256];
-  #endif
+  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];

  int init = get_local_id(0);
  int step = get_local_size(0);

  for (int i = init; i < 256; i += step)
  {
-    T0_C[i] = T0[i];
-    T4_C[i] = T4[i];
-    #if !SPH_SMALL_FOOTPRINT_GROESTL
-      T1_C[i] = T1[i];
-      T2_C[i] = T2[i];
-      T3_C[i] = T3[i];
-      T5_C[i] = T5[i];
-      T6_C[i] = T6[i];
-      T7_C[i] = T7[i];
-    #endif
+    T0_L[i] = T0[i];
+    T4_L[i] = T4[i];
+    T1_L[i] = T1[i];
+    T2_L[i] = T2[i];
+    T3_L[i] = T3[i];
+    T5_L[i] = T5[i];
+    T6_L[i] = T6[i];
+    T7_L[i] = T7[i];
  }
 
-  barrier(CLK_LOCAL_MEM_FENCE);    // groestl
-
-  #define T0 T0_C
-  #define T1 T1_C
-  #define T2 T2_C
-  #define T3 T3_C
-  #define T4 T4_C
-  #define T5 T5_C
-  #define T6 T6_C
-  #define T7 T7_C
-
-  sph_u64 H[16];
+  barrier(CLK_LOCAL_MEM_FENCE);

-  for (unsigned int u = 0; u < 15; u ++)
-    H[u] = 0;
+  #define T0 T0_L
+  #define T1 T1_L
+  #define T2 T2_L
+  #define T3 T3_L
+  #define T4 T4_L
+  #define T5 T5_L
+  #define T6 T6_L
+  #define T7 T7_L
 
-  #if USE_LE
-    H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
-  #else
-    H[15] = (sph_u64)512;
-  #endif
+  // groestl
+  sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};

  sph_u64 g[16], m[16];
-  m[0] = DEC64E(hash->h8[0]);
-  m[1] = DEC64E(hash->h8[1]);
-  m[2] = DEC64E(hash->h8[2]);
-  m[3] = DEC64E(hash->h8[3]);
-  m[4] = DEC64E(hash->h8[4]);
-  m[5] = DEC64E(hash->h8[5]);
-  m[6] = DEC64E(hash->h8[6]);
-  m[7] = DEC64E(hash->h8[7]);
-  
-  for (unsigned int u = 0; u < 16; u ++)
-      g[u] = m[u] ^ H[u];
-      
-  m[8] = 0x80; g[8] = m[8] ^ H[8];
-  m[9] = 0; g[9] = m[9] ^ H[9];
-  m[10] = 0; g[10] = m[10] ^ H[10];
-  m[11] = 0; g[11] = m[11] ^ H[11];
-  m[12] = 0; g[12] = m[12] ^ H[12];
-  m[13] = 0; g[13] = m[13] ^ H[13];
-  m[14] = 0; g[14] = m[14] ^ H[14];
-  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
+  g[0] = m[0] = DEC64E(hash->h8[0]);
+  g[1] = m[1] = DEC64E(hash->h8[1]);
+  g[2] = m[2] = DEC64E(hash->h8[2]);
+  g[3] = m[3] = DEC64E(hash->h8[3]);
+  g[4] = m[4] = DEC64E(hash->h8[4]);
+  g[5] = m[5] = DEC64E(hash->h8[5]);
+  g[6] = m[6] = DEC64E(hash->h8[6]);
+  g[7] = m[7] = DEC64E(hash->h8[7]);
+  g[8] = m[8] = 0x80;
+  g[9] = m[9] = 0;
+  g[10] = m[10] = 0;
+  g[11] = m[11] = 0;
+  g[12] = m[12] = 0;
+  g[13] = m[13] = 0;
+  g[14] = m[14] = 0;
+  g[15] = 0x102000000000000;
+  m[15] = 0x100000000000000;
  
  PERM_BIG_P(g);
  PERM_BIG_Q(m);
  
-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= g[u] ^ m[u];
-    
  sph_u64 xH[16];
-  
  for (unsigned int u = 0; u < 16; u ++)
-    xH[u] = H[u];
-  PERM_BIG_P(xH);
+    xH[u] = H[u] ^= g[u] ^ m[u];
      
-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= xH[u];
+  PERM_BIG_P(xH);
  
-  for (unsigned int u = 0; u < 8; u ++)
-    hash->h8[u] = DEC64E(H[u + 8]);
+  for (unsigned int u = 8; u < 16; u ++)
+    hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);

  barrier(CLK_GLOBAL_MEM_FENCE); 
 }
@@ -863,7 +837,7 @@ __kernel void search8(__global hash_t* hashes)
  sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
  sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;

-  sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
+  sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;

  rk00 = hash->h4[0];
  rk01 = hash->h4[1];
--- a/kernel/marucoin-mod.cl
+++ b/kernel/marucoin-mod.cl
--- a/kernel/marucoin-modold.cl
+++ b/kernel/marucoin-modold.cl
--- a/kernel/x14.cl
+++ b/kernel/x14.cl
--- a/kernel/x14old.cl
+++ b/kernel/x14old.cl
--- a/miner.h
+++ b/miner.h
@@ -1033,6 +1033,7 @@ extern int swork_id;
 extern int opt_tcp_keepalive;
 extern bool opt_incognito;
 extern int opt_hamsi_expand_big;
+extern bool opt_hamsi_short;

 #if LOCK_TRACKING
 extern pthread_mutex_t lockstat_lock;
--- a/sgminer.c
+++ b/sgminer.c
@@ -192,6 +192,7 @@ int nDevs;
 int opt_dynamic_interval = 7;
 int opt_g_threads = -1;
 int opt_hamsi_expand_big = 4;
+bool opt_hamsi_short = false;
 bool opt_restart = true;

 struct list_head scan_devices;
@@ -1459,7 +1460,10 @@ struct opt_table opt_config_table[] = {
      "Set GPU lookup gap for scrypt mining, comma separated"),
  OPT_WITH_ARG("--hamsi-expand-big",
      set_int_1_to_10, opt_show_intval, &opt_hamsi_expand_big,
-      "Set SPH_HAMSI_EXPAND_BIG for X13 algorithms (1 or 4 are common)"),
+      "Set SPH_HAMSI_EXPAND_BIG for X13 derived algorithms (1 or 4 are common)"),
+  OPT_WITHOUT_ARG("--hamsi-short",
+      opt_set_bool, &opt_hamsi_short,
+      "Set SPH_HAMSI_SHORT for X13 derived algorithms (Can give better hashrate for some GPUs)"),
 #ifdef HAVE_CURSES
  OPT_WITHOUT_ARG("--incognito",
      opt_set_bool, &opt_incognito,
--- a/winbuild/sgminer.vcxproj
+++ b/winbuild/sgminer.vcxproj
@@ -263,6 +263,7 @@
    <ClCompile Include="..\algorithm\animecoin.c" />
    <ClCompile Include="..\algorithm\bitblock.c" />
    <ClCompile Include="..\algorithm\talkcoin.c" />
+    <ClCompile Include="..\algorithm\x14.c" />
    <ClCompile Include="..\api.c" />
    <ClCompile Include="..\ccan\opt\helpers.c" />
    <ClCompile Include="..\ccan\opt\opt.c" />
@@ -329,6 +330,7 @@
    <ClInclude Include="..\algorithm\animecoin.h" />
    <ClInclude Include="..\algorithm\bitblock.h" />
    <ClInclude Include="..\algorithm\talkcoin.h" />
+    <ClInclude Include="..\algorithm\x14.h" />
    <ClInclude Include="..\api.h" />
    <ClInclude Include="..\arg-nonnull.h" />
    <ClInclude Include="..\bench_block.h" />
--- a/winbuild/sgminer.vcxproj.filters
+++ b/winbuild/sgminer.vcxproj.filters
@@ -227,6 +227,9 @@
    <ClCompile Include="..\algorithm\bitblock.c">
      <Filter>Source Files\algorithm</Filter>
    </ClCompile>
+    <ClCompile Include="..\algorithm\x14.c">
+      <Filter>Source Files\algorithm</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\adl.h">
@@ -412,6 +415,9 @@
    <ClInclude Include="..\algorithm\bitblock.h">
      <Filter>Header Files\algorithm</Filter>
    </ClInclude>
+    <ClInclude Include="..\algorithm\x14.h">
+      <Filter>Header Files\algorithm</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="README.txt" />