Merged develop branch into master, plus updates

Moved the develop code to master. Going forward, all updates will be made on master unless the work is on a major feature. This update contains all previous develop code as well as a few new changes that had not been pushed yet:
* Added neoscrypt compatibility for xintensity/rawintensity
* Neoscrypt now uses the correct TC (thread concurrency) if it is not specified or is set to 0
* Reworked the application of pool settings on algorithm switch, which should resolve TC/intensity changes between algorithms such as X11 and neoscrypt
10 years ago · 4ba8a68043
39 changed files with 9736 additions and 1841 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -66,6 +66,7 @@ sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
				@@ -66,6 +66,7 @@ sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
 sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h
 sgminer_SOURCES += algorithm/x14.c algorithm/x14.h
 sgminer_SOURCES += algorithm/fresh.c algorithm/fresh.h
+sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h
 sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h

 bin_SCRIPTS	= $(top_srcdir)/kernel/*.cl
--- a/algorithm.c
+++ b/algorithm.c
@ -29,6 +29,7 @@
				@@ -29,6 +29,7 @@
 #include "algorithm/bitblock.h"
 #include "algorithm/x14.h"
 #include "algorithm/fresh.h"
+#include "algorithm/whirlcoin.h"
 #include "algorithm/neoscrypt.h"

 #include "compat.h"
@ -49,7 +50,9 @@ const char *algorithm_type_str[] = {
				@@ -49,7 +50,9 @@ const char *algorithm_type_str[] = {
  "Twecoin",
  "Fugue256",
  "NIST",
-  "Fresh"
+  "Fresh",
+  "Whirlcoin",
+  "Neoscrypt"
 };

 void sha256(const unsigned char *message, unsigned int len, unsigned char *digest)
@ -96,11 +99,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
				@@ -96,11 +99,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
 static void append_neoscrypt_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
 {
  char buf[255];
-  sprintf(buf, " -D MAX_GLOBAL_THREADS=%u", 
-    (unsigned int)cgpu->thread_concurrency);
+  sprintf(buf, " %s-D MAX_GLOBAL_THREADS=%lu ",
+         ((cgpu->lookup_gap > 0)?" -D LOOKUP_GAP=2 ":""), (unsigned long)cgpu->thread_concurrency);
  strcat(data->compiler_options, buf);
-  
-  sprintf(buf, "tc%u", (unsigned int)cgpu->thread_concurrency);
+
+  sprintf(buf, "%stc%lu", ((cgpu->lookup_gap > 0)?"lg":""), (unsigned long)cgpu->thread_concurrency);
  strcat(data->binary_filename, buf);
 }

@ -158,21 +161,20 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb
				@@ -158,21 +161,20 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb
  unsigned int num = 0;
  cl_uint le_target;
  cl_int status = 0;
-  
+
  /* This looks like a unnecessary double cast, but to make sure, that
   * the target's most significant entry is adressed as a 32-bit value
   * and not accidently by something else the double cast seems wise.
-   * The compiler will get rid of it anyway. 
-   */
+   * The compiler will get rid of it anyway. */
  le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]);
  memcpy(clState->cldata, blk->work->data, 80);
-  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
-  
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
  CL_SET_ARG(clState->CLbuffer0);
  CL_SET_ARG(clState->outputBuffer);
  CL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(le_target);
-  
+
  return status;
 }

@ -600,6 +602,34 @@ static cl_int queue_fresh_kernel(struct __clState *clState, struct _dev_blk_ctx
				@@ -600,6 +602,34 @@ static cl_int queue_fresh_kernel(struct __clState *clState, struct _dev_blk_ctx
  return status;
 }

+static cl_int queue_whirlcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel;
+  cl_ulong le_target;
+  cl_int status = 0;
+
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
+  //clbuffer, hashes
+  kernel = &clState->kernel;
+  CL_SET_ARG_N(0,clState->CLbuffer0);
+  CL_SET_ARG_N(1,clState->padbuffer8);
+
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_N(0,clState->padbuffer8);
+
+  CL_NEXTKERNEL_SET_ARG_N(0,clState->padbuffer8);
+
+  //hashes, output, target
+  CL_NEXTKERNEL_SET_ARG_N(0,clState->padbuffer8);
+  CL_SET_ARG_N(1,clState->outputBuffer);
+  CL_SET_ARG_N(2,le_target);
+
+  return status;
+}
+
 typedef struct _algorithm_settings_t {
  const char *name; /* Human-readable identifier */
  algorithm_type_t type; //common algorithm type
@ -624,23 +654,24 @@ typedef struct _algorithm_settings_t {
				@@ -624,23 +654,24 @@ typedef struct _algorithm_settings_t {
 static algorithm_settings_t algos[] = {
  // kernels starting from this will have difficulty calculated by using litecoin algorithm
 #define A_SCRYPT(a) \
-  { a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options}
+    { a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options}
  A_SCRYPT( "ckolivas" ),
  A_SCRYPT( "alexkarnew" ),
  A_SCRYPT( "alexkarnold" ),
  A_SCRYPT( "bufius" ),
  A_SCRYPT( "psw" ),
  A_SCRYPT( "zuikkis" ),
+  A_SCRYPT( "arebyp" ),
 #undef A_SCRYPT

 #define A_NEOSCRYPT(a) \
-  { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options}
+    { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options}
  A_NEOSCRYPT("neoscrypt"),
 #undef A_NEOSCRYPT

  // kernels starting from this will have difficulty calculated by using quarkcoin algorithm
 #define A_QUARK(a, b) \
-  { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options }
+    { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options}
  A_QUARK( "quarkcoin", quarkcoin_regenhash),
  A_QUARK( "qubitcoin", qubitcoin_regenhash),
  A_QUARK( "animecoin", animecoin_regenhash),
@ -649,40 +680,43 @@ static algorithm_settings_t algos[] = {
				@@ -649,40 +680,43 @@ static algorithm_settings_t algos[] = {

  // kernels starting from this will have difficulty calculated by using bitcoin algorithm
 #define A_DARK(a, b) \
-  { a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options }
+    { a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options}
  A_DARK( "darkcoin",           darkcoin_regenhash),
  A_DARK( "inkcoin",            inkcoin_regenhash),
  A_DARK( "myriadcoin-groestl", myriadcoin_groestl_regenhash),
 #undef A_DARK

  { "twecoin", ALGO_TWE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, twecoin_regenhash, queue_sph_kernel, sha256, NULL},
-  { "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL },
+  { "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL},

-  { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options },
+  { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options},

-  { "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options },
-  { "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options },
-  { "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options },
+  { "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options},
+  { "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options},
+  { "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options},

-  { "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options },
-  { "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options },
+  { "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options},
+  { "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options},

-  { "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_x13_compiler_options },
-  { "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options },
+  { "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_x13_compiler_options},
+  { "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options},

-  { "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options },
+  { "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4,  8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options},

-  { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL },
+  { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL},

  // kernels starting from this will have difficulty calculated by using fuguecoin algorithm
-#define A_FUGUE(a, b) \
-  { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL }
-  A_FUGUE( "fuguecoin",   fuguecoin_regenhash),
-  A_FUGUE( "groestlcoin", groestlcoin_regenhash),
-#undef A_FUGUE
+#define A_FUGUE(a, b, c) \
+    { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, c, NULL}
+  A_FUGUE("fuguecoin", fuguecoin_regenhash, sha256),
+  A_FUGUE("groestlcoin", groestlcoin_regenhash, sha256),
+  A_FUGUE("diamond", groestlcoin_regenhash, gen_hash),
+ #undef A_FUGUE
+
+  { "whirlcoin", ALGO_WHIRL, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 3, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, whirlcoin_regenhash, queue_whirlcoin_kernel, sha256, NULL},

  // Terminator (do not remove)
-  { NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL }
+  { NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL}
 };

 void copy_algorithm_settings(algorithm_t* dest, const char* algo)
@ -695,7 +729,7 @@ void copy_algorithm_settings(algorithm_t* dest, const char* algo)
				@@ -695,7 +729,7 @@ void copy_algorithm_settings(algorithm_t* dest, const char* algo)
    if (strcmp(src->name, algo) == 0)
    {
      strcpy(dest->name, src->name);
-	  dest->kernelfile = src->kernelfile;
+      dest->kernelfile = src->kernelfile;
      dest->type = src->type;

      dest->diff_multiplier1 = src->diff_multiplier1;
@ -751,6 +785,7 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
				@@ -751,6 +785,7 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
  ALGO_ALIAS("x15old", "bitblockold");
  ALGO_ALIAS("nist5", "talkcoin-mod");
  ALGO_ALIAS("keccak", "maxcoin");
+  ALGO_ALIAS("whirlpool", "whirlcoin");

  #undef ALGO_ALIAS
  #undef ALGO_ALIAS_NF
@ -760,7 +795,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
				@@ -760,7 +795,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa

 void set_algorithm(algorithm_t* algo, const char* newname_alias)
 {
-  const char* newname;
+  const char *newname;
+
  //load previous algorithm nfactor in case nfactor was applied before algorithm... or default to 10
  uint8_t old_nfactor = ((algo->nfactor)?algo->nfactor:0);
  //load previous kernel file name if was applied before algorithm...
@ -811,5 +847,6 @@ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor)
				@@ -811,5 +847,6 @@ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor)

 bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2)
 {
+  // return (strcmp(algo1->name, algo2->name) == 0) && (algo1->nfactor == algo2->nfactor);
  return (!safe_cmp(algo1->name, algo2->name) && !safe_cmp(algo1->kernelfile, algo2->kernelfile) && (algo1->nfactor == algo2->nfactor));
 }
--- a/algorithm.h
+++ b/algorithm.h
@ -24,6 +24,7 @@ typedef enum {
				@@ -24,6 +24,7 @@ typedef enum {
  ALGO_FUGUE,
  ALGO_NIST,
  ALGO_FRESH,
+  ALGO_WHIRL,
  ALGO_NEOSCRYPT
 } algorithm_type_t;

--- a/algorithm/neoscrypt.c
+++ b/algorithm/neoscrypt.c
@ -1177,9 +1177,7 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) {
				@@ -1177,9 +1177,7 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) {
        r = (1 << ((profile >> 5) & 0x7));
    }

-    uchar *stack;
-    stack =(uchar*)malloc((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align);
-
+    uchar stack[(N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align];
    /* X = r * 2 * SCRYPT_BLOCK_SIZE */
    X = (uint *) &stack[stack_align & ~(stack_align - 1)];
    /* Z is a copy of X for ChaCha */
@ -1287,7 +1285,6 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) {
				@@ -1287,7 +1285,6 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) {

    }

-    free(stack);
 }

 void neoscrypt_regenhash(struct work *work)
--- a/algorithm/neoscrypt.h
+++ b/algorithm/neoscrypt.h
@ -1,11 +1,10 @@
				@@ -1,11 +1,10 @@
 #ifndef NEOSCRYPT_H
 #define NEOSCRYPT_H
-  
+
 #include "miner.h"

 /* The neoscrypt scratch buffer needs 32kBytes memory. */
 #define NEOSCRYPT_SCRATCHBUF_SIZE (32 * 1024)
-
 /* These routines are always available. */
 extern void neoscrypt_regenhash(struct work *work);
 extern void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile);
--- a/algorithm/whirlcoin.c
+++ b/algorithm/whirlcoin.c
@ -0,0 +1,171 @@
				@@ -0,0 +1,171 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "sph/sph_whirlpool.h"
+
+/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
+typedef struct {
+  sph_whirlpool1_context whirlpool1;
+  sph_whirlpool1_context whirlpool2;
+  sph_whirlpool1_context whirlpool3;
+  sph_whirlpool1_context whirlpool4;
+} Whash_context_holder;
+
+Whash_context_holder base_contexts;
+
+
+void init_whirlcoin_hash_contexts()
+{
+	sph_whirlpool1_init(&base_contexts.whirlpool1);
+	sph_whirlpool1_init(&base_contexts.whirlpool2);
+	sph_whirlpool1_init(&base_contexts.whirlpool3);
+	sph_whirlpool1_init(&base_contexts.whirlpool4);
+}
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form.  Assumes len is a multiple of 4.
+ */
+static inline void
+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+	uint32_t i;
+
+	for (i = 0; i < len; i++)
+		dst[i] = htobe32(src[i]);
+}
+
+
+inline void whirlcoin_hash(void *state, const void *input)
+{
+    init_whirlcoin_hash_contexts();
+
+    Whash_context_holder ctx;
+    uint32_t hashA[16], hashB[16];
+
+    memcpy(&ctx, &base_contexts, sizeof(base_contexts));
+
+    sph_whirlpool1 (&ctx.whirlpool1, input, 80);
+    sph_whirlpool1_close (&ctx.whirlpool1, hashA);
+
+	sph_whirlpool1(&ctx.whirlpool2, hashA, 64);
+    sph_whirlpool1_close(&ctx.whirlpool2, hashB);
+
+    sph_whirlpool1(&ctx.whirlpool3, hashB, 64);
+    sph_whirlpool1_close(&ctx.whirlpool3, hashA);
+
+	sph_whirlpool1(&ctx.whirlpool4, hashA, 64);
+    sph_whirlpool1_close(&ctx.whirlpool4, hashB);
+
+    memcpy(state, hashB, 32);
+}
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+
+/* Used externally as confirmation of correct OCL code */
+int whirlcoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+	uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+	uint32_t data[20], ohash[8];
+
+	be32enc_vect(data, (const uint32_t *)pdata, 19);
+	data[19] = htobe32(nonce);
+
+	whirlcoin_hash(ohash, data);
+	tmp_hash7 = be32toh(ohash[7]);
+
+	applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+				(long unsigned int)Htarg,
+				(long unsigned int)diff1targ,
+				(long unsigned int)tmp_hash7);
+	if (tmp_hash7 > diff1targ)
+		return -1;
+	if (tmp_hash7 > Htarg)
+		return 0;
+	return 1;
+}
+
+void whirlcoin_regenhash(struct work *work)
+{
+    uint32_t data[20];
+    uint32_t *nonce = (uint32_t *)(work->data + 76);
+    uint32_t *ohash = (uint32_t *)(work->hash);
+
+    be32enc_vect(data, (const uint32_t *)work->data, 19);
+    data[19] = htobe32(*nonce);
+    whirlcoin_hash(ohash, data);
+}
+
+bool scanhash_whirlcoin(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+		     unsigned char *pdata, unsigned char __maybe_unused *phash1,
+		     unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+		     uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+	uint32_t *nonce = (uint32_t *)(pdata + 76);
+	uint32_t data[20];
+	uint32_t tmp_hash7;
+	uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+	bool ret = false;
+
+	be32enc_vect(data, (const uint32_t *)pdata, 19);
+
+	while(1) {
+		uint32_t ostate[8];
+
+		*nonce = ++n;
+		data[19] = (n);
+		whirlcoin_hash(ostate, data);
+		tmp_hash7 = (ostate[7]);
+
+		applog(LOG_INFO, "data7 %08lx",
+					(long unsigned int)data[7]);
+
+		if (unlikely(tmp_hash7 <= Htarg)) {
+			((uint32_t *)pdata)[19] = htobe32(n);
+			*last_nonce = n;
+			ret = true;
+			break;
+		}
+
+		if (unlikely((n >= max_nonce) || thr->work_restart)) {
+			*last_nonce = n;
+			break;
+		}
+	}
+
+	return ret;
+}
--- a/algorithm/whirlcoin.h
+++ b/algorithm/whirlcoin.h
@ -0,0 +1,9 @@
				@@ -0,0 +1,9 @@
+#ifndef W_H
+#define W_H
+
+#include "miner.h"
+
+extern int whirlcoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
+extern void whirlcoin_regenhash(struct work *work);
+
+#endif /* W_H */
--- a/config_parser.c
+++ b/config_parser.c
@ -201,7 +201,7 @@ char *set_default_kernelfile(const char *arg)
				@@ -201,7 +201,7 @@ char *set_default_kernelfile(const char *arg)
 {
  applog(LOG_INFO, "Set default kernel file to %s", arg);
  default_profile.algorithm.kernelfile = arg;
-  
+
  return NULL;
 }

@ -324,10 +324,10 @@ char *set_profile_devices(const char *arg)
				@@ -324,10 +324,10 @@ char *set_profile_devices(const char *arg)
 char *set_profile_kernelfile(const char *arg)
 {
  struct profile *profile = get_current_profile();
-  
+
  applog(LOG_DEBUG, "Setting profile %s algorithm kernel file to %s", profile->name, arg);
  profile->algorithm.kernelfile = arg;
-  
+
  return NULL;
 }

@ -598,8 +598,10 @@ static struct opt_table *opt_find(struct opt_table *tbl, char *optname)
				@@ -598,8 +598,10 @@ static struct opt_table *opt_find(struct opt_table *tbl, char *optname)
    //set url
    curl_easy_setopt(curl, CURLOPT_URL, url);
    //set write callback and fileinfo
-    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fetch_remote_config_cb);
-    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file);
+    curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); // fail on 404 or other 4xx http codes
+    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30); // timeout after 30 secs to prevent being stuck
+    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file); // stream to write data to
+    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fetch_remote_config_cb);  // callback function to write to config file

    if((res = curl_easy_perform(curl)) != CURLE_OK)
      applog(LOG_ERR, "Fetch remote file failed: %s", curl_easy_strerror(res));
@ -715,17 +717,20 @@ char *parse_config(json_t *val, const char *key, const char *parentkey, bool fil
				@@ -715,17 +717,20 @@ char *parse_config(json_t *val, const char *key, const char *parentkey, bool fil
      if((opt = opt_find(opt_config_table, optname)) != NULL)
      {
        //strings
-        if ((opt->type & OPT_HASARG) && json_is_string(val))
+        if ((opt->type & OPT_HASARG) && json_is_string(val)) {
          err = opt->cb_arg(json_string_value(val), opt->u.arg);
+        }
        //boolean values
-        else if ((opt->type & OPT_NOARG) && json_is_true(val))
+        else if ((opt->type & OPT_NOARG) && json_is_true(val)) {
          err = opt->cb(opt->u.arg);
-        else
+        }
+        else {
          err = "Invalid value";
+        }
      }
-      else
+      else {
        err = "Invalid option";
-
+      }
      break;
  }

@ -756,37 +761,71 @@ char *load_config(const char *arg, const char *parentkey, void __maybe_unused *u
				@@ -756,37 +761,71 @@ char *load_config(const char *arg, const char *parentkey, void __maybe_unused *u
  json_t *config;

  #ifdef HAVE_LIBCURL
-    //if detected as url
-    if((strstr(arg, "http://") != NULL) || (strstr(arg, "https://") != NULL) || (strstr(arg, "ftp://") != NULL))
-    {
-      //download config file locally and reset arg to it so we can parse it
-      if((arg = fetch_remote_config(arg)) == NULL)
-        return NULL;
+    int retry = opt_remoteconf_retry;
+    const char *url;
+
+    // if detected as url
+    if ((strstr(arg, "http://") != NULL) || (strstr(arg, "https://") != NULL) || (strstr(arg, "ftp://") != NULL)) {
+      url = strdup(arg);
+
+      do {
+        // wait for next retry
+        if (retry < opt_remoteconf_retry) {
+          sleep(opt_remoteconf_wait);
+        }
+
+        // download config file locally and reset arg to it so we can parse it
+        if ((arg = fetch_remote_config(url)) != NULL) {
+          break;
+        }
+
+        --retry;
+      } while (retry);
+
+      // file not downloaded... abort
+      if (arg == NULL) {
+        // if we should use last downloaded copy...
+        if (opt_remoteconf_usecache) {
+          char *p;
+
+          // extract filename out of url
+          if ((p = (char *)strrchr(url, '/')) == NULL) {
+            quit(1, "%s: invalid URL.", url);
+          }
+
+          arg = p+1;
+        } else {
+          quit(1, "%s: unable to download config file.", url);
+        }
+      }
    }
  #endif

-  //most likely useless but leaving it here for now...
-  if(!cnfbuf)
+  // most likely useless but leaving it here for now...
+  if (!cnfbuf) {
    cnfbuf = strdup(arg);
+  }

-  //no need to restrict the number of includes... if it causes problems, restore it later
+  // no need to restrict the number of includes... if it causes problems, restore it later
  /*if(++include_count > JSON_MAX_DEPTH)
    return JSON_MAX_DEPTH_ERR;
  */

-  //check if the file exists
-  if(access(arg, F_OK) == -1)
+  // check if the file exists
+  if (access(arg, F_OK) == -1) {
    quit(1, "%s: file not found.", arg);
+  }

-#if JANSSON_MAJOR_VERSION > 1
-  config = json_load_file(arg, 0, &err);
-#else
-  config = json_load_file(arg, &err);
-#endif
+  #if JANSSON_MAJOR_VERSION > 1
+    config = json_load_file(arg, 0, &err);
+  #else
+    config = json_load_file(arg, &err);
+  #endif

-  //if json root is not an object, error out
-  if(!json_is_object(config))
+  // if json root is not an object, error out
+  if (!json_is_object(config)) {
    return set_last_json_error("Error: JSON decode of file \"%s\" failed:\n %s", arg, err.text);
+  }

  config_loaded = true;

@ -1006,13 +1045,12 @@ void apply_pool_profile(struct pool *pool)
				@@ -1006,13 +1045,12 @@ void apply_pool_profile(struct pool *pool)
  if (empty_string(pool->algorithm.kernelfile)) {
    // ...but profile does, apply it to the pool
    if (!empty_string(profile->algorithm.kernelfile)) {
-      pool->algorithm.kernelfile = profile->algorithm.kernelfile;
-      applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
-      // ...or default profile does, apply it to the pool
-    }
-    else if (!empty_string(default_profile.algorithm.kernelfile)) {
-      pool->algorithm.kernelfile = default_profile.algorithm.kernelfile;
-      applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
+        pool->algorithm.kernelfile = profile->algorithm.kernelfile;
+        applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
+    // ...or default profile does, apply it to the pool
+    } else if (!empty_string(default_profile.algorithm.kernelfile)) {
+        pool->algorithm.kernelfile = default_profile.algorithm.kernelfile;
+        applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
    }
  }

@ -1034,39 +1072,76 @@ void apply_pool_profile(struct pool *pool)
				@@ -1034,39 +1072,76 @@ void apply_pool_profile(struct pool *pool)
  }
  applog(LOG_DEBUG, "Pool %i lookup gap set to \"%s\"", pool->pool_no, pool->lookup_gap);

-  if(pool_cmp(pool->intensity, default_profile.intensity))
-  {
-    if(!empty_string(profile->intensity))
-        pool->intensity = profile->intensity;
-    else
-        pool->intensity = default_profile.intensity;
-  }
-  applog(LOG_DEBUG, "Pool %i Intensity set to \"%s\"", pool->pool_no, pool->intensity);
+  int int_type = 0;

-  if(pool_cmp(pool->xintensity, default_profile.xintensity))
-  {
-    if(!empty_string(profile->xintensity))
-        pool->xintensity = profile->xintensity;
-    else
+  // FIXME: ifs from hell...
+  // First look for an existing intensity on pool
+  if (!empty_string(pool->rawintensity)) {
+    int_type = 2;
+  }
+  else if (!empty_string(pool->xintensity)) {
+    int_type = 1;
+  }
+  else if (!empty_string(pool->intensity)) {
+    int_type = 0;
+  }
+  else {
+    //no intensity found on pool... check if the profile has one and use it...
+    if (!empty_string(profile->rawintensity)) {
+      int_type = 2;
+      pool->rawintensity = profile->rawintensity;
+    }
+    else if (!empty_string(profile->xintensity)) {
+      int_type = 1;
+      pool->xintensity = profile->xintensity;
+    }
+    else if (!empty_string(profile->intensity)) {
+      int_type = 0;
+      pool->intensity = profile->intensity;
+    }
+    else {
+      //nothing in profile... check default profile/globals
+      if (!empty_string(default_profile.rawintensity)) {
+        int_type = 2;
+        pool->rawintensity = default_profile.rawintensity;
+      }
+      else if (!empty_string(default_profile.xintensity)) {
+        int_type = 1;
        pool->xintensity = default_profile.xintensity;
+      }
+      else if (!empty_string(default_profile.intensity)) {
+        int_type = 0;
+        pool->intensity = default_profile.intensity;
+      }
+      else {
+        //nothing anywhere? default to sgminer default of 8
+        int_type = 0;
+        pool->intensity = strdup("8");
+      }
+    }
  }
-  applog(LOG_DEBUG, "Pool %i XIntensity set to \"%s\"", pool->pool_no, pool->xintensity);

-  if(pool_cmp(pool->rawintensity, default_profile.rawintensity))
-  {
-    if(!empty_string(profile->rawintensity))
-        pool->rawintensity = profile->rawintensity;
-    else
-        pool->rawintensity = default_profile.rawintensity;
+  switch(int_type) {
+    case 2:
+      applog(LOG_DEBUG, "Pool %d Raw Intensity set to \"%s\"", pool->pool_no, pool->rawintensity);
+      break;
+
+    case 1:
+      applog(LOG_DEBUG, "Pool %d XIntensity set to \"%s\"", pool->pool_no, pool->xintensity);
+      break;
+
+    default:
+      applog(LOG_DEBUG, "Pool %d Intensity set to \"%s\"", pool->pool_no, pool->intensity);
+      break;
  }
-  applog(LOG_DEBUG, "Pool %i Raw Intensity set to \"%s\"", pool->pool_no, pool->rawintensity);

  if(pool_cmp(pool->thread_concurrency, default_profile.thread_concurrency))
  {
-    if(!empty_string(profile->thread_concurrency))
-        pool->thread_concurrency = profile->thread_concurrency;
-    else
-        pool->thread_concurrency = default_profile.thread_concurrency;
+    /* allow empty string TC
+      if(!empty_string(profile->thread_concurrency))*/
+      pool->thread_concurrency = profile->thread_concurrency;
+/*    else
+        pool->thread_concurrency = default_profile.thread_concurrency;*/
  }
  applog(LOG_DEBUG, "Pool %i Thread Concurrency set to \"%s\"", pool->pool_no, pool->thread_concurrency);

@ -1303,7 +1378,7 @@ static json_t *build_pool_json()
				@@ -1303,7 +1378,7 @@ static json_t *build_pool_json()
    // devices
    if (!build_pool_json_add(obj, "device", pool->devices, profile->devices, default_profile.devices, pool->pool_no))
      return NULL;
-    
+
    // kernelfile
    if (!build_pool_json_add(obj, "kernelfile", pool->algorithm.kernelfile, profile->algorithm.kernelfile, default_profile.algorithm.kernelfile, pool->pool_no))
      return NULL;
--- a/configure.ac
+++ b/configure.ac
@ -1,8 +1,8 @@
				@@ -1,8 +1,8 @@
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 m4_define([v_maj], [5])
-m4_define([v_min], [0])
-m4_define([v_mic], [1])
+m4_define([v_min], [1])
+m4_define([v_mic], [0])
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 m4_define([v_ver], [v_maj.v_min.v_mic])
 m4_define([lt_rev], m4_eval(v_maj + v_min))
--- a/driver-opencl.c
+++ b/driver-opencl.c
@ -176,30 +176,63 @@ char *set_lookup_gap(char *arg)
				@@ -176,30 +176,63 @@ char *set_lookup_gap(char *arg)
  return NULL;
 }

-char *set_thread_concurrency(const char *_arg)
+/*
+ * Parse a comma-separated list of thread concurrency (TC) values and store
+ * them in gpus[].opt_tc, one value per GPU in list order.
+ *
+ * A NULL or empty argument stores 0 for GPU 0. When exactly one value was
+ * stored, it is copied to the remaining GPUs below total_devices. Values
+ * that would land at index MAX_GPUDEVICES or above are ignored.
+ *
+ * Returns NULL on success, or a static error message on failure.
+ */
+char *set_thread_concurrency(const char *arg)
 {
-  int i, val = 0, device = 0;
+  int i, device = 0;
+  size_t val = 0;
+  char *tmpstr = (arg != NULL) ? strdup(arg) : NULL;
  char *nextptr;
-  char *arg = (char *)alloca(strlen(_arg) + 1);
-  strcpy(arg, _arg);

-  nextptr = strtok(arg, ",");
-  if (nextptr == NULL)
-    return "Invalid parameters for set thread concurrency";
-  val = atoi(nextptr);
+  // strdup() failed: report it rather than treating the input as empty
+  if (arg != NULL && tmpstr == NULL)
+    return "Out of memory in set_thread_concurrency";
+
+  // empty string - use 0 and let algo autodetect the TC
+  if (tmpstr == NULL || empty_string(tmpstr)) {
+    applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", device, (unsigned long)val);
+    gpus[device++].opt_tc = val;
+  }
+  // not empty string
+  else {
+    if ((nextptr = strtok(tmpstr, ",")) == NULL) {
+      free(tmpstr);
+      return "Invalid parameters for set_thread_concurrency";
+    }
+
-  gpus[device++].opt_tc = val;
+    do {
+      // stop before indexing gpus[] at or beyond MAX_GPUDEVICES
+      if (device >= MAX_GPUDEVICES) {
+        applog(LOG_DEBUG, "Ignoring thread concurrency values beyond GPU %d.", device - 1);
+        break;
+      }
+
+      val = (unsigned long)atol(nextptr);

-  while ((nextptr = strtok(NULL, ",")) != NULL) {
-    val = atoi(nextptr);
-
-    gpus[device++].opt_tc = val;
+      // %lu expects unsigned long; val is size_t, so cast explicitly
+      applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", device, (unsigned long)val);
+      gpus[device++].opt_tc = val;
+    } while ((nextptr = strtok(NULL, ",")) != NULL);
  }
+
+  // if only 1 TC was passed, assign the same TC to all remaining GPUs
  if (device == 1) {
-    for (i = device; i < MAX_GPUDEVICES; i++)
+    for (i = device; i < total_devices && i < MAX_GPUDEVICES; ++i) {
      gpus[i].opt_tc = gpus[0].opt_tc;
+      applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", i, (unsigned long)gpus[i].opt_tc);
+    }
  }

+  free(tmpstr);
  return NULL;
 }

@ -1020,21 +1032,24 @@ static void set_threads_hashes(unsigned int vectors, unsigned int compute_shader
				@@ -1020,21 +1032,24 @@ static void set_threads_hashes(unsigned int vectors, unsigned int compute_shader
 {
  unsigned int threads = 0;
  while (threads < minthreads) {
+
    if (*rawintensity > 0) {
      threads = *rawintensity;
-    } else if (*xintensity > 0) {
-      if (algorithm->xintensity_shift)
-        threads = compute_shaders * (1 << (algorithm->xintensity_shift + *xintensity));
-      else
-        threads = compute_shaders * *xintensity;
-    } else {
+    }
+    else if (*xintensity > 0) {
+      threads = compute_shaders * ((algorithm->xintensity_shift)?(1 << (algorithm->xintensity_shift + *xintensity)):*xintensity);
+    }
+    else {
      threads = 1 << (algorithm->intensity_shift + *intensity);
    }
+
    if (threads < minthreads) {
-      if (likely(*intensity < MAX_INTENSITY))
+      if (likely(*intensity < MAX_INTENSITY)) {
        (*intensity)++;
-      else
+      }
+      else {
        threads = minthreads;
+      }
    }
  }

--- a/findnonce.c
+++ b/findnonce.c
@ -202,7 +202,8 @@ static void *postcalc_hash(void *userdata)
				@@ -202,7 +202,8 @@ static void *postcalc_hash(void *userdata)
    if (found == 0x0F)
        nonce = swab32(nonce);

-    applog(LOG_DEBUG, "OCL NONCE %u found in slot %d", nonce, entry);
+    // %lu requires an unsigned long argument, so widen nonce explicitly
+    applog(LOG_DEBUG, "[THR%d] OCL NONCE %08x (%lu) found in slot %d (found = %d)", thr->id, nonce, (unsigned long)nonce, entry, found);
    submit_nonce(thr, pcd->work, nonce);
  }

--- a/kernel/animecoin.cl
+++ b/kernel/animecoin.cl
@ -4,7 +4,7 @@
				@@ -4,7 +4,7 @@
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2014  phm
- * 
+ *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
@ -12,10 +12,10 @@
				@@ -12,10 +12,10 @@
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
- * 
+ *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@ -92,6 +92,22 @@ typedef long sph_s64;
				@@ -92,6 +92,22 @@ typedef long sph_s64;
 #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
 #define SWAP8(x) as_ulong(as_uchar8(x).s76543210)

+/* Shift helpers; arguments and results are fully parenthesized. */
+#define SHL(x, n) ((x) << (n))
+#define SHR(x, n) ((x) >> (n))
+
+/*
+ * Sum of the sixteen terms q[i+0] .. q[i+15]: even offsets 0-12 are added
+ * unchanged, odd offsets 1-13 are passed through SPH_ROTL64 first, and
+ * q[i+14], q[i+15] are each xored with a right-shifted copy of themselves.
+ * Relies on `q` and `i` being in scope where the macro is expanded. The
+ * whole expansion is parenthesized so it stays one operand wherever used.
+ */
+#define CONST_EXP2  (q[i+0] + SPH_ROTL64(q[i+1], 5)  + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
+  q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
+  q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
+  q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]))
+
 #if SPH_BIG_ENDIAN
  #define DEC64E(x) (x)
  #define DEC64BE(x) (*(const __global sph_u64 *) (x));
@ -118,8 +126,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -118,8 +126,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    for(unsigned u = 0; u < 16; u++)
      BMW_H[u] = BMW_IV512[u];

-    sph_u64 BMW_h1[16], BMW_h2[16];
-    sph_u64 mv[16];
+    sph_u64 mv[16],q[32];
+      sph_u64 tmp;

    mv[0] = DEC64LE(block +   0);
    mv[1] = DEC64LE(block +   8);
@ -139,34 +147,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -139,34 +147,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    mv[13] = 0;
    mv[14] = 0;
    mv[15] = 0x280;
-#define M(x)    (mv[x])
-#define H(x)    (BMW_H[x])
-#define dH(x)   (BMW_h2[x])

-    FOLDb;
+  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
+  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
+  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
+  q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
+  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
+  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
+  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
+  q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
+  tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
+  q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
+  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
+  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
+  q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
+  tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
+  q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
+  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
+  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
+  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
+
+#pragma unroll 2
+  for(int i=0;i<2;i++)
+  {
+  q[i+16] =
+    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 4
+  for(int i=2;i<6;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 3
+  for(int i=6;i<9;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 4
+  for(int i=9;i<13;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }
+
+#pragma unroll 3
+  for(int i=13;i<16;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }
+
+  sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
+  sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+
+  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
+  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
+  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
+  BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
+  BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
+  BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
+  BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
+  BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
+
+  BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
+  BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
+  BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
+  BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
+  BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
+  BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
+  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
+  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
+
+#pragma unroll 16
+  for(int i=0;i<16;i++)
+  {
+  mv[i] = BMW_H[i];
+  BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
+  }

-#undef M
-#undef H
-#undef dH
+  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
+  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
+  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
+  q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
+  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
+  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
+  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
+  q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
+  tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
+  q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
+  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
+  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
+  q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
+  tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
+  q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
+  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
+  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
+  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
+
+#pragma unroll 2
+  for(int i=0;i<2;i++)
+  {
+  q[i+16] =
+    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }

-#define M(x)    (BMW_h2[x])
-#define H(x)    (final_b[x])
-#define dH(x)   (BMW_h1[x])
+#pragma unroll 4
+  for(int i=2;i<6;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }

-    FOLDb;
+#pragma unroll 3
+  for(int i=6;i<9;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+  }

-#undef M
-#undef H
-#undef dH
+#pragma unroll 4
+  for(int i=9;i<13;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }

-    hash.h8[0] = SWAP8(BMW_h1[8]);
-    hash.h8[1] = SWAP8(BMW_h1[9]);
-    hash.h8[2] = SWAP8(BMW_h1[10]);
-    hash.h8[3] = SWAP8(BMW_h1[11]);
-    hash.h8[4] = SWAP8(BMW_h1[12]);
-    hash.h8[5] = SWAP8(BMW_h1[13]);
-    hash.h8[6] = SWAP8(BMW_h1[14]);
-    hash.h8[7] = SWAP8(BMW_h1[15]);
+#pragma unroll 3
+  for(int i=13;i<16;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }
+
+  XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
+  XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+
+  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
+  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
+  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
+  BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
+  BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
+  BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
+  BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
+  BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
+
+  BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
+  BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
+  BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
+  BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
+  BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
+  BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
+  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
+  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
+
+  hash.h8[0] = SWAP8(BMW_H[8]);
+  hash.h8[1] = SWAP8(BMW_H[9]);
+  hash.h8[2] = SWAP8(BMW_H[10]);
+  hash.h8[3] = SWAP8(BMW_H[11]);
+  hash.h8[4] = SWAP8(BMW_H[12]);
+  hash.h8[5] = SWAP8(BMW_H[13]);
+  hash.h8[6] = SWAP8(BMW_H[14]);
+  hash.h8[7] = SWAP8(BMW_H[15]);
  }

  // blake
@ -218,7 +434,6 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -218,7 +434,6 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
  bool dec = ((hash.h1[7] & 0x8) != 0);
  {
    // groestl
-
    sph_u64 H[16];
    for (unsigned int u = 0; u < 15; u ++)
      H[u] = 0;
@ -427,14 +642,13 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -427,14 +642,13 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    hash.h8[7] = (dec ? H7 : hash.h8[7]);
  }
  {
-
    // bmw
    sph_u64 BMW_H[16];
    for(unsigned u = 0; u < 16; u++)
      BMW_H[u] = BMW_IV512[u];

-    sph_u64 BMW_h1[16], BMW_h2[16];
-    sph_u64 mv[16];
+    sph_u64 mv[16],q[32];
+      sph_u64 tmp;

    mv[ 0] = SWAP8(hash.h8[0]);
    mv[ 1] = SWAP8(hash.h8[1]);
@ -452,34 +666,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -452,34 +666,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    mv[13] = 0;
    mv[14] = 0;
    mv[15] = 0x200;
-  #define M(x)    (mv[x])
-  #define H(x)    (BMW_H[x])
-  #define dH(x)   (BMW_h2[x])

-    FOLDb;
+  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
+  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
+  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
+  q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
+  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
+  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
+  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
+  q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
+  tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
+  q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
+  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
+  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
+  q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
+  tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
+  q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
+  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
+  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
+  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
+
+#pragma unroll 2
+  for(int i=0;i<2;i++)
+  {
+  q[i+16] =
+    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 4
+  for(int i=2;i<6;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }

-  #undef M
-  #undef H
-  #undef dH
+#pragma unroll 3
+  for(int i=6;i<9;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+  }

-  #define M(x)    (BMW_h2[x])
-  #define H(x)    (final_b[x])
-  #define dH(x)   (BMW_h1[x])
+#pragma unroll 4
+  for(int i=9;i<13;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }

-    FOLDb;
+#pragma unroll 3
+  for(int i=13;i<16;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }

-  #undef M
-  #undef H
-  #undef dH
+  sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
+  sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+
+  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
+  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
+  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
+  BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
+  BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
+  BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
+  BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
+  BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
+
+  BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
+  BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
+  BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
+  BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
+  BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
+  BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
+  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
+  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
+
+#pragma unroll 16
+  for(int i=0;i<16;i++)
+  {
+  mv[i] = BMW_H[i];
+  BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
+  }
+
+  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
+  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
+  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
+  q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
+  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
+  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
+  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
+  q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
+  tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
+  q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
+  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
+  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
+  q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
+  tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
+  q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
+  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
+  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
+  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
+
+#pragma unroll 2
+  for(int i=0;i<2;i++)
+  {
+  q[i+16] =
+    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 4
+  for(int i=2;i<6;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 3
+  for(int i=6;i<9;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 4
+  for(int i=9;i<13;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }
+
+#pragma unroll 3
+  for(int i=13;i<16;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }

-    hash.h8[0] = (!dec ? SWAP8(BMW_h1[8]) : hash.h8[0]);
-    hash.h8[1] = (!dec ? SWAP8(BMW_h1[9]) : hash.h8[1]);
-    hash.h8[2] = (!dec ? SWAP8(BMW_h1[10]) : hash.h8[2]);
-    hash.h8[3] = (!dec ? SWAP8(BMW_h1[11]) : hash.h8[3]);
-    hash.h8[4] = (!dec ? SWAP8(BMW_h1[12]) : hash.h8[4]);
-    hash.h8[5] = (!dec ? SWAP8(BMW_h1[13]) : hash.h8[5]);
-    hash.h8[6] = (!dec ? SWAP8(BMW_h1[14]) : hash.h8[6]);
-    hash.h8[7] = (!dec ? SWAP8(BMW_h1[15]) : hash.h8[7]);
+  XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
+  XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+
+  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
+  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
+  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
+  BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
+  BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
+  BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
+  BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
+  BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
+
+  BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
+  BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
+  BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
+  BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
+  BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
+  BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
+  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
+  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
+
+    hash.h8[0] = (!dec ? SWAP8(BMW_H[8]) : hash.h8[0]);
+    hash.h8[1] = (!dec ? SWAP8(BMW_H[9]) : hash.h8[1]);
+    hash.h8[2] = (!dec ? SWAP8(BMW_H[10]) : hash.h8[2]);
+    hash.h8[3] = (!dec ? SWAP8(BMW_H[11]) : hash.h8[3]);
+    hash.h8[4] = (!dec ? SWAP8(BMW_H[12]) : hash.h8[4]);
+    hash.h8[5] = (!dec ? SWAP8(BMW_H[13]) : hash.h8[5]);
+    hash.h8[6] = (!dec ? SWAP8(BMW_H[14]) : hash.h8[6]);
+    hash.h8[7] = (!dec ? SWAP8(BMW_H[15]) : hash.h8[7]);

  }

@ -642,4 +1064,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -642,4 +1064,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    output[output[0xFF]++] = SWAP4(gid);
 }

-#endif // ANIMECOIN_CL
+#endif // ANIMECOIN_CL
--- a/kernel/arebyp.cl
+++ b/kernel/arebyp.cl
@ -0,0 +1,993 @@
				@@ -0,0 +1,993 @@
+  /*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
+ * 2012-2013 Con Kolivas, 2013 Alexey Karimov.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+ /* N (nfactor), CPU/Memory cost parameter: N[i] == 2^i for i = 0..20, read as N[NFACTOR] */
+__constant uint N[] = {
+	0x00000001U,  /* never used, padding */
+	0x00000002U,
+	0x00000004U,
+	0x00000008U,
+	0x00000010U,
+	0x00000020U,
+	0x00000040U,
+	0x00000080U,
+	0x00000100U,
+	0x00000200U,
+	0x00000400U,  /* 2^10 == 1024, Litecoin scrypt default */
+	0x00000800U,
+	0x00001000U,
+	0x00002000U,
+	0x00004000U,
+	0x00008000U,
+	0x00010000U,
+	0x00020000U,
+	0x00040000U,
+	0x00080000U,
+	0x00100000U
+};
+
+/* Backwards compatibility: if the build does not define NFACTOR, default to 10 (N[10] == 1024) for scrypt */
+#ifndef NFACTOR
+#define NFACTOR 10
+#endif
+
+
+__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 }; /* byte-lane masks consumed by EndianSwap() */
+__constant uint K[] = { /* Constant pool, read by literal index in SHA256() and SHA256_fresh().
+ * Rounds 1-15 add K[0]..K[14], the 16th round adds K[76], rounds 17-64 add K[15]..K[62].
+ * The remaining entries in K[63]..K[81] are read only by SHA256_fresh() in the visible code.
+ * K[82]..K[88] are not referenced in this part of the file; presumably HMAC pad bytes and
+ * padding/length words for the callers further down - confirm there. */
+	0x428a2f98U,
+	0x71374491U,
+	0xb5c0fbcfU,
+	0xe9b5dba5U,
+	0x3956c25bU,
+	0x59f111f1U,
+	0x923f82a4U,
+	0xab1c5ed5U,
+	0xd807aa98U,
+	0x12835b01U,
+	0x243185beU, // 10
+	0x550c7dc3U,
+	0x72be5d74U,
+	0x80deb1feU,
+	0x9bdc06a7U,
+	0xe49b69c1U,
+	0xefbe4786U,
+	0x0fc19dc6U,
+	0x240ca1ccU,
+	0x2de92c6fU,
+	0x4a7484aaU, // 20
+	0x5cb0a9dcU,
+	0x76f988daU,
+	0x983e5152U,
+	0xa831c66dU,
+	0xb00327c8U,
+	0xbf597fc7U,
+	0xc6e00bf3U,
+	0xd5a79147U,
+	0x06ca6351U,
+	0x14292967U, // 30
+	0x27b70a85U,
+	0x2e1b2138U,
+	0x4d2c6dfcU,
+	0x53380d13U,
+	0x650a7354U,
+	0x766a0abbU,
+	0x81c2c92eU,
+	0x92722c85U,
+	0xa2bfe8a1U,
+	0xa81a664bU, // 40
+	0xc24b8b70U,
+	0xc76c51a3U,
+	0xd192e819U,
+	0xd6990624U,
+	0xf40e3585U,
+	0x106aa070U,
+	0x19a4c116U,
+	0x1e376c08U,
+	0x2748774cU,
+	0x34b0bcb5U, // 50
+	0x391c0cb3U,
+	0x4ed8aa4aU,
+	0x5b9cca4fU,
+	0x682e6ff3U,
+	0x748f82eeU,
+	0x78a5636fU,
+	0x84c87814U,
+	0x8cc70208U,
+	0x90befffaU,
+	0xa4506cebU, // 60
+	0xbef9a3f7U,
+	0xc67178f2U,
+	0x98c7e2a2U,
+	0xfc08884dU,
+	0xcd2a11aeU,
+	0x510e527fU,
+	0x9b05688cU,
+	0xC3910C8EU,
+	0xfb6feee7U,
+	0x2a01a605U, // 70
+	0x0c2e12e0U,
+	0x4498517BU,
+	0x6a09e667U,
+	0xa4ce148bU,
+	0x95F61999U,
+	0xc19bf174U, /* K[76]: the 16th-round addend, stored out of sequence */
+	0xBB67AE85U,
+	0x3C6EF372U,
+	0xA54FF53AU,
+	0x1F83D9ABU, // 80
+	0x5BE0CD19U,
+	0x5C5C5C5CU,
+	0x36363636U,
+	0x80000000U,
+	0x000003FFU,
+	0x00000280U,
+	0x000004a0U,
+	0x00000300U
+};
+
+/* Thin aliases over the OpenCL built-ins rotate() and bitselect().
+ * Macro arguments are parenthesized wherever they are pasted next to an
+ * operator, so a compound argument (e.g. a|b) keeps its grouping. */
+#define rotl(x,y) rotate(x,y)
+/* Ch: per bit, take y where x is set and z where x is clear. */
+#define Ch(x,y,z) bitselect(z,y,x)
+#define Maj(x,y,z) Ch(((x)^(z)),y,z)
+
+/* Reverse the byte order of every 32-bit lane of n. Note: n is evaluated twice. */
+#define EndianSwap(n) (rotl((n) & ES[0], 24U)|rotl((n) & ES[1], 8U))
+
+/* SHA-256 mixing functions written with left rotations on 32-bit lanes
+ * (rotl by r is the same as a right rotation by 32-r):
+ *   Tr2/Tr1 feed the round function, Wr2/Wr1 feed the message schedule. */
+#define Tr2(x)		(rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
+#define Tr1(x)		(rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
+#define Wr2(x)		(rotl(x, 25U) ^ rotl(x, 14U) ^ ((x)>>3U))
+#define Wr1(x)		(rotl(x, 15U) ^ rotl(x, 13U) ^ ((x)>>10U))
+
+/* One compression round. a..h name the eight working variables in their
+ * current roles; only d and h are written. k is the round addend (message
+ * word plus constant). Callers rotate the argument order from round to round
+ * instead of moving values between variables.
+ * Expands to several statements: do not use as an unbraced if/else/loop body. */
+#define RND(a, b, c, d, e, f, g, h, k)	\
+	h += Tr1(e); 			\
+	h += Ch(e, f, g); 		\
+	h += k;				\
+	d += h;				\
+	h += Tr2(a); 			\
+	h += Maj(a, b, c);
+
+void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) /* One 64-round compression step.
+ * state0/state1: in/out, the eight 32-bit state words as two uint4 (x,y,z,w lanes each).
+ * block0..block3: the sixteen 32-bit message words of the block, by value.
+ * The state is copied to S0/S1, the rounds run on the copies, and the copies are
+ * added back into the caller's state at the end. */
+{
+	uint4 S0 = *state0;
+	uint4 S1 = *state1;
+
+#define A S0.x /* A..H alias the lanes of the local copies for the duration of this function */
+#define B S0.y
+#define C S0.z
+#define D S0.w
+#define E S1.x
+#define F S1.y
+#define G S1.z
+#define H S1.w
+
+	uint4 W[4]; /* rolling window of 16 message-schedule words, overwritten in place */
+
+	W[ 0].x = block0.x;
+	RND(A,B,C,D,E,F,G,H, W[0].x+ K[0]);
+	W[ 0].y = block0.y;
+	RND(H,A,B,C,D,E,F,G, W[0].y+ K[1]); /* argument order rotates by one each round instead of shuffling values */
+	W[ 0].z = block0.z;
+	RND(G,H,A,B,C,D,E,F, W[0].z+ K[2]);
+	W[ 0].w = block0.w;
+	RND(F,G,H,A,B,C,D,E, W[0].w+ K[3]);
+
+	W[ 1].x = block1.x;
+	RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]);
+	W[ 1].y = block1.y;
+	RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]);
+	W[ 1].z = block1.z;
+	RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]);
+	W[ 1].w = block1.w;
+	RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]);
+
+	W[ 2].x = block2.x;
+	RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]);
+	W[ 2].y = block2.y;
+	RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]);
+	W[ 2].z = block2.z;
+	RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]);
+	W[ 2].w = block2.w;
+	RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]);
+
+	W[ 3].x = block3.x;
+	RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]);
+	W[ 3].y = block3.y;
+	RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]);
+	W[ 3].z = block3.z;
+	RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]);
+	W[ 3].w = block3.w;
+	RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]); /* 16th round: its constant is stored at K[76], not K[15] */
+
+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); /* rounds 17-64: each schedule word is extended in place just before it is used */
+	RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]);
+
+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]);
+
+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]);
+
+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]);
+
+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]);
+
+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]);
+
+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]);
+
+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]);
+
+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]);
+
+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]);
+
+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]);
+
+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]);
+
+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]);
+
+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]);
+
+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]);
+
+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]);
+
+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]);
+
+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]);
+
+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]);
+
+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]);
+
+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]);
+
+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]);
+
+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]);
+
+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]);
+
+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]);
+
+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]);
+
+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]);
+
+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]);
+
+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]);
+
+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]);
+
+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]);
+
+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]);
+
+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]);
+
+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]);
+
+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]);
+
+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]);
+
+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]);
+
+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]);
+
+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]);
+
+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]);
+
+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]);
+
+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]);
+
+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]);
+
+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]);
+
+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]);
+
+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]);
+
+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]);
+
+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]);
+
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef G
+#undef H
+
+	*state0 += S0; /* feed-forward: add the round output to the state the caller passed in */
+	*state1 += S1;
+}
+
+void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) /* Compression step that starts from a fixed state.
+ * state0/state1: output only. All eight lanes are assigned in the first four
+ *   statement pairs before any of them is read, so the incoming contents are ignored.
+ * block0..block3: the sixteen 32-bit message words of the block, by value.
+ * The first four rounds are folded into closed-form expressions over constants
+ * K[63]..K[75]; rounds 5-64 follow the same sequence as SHA256(). The final
+ * feed-forward adds fixed constants from K (the SHA-256 initial hash words)
+ * instead of a saved copy of the input state. */
+{
+#define A (*state0).x /* A..H alias the caller's state lanes directly; there is no local copy here */
+#define B (*state0).y
+#define C (*state0).z
+#define D (*state0).w
+#define E (*state1).x
+#define F (*state1).y
+#define G (*state1).z
+#define H (*state1).w
+
+	uint4 W[4]; /* rolling window of 16 message-schedule words, overwritten in place */
+
+	W[0].x = block0.x;
+	D= K[63] +W[0].x; /* rounds 1-4: only the message word is variable, everything else is a pooled constant */
+	H= K[64] +W[0].x;
+
+	W[0].y = block0.y;
+	C= K[65] +Tr1(D)+Ch(D, K[66], K[67])+W[0].y;
+	G= K[68] +C+Tr2(H)+Ch(H, K[69] ,K[70]);
+
+	W[0].z = block0.z;
+	B= K[71] +Tr1(C)+Ch(C,D,K[66])+W[0].z;
+	F= K[72] +B+Tr2(G)+Maj(G,H, K[73]);
+
+	W[0].w = block0.w;
+	A= K[74] +Tr1(B)+Ch(B,C,D)+W[0].w;
+	E= K[75] +A+Tr2(F)+Maj(F,G,H);
+
+	W[1].x = block1.x;
+	RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]);
+	W[1].y = block1.y;
+	RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]);
+	W[1].z = block1.z;
+	RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]);
+	W[1].w = block1.w;
+	RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]);
+
+	W[2].x = block2.x;
+	RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]);
+	W[2].y = block2.y;
+	RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]);
+	W[2].z = block2.z;
+	RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]);
+	W[2].w = block2.w;
+	RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]);
+
+	W[3].x = block3.x;
+	RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]);
+	W[3].y = block3.y;
+	RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]);
+	W[3].z = block3.z;
+	RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]);
+	W[3].w = block3.w;
+	RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]); /* 16th round: its constant is stored at K[76], not K[15] */
+
+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); /* rounds 17-64: each schedule word is extended in place just before it is used */
+	RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]);
+
+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]);
+
+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]);
+
+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]);
+
+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]);
+
+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]);
+
+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]);
+
+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]);
+
+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]);
+
+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]);
+
+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]);
+
+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]);
+
+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]);
+
+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]);
+
+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]);
+
+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]);
+
+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]);
+
+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]);
+
+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]);
+
+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]);
+
+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]);
+
+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]);
+
+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]);
+
+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]);
+
+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]);
+
+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]);
+
+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]);
+
+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]);
+
+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]);
+
+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]);
+
+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]);
+
+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]);
+
+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]);
+
+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]);
+
+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]);
+
+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]);
+
+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]);
+
+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]);
+
+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]);
+
+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]);
+
+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]);
+
+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]);
+
+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]);
+
+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]);
+
+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]);
+
+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]);
+
+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]);
+
+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]);
+
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef G
+#undef H
+
+	*state0 += (uint4)(K[73], K[77], K[78], K[79]); /* feed-forward against pooled constants, not a saved input state */
+	*state1 += (uint4)(K[66], K[67], K[80], K[81]);
+}
+
+__constant uint fixedW[64] = /* Per-round addends for SHA256_fixed(): fixedW[i] is the k operand of round i+1.
+ * Presumably round constant plus message word, precomputed for one constant
+ * message block - confirm against the caller of SHA256_fixed(). */
+{
+	0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
+	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794,
+	0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f,
+	0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c,
+	0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa,
+	0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012,
+	0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4,
+	0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848,
+};
+
+void SHA256_fixed(uint4*restrict state0,uint4*restrict state1) /* 64-round compression step with no message arguments.
+ * state0/state1: in/out, the eight 32-bit state words as two uint4.
+ * Round i+1 adds fixedW[i] as its k operand; there is no message schedule here.
+ * The rounds run on local copies S0/S1, which are added back into the
+ * caller's state at the end. */
+{
+	uint4 S0 = *state0;
+	uint4 S1 = *state1;
+
+#define A S0.x /* A..H alias the lanes of the local copies for the duration of this function */
+#define B S0.y
+#define C S0.z
+#define D S0.w
+#define E S1.x
+#define F S1.y
+#define G S1.z
+#define H S1.w
+
+	RND(A,B,C,D,E,F,G,H, fixedW[0]);
+	RND(H,A,B,C,D,E,F,G, fixedW[1]); /* argument order rotates by one each round and repeats every eight rounds */
+	RND(G,H,A,B,C,D,E,F, fixedW[2]);
+	RND(F,G,H,A,B,C,D,E, fixedW[3]);
+	RND(E,F,G,H,A,B,C,D, fixedW[4]);
+	RND(D,E,F,G,H,A,B,C, fixedW[5]);
+	RND(C,D,E,F,G,H,A,B, fixedW[6]);
+	RND(B,C,D,E,F,G,H,A, fixedW[7]);
+	RND(A,B,C,D,E,F,G,H, fixedW[8]);
+	RND(H,A,B,C,D,E,F,G, fixedW[9]);
+	RND(G,H,A,B,C,D,E,F, fixedW[10]);
+	RND(F,G,H,A,B,C,D,E, fixedW[11]);
+	RND(E,F,G,H,A,B,C,D, fixedW[12]);
+	RND(D,E,F,G,H,A,B,C, fixedW[13]);
+	RND(C,D,E,F,G,H,A,B, fixedW[14]);
+	RND(B,C,D,E,F,G,H,A, fixedW[15]);
+	RND(A,B,C,D,E,F,G,H, fixedW[16]);
+	RND(H,A,B,C,D,E,F,G, fixedW[17]);
+	RND(G,H,A,B,C,D,E,F, fixedW[18]);
+	RND(F,G,H,A,B,C,D,E, fixedW[19]);
+	RND(E,F,G,H,A,B,C,D, fixedW[20]);
+	RND(D,E,F,G,H,A,B,C, fixedW[21]);
+	RND(C,D,E,F,G,H,A,B, fixedW[22]);
+	RND(B,C,D,E,F,G,H,A, fixedW[23]);
+	RND(A,B,C,D,E,F,G,H, fixedW[24]);
+	RND(H,A,B,C,D,E,F,G, fixedW[25]);
+	RND(G,H,A,B,C,D,E,F, fixedW[26]);
+	RND(F,G,H,A,B,C,D,E, fixedW[27]);
+	RND(E,F,G,H,A,B,C,D, fixedW[28]);
+	RND(D,E,F,G,H,A,B,C, fixedW[29]);
+	RND(C,D,E,F,G,H,A,B, fixedW[30]);
+	RND(B,C,D,E,F,G,H,A, fixedW[31]);
+	RND(A,B,C,D,E,F,G,H, fixedW[32]);
+	RND(H,A,B,C,D,E,F,G, fixedW[33]);
+	RND(G,H,A,B,C,D,E,F, fixedW[34]);
+	RND(F,G,H,A,B,C,D,E, fixedW[35]);
+	RND(E,F,G,H,A,B,C,D, fixedW[36]);
+	RND(D,E,F,G,H,A,B,C, fixedW[37]);
+	RND(C,D,E,F,G,H,A,B, fixedW[38]);
+	RND(B,C,D,E,F,G,H,A, fixedW[39]);
+	RND(A,B,C,D,E,F,G,H, fixedW[40]);
+	RND(H,A,B,C,D,E,F,G, fixedW[41]);
+	RND(G,H,A,B,C,D,E,F, fixedW[42]);
+	RND(F,G,H,A,B,C,D,E, fixedW[43]);
+	RND(E,F,G,H,A,B,C,D, fixedW[44]);
+	RND(D,E,F,G,H,A,B,C, fixedW[45]);
+	RND(C,D,E,F,G,H,A,B, fixedW[46]);
+	RND(B,C,D,E,F,G,H,A, fixedW[47]);
+	RND(A,B,C,D,E,F,G,H, fixedW[48]);
+	RND(H,A,B,C,D,E,F,G, fixedW[49]);
+	RND(G,H,A,B,C,D,E,F, fixedW[50]);
+	RND(F,G,H,A,B,C,D,E, fixedW[51]);
+	RND(E,F,G,H,A,B,C,D, fixedW[52]);
+	RND(D,E,F,G,H,A,B,C, fixedW[53]);
+	RND(C,D,E,F,G,H,A,B, fixedW[54]);
+	RND(B,C,D,E,F,G,H,A, fixedW[55]);
+	RND(A,B,C,D,E,F,G,H, fixedW[56]);
+	RND(H,A,B,C,D,E,F,G, fixedW[57]);
+	RND(G,H,A,B,C,D,E,F, fixedW[58]);
+	RND(F,G,H,A,B,C,D,E, fixedW[59]);
+	RND(E,F,G,H,A,B,C,D, fixedW[60]);
+	RND(D,E,F,G,H,A,B,C, fixedW[61]);
+	RND(C,D,E,F,G,H,A,B, fixedW[62]);
+	RND(B,C,D,E,F,G,H,A, fixedW[63]);
+
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef G
+#undef H
+	*state0 += S0; /* feed-forward: add the round output to the state the caller passed in */
+	*state1 += S1;
+}
+
+/* Rearrange B in place, independently for each 4-word half (B[0..3], B[4..7]):
+ * output word j takes its x/y/z/w lanes from input words (j+1)%4, (j+2)%4,
+ * (j+3)%4 and j of the same half, and every 32-bit lane is byte-swapped.
+ * unshittify() applies the inverse lane permutation. */
+void shittify(uint4 B[8])
+{
+	/* Gather all eight permuted words before storing anything, so each
+	 * read sees the untouched input. */
+	const uint4 lo0 = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w);
+	const uint4 lo1 = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w);
+	const uint4 lo2 = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w);
+	const uint4 lo3 = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w);
+
+	const uint4 hi0 = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w);
+	const uint4 hi1 = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w);
+	const uint4 hi2 = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w);
+	const uint4 hi3 = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w);
+
+	B[0] = EndianSwap(lo0);
+	B[1] = EndianSwap(lo1);
+	B[2] = EndianSwap(lo2);
+	B[3] = EndianSwap(lo3);
+
+	B[4] = EndianSwap(hi0);
+	B[5] = EndianSwap(hi1);
+	B[6] = EndianSwap(hi2);
+	B[7] = EndianSwap(hi3);
+}
+
+/* Undo the lane permutation applied by shittify(), in place and independently
+ * for each 4-word half: output word j takes its x/y/z/w lanes from input words
+ * (j+3)%4, (j+2)%4, (j+1)%4 and j of the same half, and every 32-bit lane is
+ * byte-swapped again. */
+void unshittify(uint4 B[8])
+{
+	/* Gather all eight permuted words before storing anything, so each
+	 * read sees the untouched input. */
+	const uint4 lo0 = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w);
+	const uint4 lo1 = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w);
+	const uint4 lo2 = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w);
+	const uint4 lo3 = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w);
+
+	const uint4 hi0 = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w);
+	const uint4 hi1 = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w);
+	const uint4 hi2 = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w);
+	const uint4 hi3 = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w);
+
+	B[0] = EndianSwap(lo0);
+	B[1] = EndianSwap(lo1);
+	B[2] = EndianSwap(lo2);
+	B[3] = EndianSwap(lo3);
+
+	B[4] = EndianSwap(hi0);
+	B[5] = EndianSwap(hi1);
+	B[6] = EndianSwap(hi2);
+	B[7] = EndianSwap(hi3);
+}
+
+#define SALSAUNROLLED /* selects the hand-unrolled salsa(); the #else variant writes the same steps as loops */
+#ifdef SALSAUNROLLED
+void salsa(uint4 B[8]) /* Mix the two 4-word halves of B in place:
+ *   1. B[0..3] ^= B[4..7], copy to w, run the 8-step rotate/xor group four times,
+ *      then B[0..3] += w.
+ *   2. B[4..7] ^= B[0..3], copy to w, run the same group four times,
+ *      then B[4..7] += w.
+ * In each group the first four steps use the lanes as stored and the last
+ * four use lane-shuffled operands (.wxyz / .zwxy). */
+{
+        uint4 w[4];
+
+        w[0] = (B[0]^=B[4]);
+        w[1] = (B[1]^=B[5]);
+        w[2] = (B[2]^=B[6]);
+        w[3] = (B[3]^=B[7]);
+
+        w[0] ^= rotl(w[3]     +w[2]     , 7U); /* group 1 of 4 */
+        w[1] ^= rotl(w[0]     +w[3]     , 9U);
+        w[2] ^= rotl(w[1]     +w[0]     ,13U);
+        w[3] ^= rotl(w[2]     +w[1]     ,18U);
+        w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+        w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+        w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+        w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+        w[0] ^= rotl(w[3]     +w[2]     , 7U); /* group 2 of 4 */
+        w[1] ^= rotl(w[0]     +w[3]     , 9U);
+        w[2] ^= rotl(w[1]     +w[0]     ,13U);
+        w[3] ^= rotl(w[2]     +w[1]     ,18U);
+        w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+        w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+        w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+        w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+        w[0] ^= rotl(w[3]     +w[2]     , 7U); /* group 3 of 4 */
+        w[1] ^= rotl(w[0]     +w[3]     , 9U);
+        w[2] ^= rotl(w[1]     +w[0]     ,13U);
+        w[3] ^= rotl(w[2]     +w[1]     ,18U);
+        w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+        w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+        w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+        w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+        w[0] ^= rotl(w[3]     +w[2]     , 7U); /* group 4 of 4 */
+        w[1] ^= rotl(w[0]     +w[3]     , 9U);
+        w[2] ^= rotl(w[1]     +w[0]     ,13U);
+        w[3] ^= rotl(w[2]     +w[1]     ,18U);
+        w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+        w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+        w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+        w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+
+        w[0] = (B[4]^=(B[0]+=w[0])); /* finish the first half (+=), fold it into the second half (^=), and reseed w */
+        w[1] = (B[5]^=(B[1]+=w[1]));
+        w[2] = (B[6]^=(B[2]+=w[2]));
+        w[3] = (B[7]^=(B[3]+=w[3]));
+
+        w[0] ^= rotl(w[3]     +w[2]     , 7U); /* group 1 of 4 */
+        w[1] ^= rotl(w[0]     +w[3]     , 9U);
+        w[2] ^= rotl(w[1]     +w[0]     ,13U);
+        w[3] ^= rotl(w[2]     +w[1]     ,18U);
+        w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+        w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+        w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+        w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+        w[0] ^= rotl(w[3]     +w[2]     , 7U); /* group 2 of 4 */
+        w[1] ^= rotl(w[0]     +w[3]     , 9U);
+        w[2] ^= rotl(w[1]     +w[0]     ,13U);
+        w[3] ^= rotl(w[2]     +w[1]     ,18U);
+        w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+        w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+        w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+        w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+        w[0] ^= rotl(w[3]     +w[2]     , 7U); /* group 3 of 4 */
+        w[1] ^= rotl(w[0]     +w[3]     , 9U);
+        w[2] ^= rotl(w[1]     +w[0]     ,13U);
+        w[3] ^= rotl(w[2]     +w[1]     ,18U);
+        w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+        w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+        w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+        w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+        w[0] ^= rotl(w[3]     +w[2]     , 7U); /* group 4 of 4 */
+        w[1] ^= rotl(w[0]     +w[3]     , 9U);
+        w[2] ^= rotl(w[1]     +w[0]     ,13U);
+        w[3] ^= rotl(w[2]     +w[1]     ,18U);
+        w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+        w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+        w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+        w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+
+        B[4] += w[0];
+        B[5] += w[1];
+        B[6] += w[2];
+        B[7] += w[3];
+}
+#else
+void salsa(uint4 B[8]) /* Loop form of the same two-pass mix; compiled only when SALSAUNROLLED is not defined. */
+{
+	uint4 w[4];
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		w[i] = (B[i]^=B[i+4]);
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+	{
+		w[0] ^= rotl(w[3]     +w[2]     , 7U);
+		w[1] ^= rotl(w[0]     +w[3]     , 9U);
+		w[2] ^= rotl(w[1]     +w[0]     ,13U);
+		w[3] ^= rotl(w[2]     +w[1]     ,18U);
+		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+	}
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		w[i] = (B[i+4]^=(B[i]+=w[i])); /* finish the first half (+=), fold it into the second half (^=), and reseed w */
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+	{
+		w[0] ^= rotl(w[3]     +w[2]     , 7U);
+		w[1] ^= rotl(w[0]     +w[3]     , 9U);
+		w[2] ^= rotl(w[1]     +w[0]     ,13U);
+		w[3] ^= rotl(w[2]     +w[1]     ,18U);
+		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+	}
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		B[i+4] += w[i];
+}
+#endif
+
+
+#if (LOOKUP_GAP != 2) // scrypt_core below stores one entry per two salsa() calls, i.e. it is written for a gap of exactly 2
+wrong lookup gap! // deliberately not valid code, so the build fails here. NOTE(review): #error would give a clearer diagnostic
+#endif
+
+void scrypt_core(uint4 X[8], __global uint4*restrict lookup) // Fills this work-item's slice of 'lookup' from X, then mixes X in place using entries read back from it.
+{
+  const uint zSIZE = 8; // uint4 words per stored entry (all of X)
+  const uint ySIZE = N[NFACTOR] / 2; // entries stored per work-item: one per two salsa() calls
+  const uint xSIZE = CONCURRENT_THREADS; // number of interleaved slices in 'lookup'
+  uint4 V[8]; // scratch entry used by the second loop
+  uint x = get_global_id(0) % xSIZE; // slice used by this work-item
+  uint z;
+  uint y;
+  uint i;
+  uint CO; // running index into 'lookup'
+  ushort progress;  // progress counter; see the table at the end of the second loop for how it advances
+  ushort state; // bit 1 = progress is odd, bit 0 = y is odd
+
+  CO = 8 * x; // entry y of slice x starts at (y * xSIZE + x) * zSIZE
+  for (y = 0; y < ySIZE; ++y, CO += (xSIZE - 1) * (zSIZE)) // the inner loop already advanced CO by zSIZE, so the total stride per entry is xSIZE * zSIZE
+    {
+#pragma unroll
+      for (z = 0; z < zSIZE; ++z, CO++)
+		lookup[CO] = X[z];
+      // Advance X by two salsa() calls, so only every second value of X is stored
+      salsa(X);
+      salsa(X);
+    }
+  // Second phase: lookups whose index is taken from the current X
+  uint cotmp = x * zSIZE; // offset of this work-item's slice inside each stored row
+  progress = 0;
+  for (i = 0; i < N[NFACTOR] + 512 + 42; i++) // fixed budget; an odd y costs two iterations (state 1, then state 3). NOTE(review): with more than 512 + 42 odd steps the loop ends before progress reaches 2 * N[NFACTOR] - confirm this is intended
+    {
+        //if (progress < 2 * N[NFACTOR])  -- disabled; the same test is part of 'cond' below
+		{
+			y = X[7].x & (N[NFACTOR]-1); // entry index taken from X; the mask assumes N[NFACTOR] is a power of two - TODO confirm
+			CO = cotmp + (y / LOOKUP_GAP) * (xSIZE)  * zSIZE; // stored row y / LOOKUP_GAP of this work-item's slice
+
+			state = ((progress & 1) << 1) | (y & 1);
+			if (state != 3) // state 3 keeps the V produced by the previous iteration instead of reloading
+			{
+#pragma unroll
+				for (z = 0; z < zSIZE; ++z, CO++)
+					V[z] = lookup[CO];
+			}
+			if (state != 1) // state 1 only runs salsa() on the loaded entry; X is not mixed in yet
+			{
+#pragma unroll
+				for (z = 0; z < zSIZE; ++z)
+					V[z] ^= X[z];
+			}
+			salsa(V);
+			ushort cond  = (state != 1) && (progress < 2 * N[NFACTOR]); // X is replaced only outside state 1 and only while progress is below 2 * N[NFACTOR]
+			if (cond)
+			{
+#pragma unroll
+				for (z = 0; z < zSIZE; ++z)
+					X[z] = V[z];
+			}
+			// state bits (progress odd, y odd) -> progress increment:
+			//   00  +2  (loaded entry is xored with X and becomes the new X)
+			//   01  +1  (loaded entry is advanced by salsa() into V; X untouched)
+			//   11  +1  (V kept from the previous iteration is xored with X and becomes the new X)
+			//   10  marked "error" in the original note; the code still adds 1
+			progress += (state == 0)? 2 : 1;
+		}
+    }
+}
+
+
+
+
+#define SCRYPT_FOUND (0xFF) // index in output[] of the counter of results written so far
+#define SETFOUND(Xnonce) output[atomic_inc(&output[SCRYPT_FOUND])] = (Xnonce) // atomic_inc returns the old counter, so each work-item gets its own slot even when several report at once
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) // work-groups must be exactly WORKSIZE x 1 x 1
+__kernel void search(__global const uint4 * restrict input, // Tests one candidate per work-item: the global id becomes the last word of 'data' and is reported through SETFOUND on success.
+volatile __global uint*restrict output, __global uint4*restrict padcache, // output: buffer written by SETFOUND; padcache: global memory handed to scrypt_core as its lookup table
+const uint4 midstate0, const uint4 midstate16, const uint target) // midstate0/midstate16: initial values of pad0/pad1; target: upper bound used in the final comparison
+{
+	uint4 X[8]; // block that is passed through scrypt_core
+	uint4 tstate0, tstate1, ostate0, ostate1;
+	uint4 tmp0, tmp1;
+	uint4 data = (uint4)(input[4].x,input[4].y,input[4].z, get_global_id(0)); // input[4] with its last word replaced by this work-item's global id
+	uint4 pad0 = midstate0, pad1 = midstate16;
+
+	SHA256(&pad0,&pad1, data, (uint4)(K[84],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[86])); // updates pad0/pad1 with 'data' plus constant words from K (presumably padding and length - TODO confirm)
+	SHA256_fresh(&ostate0,&ostate1, pad0^ K[82], pad1^ K[82], K[82], K[82]); // ostate and tstate are both derived from pad, xored with K[82] and K[83] respectively
+	SHA256_fresh(&tstate0,&tstate1, pad0^ K[83], pad1^ K[83], K[83], K[83]); // NOTE(review): looks like HMAC outer/inner key pads - confirm against the K table
+
+	tmp0 = tstate0; // keep a copy of tstate before it absorbs input[0..3]; it is reused after scrypt_core
+	tmp1 = tstate1;
+	SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3]);
+
+#pragma unroll
+	for (uint i=0; i<4; i++) // each iteration fills two words of X
+	{
+		pad0 = tstate0;
+		pad1 = tstate1;
+		X[rotl(i,1U) ] = ostate0; // rotl(i,1U) is presumably 2*i for i < 4, i.e. words X[2i] and X[2i+1] - TODO confirm
+		X[rotl(i,1U)+1] = ostate1;
+
+		SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87])); // same 'data' as above, now followed by the 1-based index i+1
+		SHA256(X+rotl(i,1U),X+rotl(i,1U)+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88])); // updates the two X words (preset to ostate) with pad0/pad1
+	}
+	shittify(X); // helper defined elsewhere; presumably rearranges X into the layout scrypt_core expects - TODO confirm
+	scrypt_core(X,padcache);
+	unshittify(X); // helper defined elsewhere; presumably the inverse rearrangement - TODO confirm
+
+	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]); // continue from the saved tstate copy over the mixed X
+	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
+	SHA256_fixed(&tmp0,&tmp1);
+	SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88])); // final state ends up in ostate0/ostate1
+
+	bool result = (EndianSwap(ostate1.w) <= target); // only the last word of the final state is compared against target
+	if (result)
+		SETFOUND(get_global_id(0)); // report the same value that was placed in data.w
+}
--- a/kernel/bitblock.cl
+++ b/kernel/bitblock.cl
@ -464,69 +464,92 @@ __kernel void search2(__global hash_t* hashes)
				@@ -464,69 +464,92 @@ __kernel void search2(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
-
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+  __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
+  __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
+#else
+  __local sph_u64 T0_C[256], T4_C[256];
+#endif
  int init = get_local_id(0);
  int step = get_local_size(0);

  for (int i = init; i < 256; i += step)
  {
-    T0_L[i] = T0[i];
-    T4_L[i] = T4[i];
-    T1_L[i] = T1[i];
-    T2_L[i] = T2[i];
-    T3_L[i] = T3[i];
-    T5_L[i] = T5[i];
-    T6_L[i] = T6[i];
-    T7_L[i] = T7[i];
+    T0_C[i] = T0[i];
+    T4_C[i] = T4[i];
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+    T1_C[i] = T1[i];
+    T2_C[i] = T2[i];
+    T3_C[i] = T3[i];
+    T5_C[i] = T5[i];
+    T6_C[i] = T6[i];
+    T7_C[i] = T7[i];
+#endif
  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  #define T0 T0_L
-  #define T1 T1_L
-  #define T2 T2_L
-  #define T3 T3_L
-  #define T4 T4_L
-  #define T5 T5_L
-  #define T6 T6_L
-  #define T7 T7_L
-
-  // groestl
-  sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
+  barrier(CLK_LOCAL_MEM_FENCE);    // groestl
+#define T0 T0_C
+#define T1 T1_C
+#define T2 T2_C
+#define T3 T3_C
+#define T4 T4_C
+#define T5 T5_C
+#define T6 T6_C
+#define T7 T7_C
+
+
+  sph_u64 H[16];
+//#pragma unroll 15
+  for (unsigned int u = 0; u < 15; u ++)
+    H[u] = 0;
+#if USE_LE
+  H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
+#else
+  H[15] = (sph_u64)512;
+#endif

  sph_u64 g[16], m[16];
-  g[0] = m[0] = DEC64E(hash->h8[0]);
-  g[1] = m[1] = DEC64E(hash->h8[1]);
-  g[2] = m[2] = DEC64E(hash->h8[2]);
-  g[3] = m[3] = DEC64E(hash->h8[3]);
-  g[4] = m[4] = DEC64E(hash->h8[4]);
-  g[5] = m[5] = DEC64E(hash->h8[5]);
-  g[6] = m[6] = DEC64E(hash->h8[6]);
-  g[7] = m[7] = DEC64E(hash->h8[7]);
-  g[8] = m[8] = 0x80;
-  g[9] = m[9] = 0;
-  g[10] = m[10] = 0;
-  g[11] = m[11] = 0;
-  g[12] = m[12] = 0;
-  g[13] = m[13] = 0;
-  g[14] = m[14] = 0;
-  g[15] = 0x102000000000000;
-  m[15] = 0x100000000000000;
-
+  m[0] = DEC64E(hash->h8[0]);
+  m[1] = DEC64E(hash->h8[1]);
+  m[2] = DEC64E(hash->h8[2]);
+  m[3] = DEC64E(hash->h8[3]);
+  m[4] = DEC64E(hash->h8[4]);
+  m[5] = DEC64E(hash->h8[5]);
+  m[6] = DEC64E(hash->h8[6]);
+  m[7] = DEC64E(hash->h8[7]);
+
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    g[u] = m[u] ^ H[u];
+  m[8] = 0x80; g[8] = m[8] ^ H[8];
+  m[9] = 0; g[9] = m[9] ^ H[9];
+  m[10] = 0; g[10] = m[10] ^ H[10];
+  m[11] = 0; g[11] = m[11] ^ H[11];
+  m[12] = 0; g[12] = m[12] ^ H[12];
+  m[13] = 0; g[13] = m[13] ^ H[13];
+  m[14] = 0; g[14] = m[14] ^ H[14];
+  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
  PERM_BIG_P(g);
  PERM_BIG_Q(m);

-  sph_u64 xH[16];
+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
-    xH[u] = H[u] ^= g[u] ^ m[u];
+    H[u] ^= g[u] ^ m[u];
+  sph_u64 xH[16];

+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    xH[u] = H[u];
  PERM_BIG_P(xH);

-  for (unsigned int u = 8; u < 16; u ++)
-    hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    H[u] ^= xH[u];
+
+//#pragma unroll 8
+  for (unsigned int u = 0; u < 8; u ++)
+    hash->h8[u] = DEC64E(H[u + 8]);
+    barrier(CLK_GLOBAL_MEM_FENCE);

-  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -1466,4 +1489,4 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo
				@@ -1466,4 +1489,4 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo
  barrier(CLK_GLOBAL_MEM_FENCE);
 }

-#endif // BITBLOCK_CL
+#endif // BITBLOCK_CL
--- a/kernel/darkcoin-mod.cl
+++ b/kernel/darkcoin-mod.cl
@ -193,7 +193,7 @@ __kernel void search1(__global hash_t* hashes)
				@@ -193,7 +193,7 @@ __kernel void search1(__global hash_t* hashes)
  BMW_H[u] = BMW_IV512[u];

  sph_u64 mv[16],q[32];
-  sph_u64 tmp;
+    sph_u64 tmp;

  mv[0] = SWAP8(hash->h8[0]);
  mv[1] = SWAP8(hash->h8[1]);
@ -457,74 +457,97 @@ __kernel void search2(__global hash_t* hashes)
				@@ -457,74 +457,97 @@ __kernel void search2(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
-
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+  __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
+  __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
+#else
+  __local sph_u64 T0_C[256], T4_C[256];
+#endif
  int init = get_local_id(0);
  int step = get_local_size(0);

  for (int i = init; i < 256; i += step)
  {
-    T0_L[i] = T0[i];
-    T4_L[i] = T4[i];
-    T1_L[i] = T1[i];
-    T2_L[i] = T2[i];
-    T3_L[i] = T3[i];
-    T5_L[i] = T5[i];
-    T6_L[i] = T6[i];
-    T7_L[i] = T7[i];
+    T0_C[i] = T0[i];
+    T4_C[i] = T4[i];
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+    T1_C[i] = T1[i];
+    T2_C[i] = T2[i];
+    T3_C[i] = T3[i];
+    T5_C[i] = T5[i];
+    T6_C[i] = T6[i];
+    T7_C[i] = T7[i];
+#endif
  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  #define T0 T0_L
-  #define T1 T1_L
-  #define T2 T2_L
-  #define T3 T3_L
-  #define T4 T4_L
-  #define T5 T5_L
-  #define T6 T6_L
-  #define T7 T7_L
-
-  // groestl
-  sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
+  barrier(CLK_LOCAL_MEM_FENCE);    // groestl
+#define T0 T0_C
+#define T1 T1_C
+#define T2 T2_C
+#define T3 T3_C
+#define T4 T4_C
+#define T5 T5_C
+#define T6 T6_C
+#define T7 T7_C
+
+
+  sph_u64 H[16];
+//#pragma unroll 15
+  for (unsigned int u = 0; u < 15; u ++)
+    H[u] = 0;
+#if USE_LE
+  H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
+#else
+  H[15] = (sph_u64)512;
+#endif

  sph_u64 g[16], m[16];
-  g[0] = m[0] = DEC64E(hash->h8[0]);
-  g[1] = m[1] = DEC64E(hash->h8[1]);
-  g[2] = m[2] = DEC64E(hash->h8[2]);
-  g[3] = m[3] = DEC64E(hash->h8[3]);
-  g[4] = m[4] = DEC64E(hash->h8[4]);
-  g[5] = m[5] = DEC64E(hash->h8[5]);
-  g[6] = m[6] = DEC64E(hash->h8[6]);
-  g[7] = m[7] = DEC64E(hash->h8[7]);
-  g[8] = m[8] = 0x80;
-  g[9] = m[9] = 0;
-  g[10] = m[10] = 0;
-  g[11] = m[11] = 0;
-  g[12] = m[12] = 0;
-  g[13] = m[13] = 0;
-  g[14] = m[14] = 0;
-  g[15] = 0x102000000000000;
-  m[15] = 0x100000000000000;
-
+  m[0] = DEC64E(hash->h8[0]);
+  m[1] = DEC64E(hash->h8[1]);
+  m[2] = DEC64E(hash->h8[2]);
+  m[3] = DEC64E(hash->h8[3]);
+  m[4] = DEC64E(hash->h8[4]);
+  m[5] = DEC64E(hash->h8[5]);
+  m[6] = DEC64E(hash->h8[6]);
+  m[7] = DEC64E(hash->h8[7]);
+
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    g[u] = m[u] ^ H[u];
+  m[8] = 0x80; g[8] = m[8] ^ H[8];
+  m[9] = 0; g[9] = m[9] ^ H[9];
+  m[10] = 0; g[10] = m[10] ^ H[10];
+  m[11] = 0; g[11] = m[11] ^ H[11];
+  m[12] = 0; g[12] = m[12] ^ H[12];
+  m[13] = 0; g[13] = m[13] ^ H[13];
+  m[14] = 0; g[14] = m[14] ^ H[14];
+  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
  PERM_BIG_P(g);
  PERM_BIG_Q(m);

-  sph_u64 xH[16];
+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
-    xH[u] = H[u] ^= g[u] ^ m[u];
+    H[u] ^= g[u] ^ m[u];
+  sph_u64 xH[16];

+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    xH[u] = H[u];
  PERM_BIG_P(xH);

-  for (unsigned int u = 8; u < 16; u ++)
-    hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    H[u] ^= xH[u];

-  barrier(CLK_GLOBAL_MEM_FENCE);
-}
+//#pragma unroll 8
+  for (unsigned int u = 0; u < 8; u ++)
+    hash->h8[u] = DEC64E(H[u + 8]);
+    barrier(CLK_GLOBAL_MEM_FENCE);

+}
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search3(__global hash_t* hashes)
 {
+
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

@ -840,7 +863,7 @@ __kernel void search8(__global hash_t* hashes)
				@@ -840,7 +863,7 @@ __kernel void search8(__global hash_t* hashes)
  sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
  sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;

-  sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
+  sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;

  rk00 = hash->h4[0];
  rk01 = hash->h4[1];
@ -1101,4 +1124,4 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
				@@ -1101,4 +1124,4 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
  output[atomic_inc(output+0xFF)] = SWAP4(gid);
 }

-#endif// DARKCOIN_MOD_CL
+#endif// DARKCOIN_MOD_CL
--- a/kernel/darkcoin.cl
+++ b/kernel/darkcoin.cl
@ -4,7 +4,7 @@
				@@ -4,7 +4,7 @@
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2014  phm
- * 
+ *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
@ -12,10 +12,10 @@
				@@ -12,10 +12,10 @@
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
- * 
+ *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@ -71,16 +71,9 @@ typedef long sph_s64;
				@@ -71,16 +71,9 @@ typedef long sph_s64;
 #define SPH_SMALL_FOOTPRINT_GROESTL 0
 #define SPH_GROESTL_BIG_ENDIAN 0
 #define SPH_CUBEHASH_UNROLL 0
-
-#ifndef SPH_COMPACT_BLAKE_64
-  #define SPH_COMPACT_BLAKE_64 0
-#endif
-#ifndef SPH_LUFFA_PARALLEL
-  #define SPH_LUFFA_PARALLEL 0
-#endif
-#ifndef SPH_KECCAK_UNROLL
-  #define SPH_KECCAK_UNROLL   0
-#endif
+#define SPH_COMPACT_BLAKE_64 0
+#define SPH_LUFFA_PARALLEL 0
+#define SPH_KECCAK_UNROLL   0

 #include "blake.cl"
 #include "bmw.cl"
@ -736,4 +729,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -736,4 +729,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
  }
 }

-#endif // DARKCOIN_CL
+#endif // DARKCOIN_CL
--- a/kernel/diamond.cl
+++ b/kernel/diamond.cl
--- a/kernel/groestl.cl
+++ b/kernel/groestl.cl
@ -1429,4 +1429,4 @@ __constant static const sph_u64 T7[] = {
				@@ -1429,4 +1429,4 @@ __constant static const sph_u64 T7[] = {
    for (r = 0; r < 14; ++r) { \
      ROUND_BIG_Q(a, r); \
    } \
-  } while (0)
+  } while (0)
--- a/kernel/groestlcoin.cl
+++ b/kernel/groestlcoin.cl
--- a/kernel/inkcoin.cl
+++ b/kernel/inkcoin.cl
@ -4,7 +4,7 @@
				@@ -4,7 +4,7 @@
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2014  phm
- * 
+ *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
@ -12,10 +12,10 @@
				@@ -12,10 +12,10 @@
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
- * 
+ *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@ -29,8 +29,8 @@
				@@ -29,8 +29,8 @@
 * @author   phm <phm@inbox.com>
 */

-#ifndef DARKCOIN_CL
-#define DARKCOIN_CL
+#ifndef INKCOIN_CL
+#define INKCOIN_CL

 #if __ENDIAN_LITTLE__
 #define SPH_LITTLE_ENDIAN 1
@ -54,34 +54,15 @@ typedef long sph_s64;
				@@ -54,34 +54,15 @@ typedef long sph_s64;
 #define SPH_64_TRUE 1

 #define SPH_C32(x)    ((sph_u32)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_T32(x) (as_uint(x))
+#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
 #define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))

 #define SPH_C64(x)    ((sph_u64)(x ## UL))
-#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
+#define SPH_T64(x) (as_ulong(x))
+#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
 #define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))

-#define SPH_ECHO_64 1
-#define SPH_KECCAK_64 1
-#define SPH_JH_64 1
-#define SPH_SIMD_NOCOPY 0
-#define SPH_KECCAK_NOCOPY 0
-#define SPH_SMALL_FOOTPRINT_GROESTL 0
-#define SPH_GROESTL_BIG_ENDIAN 0
-#define SPH_CUBEHASH_UNROLL 0
-
-#ifndef SPH_COMPACT_BLAKE_64
-  #define SPH_COMPACT_BLAKE_64 0
-#endif
-#ifndef SPH_LUFFA_PARALLEL
-  #define SPH_LUFFA_PARALLEL 0
-#endif
-#ifndef SPH_KECCAK_UNROLL
-  #define SPH_KECCAK_UNROLL   0
-#endif
-
 #include "shavite.cl"

 #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
@ -97,6 +78,14 @@ typedef long sph_s64;
				@@ -97,6 +78,14 @@ typedef long sph_s64;
  #define DEC32LE(x) (*(const __global sph_u32 *) (x));
 #endif

+#define SHL(x, n) ((x) << (n))
+#define SHR(x, n) ((x) >> (n))
+
+#define CONST_EXP2  q[i+0] + SPH_ROTL64(q[i+1], 5)  + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
+                    q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
+                    q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
+                    q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
+
 // __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
 {
@ -244,4 +233,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -244,4 +233,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    output[output[0xFF]++] = SWAP4(gid);
 }

-#endif // DARKCOIN_CL
+#endif // INKCOIN_CL
--- a/kernel/marucoin-mod.cl
+++ b/kernel/marucoin-mod.cl
@ -460,69 +460,92 @@ __kernel void search2(__global hash_t* hashes)
				@@ -460,69 +460,92 @@ __kernel void search2(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
-
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+  __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
+  __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
+#else
+  __local sph_u64 T0_C[256], T4_C[256];
+#endif
  int init = get_local_id(0);
  int step = get_local_size(0);

  for (int i = init; i < 256; i += step)
  {
-    T0_L[i] = T0[i];
-    T4_L[i] = T4[i];
-    T1_L[i] = T1[i];
-    T2_L[i] = T2[i];
-    T3_L[i] = T3[i];
-    T5_L[i] = T5[i];
-    T6_L[i] = T6[i];
-    T7_L[i] = T7[i];
+    T0_C[i] = T0[i];
+    T4_C[i] = T4[i];
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+    T1_C[i] = T1[i];
+    T2_C[i] = T2[i];
+    T3_C[i] = T3[i];
+    T5_C[i] = T5[i];
+    T6_C[i] = T6[i];
+    T7_C[i] = T7[i];
+#endif
  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  #define T0 T0_L
-  #define T1 T1_L
-  #define T2 T2_L
-  #define T3 T3_L
-  #define T4 T4_L
-  #define T5 T5_L
-  #define T6 T6_L
-  #define T7 T7_L
-
-  // groestl
-  sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
+  barrier(CLK_LOCAL_MEM_FENCE);    // groestl
+#define T0 T0_C
+#define T1 T1_C
+#define T2 T2_C
+#define T3 T3_C
+#define T4 T4_C
+#define T5 T5_C
+#define T6 T6_C
+#define T7 T7_C
+
+
+  sph_u64 H[16];
+//#pragma unroll 15
+  for (unsigned int u = 0; u < 15; u ++)
+    H[u] = 0;
+#if USE_LE
+  H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
+#else
+  H[15] = (sph_u64)512;
+#endif

  sph_u64 g[16], m[16];
-  g[0] = m[0] = DEC64E(hash->h8[0]);
-  g[1] = m[1] = DEC64E(hash->h8[1]);
-  g[2] = m[2] = DEC64E(hash->h8[2]);
-  g[3] = m[3] = DEC64E(hash->h8[3]);
-  g[4] = m[4] = DEC64E(hash->h8[4]);
-  g[5] = m[5] = DEC64E(hash->h8[5]);
-  g[6] = m[6] = DEC64E(hash->h8[6]);
-  g[7] = m[7] = DEC64E(hash->h8[7]);
-  g[8] = m[8] = 0x80;
-  g[9] = m[9] = 0;
-  g[10] = m[10] = 0;
-  g[11] = m[11] = 0;
-  g[12] = m[12] = 0;
-  g[13] = m[13] = 0;
-  g[14] = m[14] = 0;
-  g[15] = 0x102000000000000;
-  m[15] = 0x100000000000000;
-
+  m[0] = DEC64E(hash->h8[0]);
+  m[1] = DEC64E(hash->h8[1]);
+  m[2] = DEC64E(hash->h8[2]);
+  m[3] = DEC64E(hash->h8[3]);
+  m[4] = DEC64E(hash->h8[4]);
+  m[5] = DEC64E(hash->h8[5]);
+  m[6] = DEC64E(hash->h8[6]);
+  m[7] = DEC64E(hash->h8[7]);
+
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    g[u] = m[u] ^ H[u];
+  m[8] = 0x80; g[8] = m[8] ^ H[8];
+  m[9] = 0; g[9] = m[9] ^ H[9];
+  m[10] = 0; g[10] = m[10] ^ H[10];
+  m[11] = 0; g[11] = m[11] ^ H[11];
+  m[12] = 0; g[12] = m[12] ^ H[12];
+  m[13] = 0; g[13] = m[13] ^ H[13];
+  m[14] = 0; g[14] = m[14] ^ H[14];
+  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
  PERM_BIG_P(g);
  PERM_BIG_Q(m);

-  sph_u64 xH[16];
+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
-    xH[u] = H[u] ^= g[u] ^ m[u];
+    H[u] ^= g[u] ^ m[u];
+  sph_u64 xH[16];

+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    xH[u] = H[u];
  PERM_BIG_P(xH);

-  for (unsigned int u = 8; u < 16; u ++)
-    hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    H[u] ^= xH[u];
+
+//#pragma unroll 8
+  for (unsigned int u = 0; u < 8; u ++)
+    hash->h8[u] = DEC64E(H[u + 8]);
+    barrier(CLK_GLOBAL_MEM_FENCE);

-  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -843,7 +866,7 @@ __kernel void search8(__global hash_t* hashes)
				@@ -843,7 +866,7 @@ __kernel void search8(__global hash_t* hashes)
  sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
  sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;

-  sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
+  sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;

  rk00 = hash->h4[0];
  rk01 = hash->h4[1];
@ -1266,4 +1289,4 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo
				@@ -1266,4 +1289,4 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo
  barrier(CLK_GLOBAL_MEM_FENCE);
 }

-#endif // X13MOD_CL
+#endif // X13MOD_CL
--- a/kernel/marucoin.cl
+++ b/kernel/marucoin.cl
@ -4,7 +4,7 @@
				@@ -4,7 +4,7 @@
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2014  phm
- * 
+ *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
@ -12,10 +12,10 @@
				@@ -12,10 +12,10 @@
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
- * 
+ *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@ -862,4 +862,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -862,4 +862,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    output[output[0xFF]++] = SWAP4(gid);
 }

-#endif // MARUCOIN_CL
+#endif // MARUCOIN_CL
--- a/kernel/myriadcoin-groestl.cl
+++ b/kernel/myriadcoin-groestl.cl
@ -4,7 +4,7 @@
				@@ -4,7 +4,7 @@
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2014  phm
- * 
+ *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
@ -12,10 +12,10 @@
				@@ -12,10 +12,10 @@
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
- * 
+ *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@ -54,23 +54,19 @@ typedef long sph_s64;
				@@ -54,23 +54,19 @@ typedef long sph_s64;
 #define SPH_64_TRUE 1

 #define SPH_C32(x)    ((sph_u32)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_T32(x) (as_uint(x))
+#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
 #define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))

 #define SPH_C64(x)    ((sph_u64)(x ## UL))
-#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
+#define SPH_T64(x) (as_ulong(x))
+#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
 #define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))

 #define SPH_ECHO_64 1
 #define SPH_SIMD_NOCOPY 0
 #define SPH_CUBEHASH_UNROLL 0

-#ifndef SPH_LUFFA_PARALLEL
-  #define SPH_LUFFA_PARALLEL 0
-#endif
-
 #include "groestl.cl"

 #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
@ -84,6 +80,14 @@ typedef long sph_s64;
				@@ -84,6 +80,14 @@ typedef long sph_s64;
  #define DEC64E(x) (*(const __global sph_u64 *) (x));
 #endif

+#define SHL(x, n) ((x) << (n))
+// SHR(x, n) is deliberately not defined here: this file already defines it next to ROL32 just below, and a second definition with a different replacement list is an incompatible macro redefinition. CONST_EXP2 picks up that existing definition when it is expanded.
+
+#define CONST_EXP2  q[i+0] + SPH_ROTL64(q[i+1], 5)  + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
+                    q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
+                    q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
+                    q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
+
 #define ROL32(x, n)  rotate(x, (uint) n)
 #define SHR(x, n)    ((x) >> n)
 #define SWAP32(a)    (as_uint(as_uchar4(a).wzyx))
@ -138,34 +142,41 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -138,34 +142,41 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    ulong h8[8];
  } hash;

-  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+  __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
+  __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
+#else
+  __local sph_u64 T0_C[256], T4_C[256];
+#endif
  int init = get_local_id(0);
  int step = get_local_size(0);
+
  for (int i = init; i < 256; i += step)
  {
-    T0_L[i] = T0[i];
-    T1_L[i] = T1[i];
-    T2_L[i] = T2[i];
-    T3_L[i] = T3[i];
-    T4_L[i] = T4[i];
-    T5_L[i] = T5[i];
-    T6_L[i] = T6[i];
-    T7_L[i] = T7[i];
+    T0_C[i] = T0[i];
+    T4_C[i] = T4[i];
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+    T1_C[i] = T1[i];
+    T2_C[i] = T2[i];
+    T3_C[i] = T3[i];
+    T5_C[i] = T5[i];
+    T6_C[i] = T6[i];
+    T7_C[i] = T7[i];
+#endif
  }
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-#define T0 T0_L
-#define T1 T1_L
-#define T2 T2_L
-#define T3 T3_L
-#define T4 T4_L
-#define T5 T5_L
-#define T6 T6_L
-#define T7 T7_L
+  barrier(CLK_LOCAL_MEM_FENCE);    // groestl
+#define T0 T0_C
+#define T1 T1_C
+#define T2 T2_C
+#define T3 T3_C
+#define T4 T4_C
+#define T5 T5_C
+#define T6 T6_C
+#define T7 T7_C

-  // groestl

  sph_u64 H[16];
+//#pragma unroll 15
  for (unsigned int u = 0; u < 15; u ++)
    H[u] = 0;
 #if USE_LE
@ -193,20 +204,33 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -193,20 +204,33 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
  m[13] = 0;
  m[14] = 0;
  m[15] = 0x100000000000000;
+
+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
    g[u] = m[u] ^ H[u];
+
  PERM_BIG_P(g);
  PERM_BIG_Q(m);
+
+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
    H[u] ^= g[u] ^ m[u];
  sph_u64 xH[16];
+
+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
    xH[u] = H[u];
  PERM_BIG_P(xH);
+
+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
    H[u] ^= xH[u];
+
+//#pragma unroll 8
  for (unsigned int u = 0; u < 8; u ++)
    hash.h8[u] = ENC64E(H[u + 8]);
+    barrier(CLK_GLOBAL_MEM_FENCE);
+
  uint temp1;
  uint W0 = SWAP32(hash.h4[0x0]);
  uint W1 = SWAP32(hash.h4[0x1]);
@ -396,4 +420,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -396,4 +420,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    output[output[0xFF]++] = SWAP4(gid);
 }

-#endif // MYRIADCOIN_GROESTL_CL
+#endif // MYRIADCOIN_GROESTL_CL
--- a/kernel/neoscrypt.cl
+++ b/kernel/neoscrypt.cl
--- a/kernel/quarkcoin.cl
+++ b/kernel/quarkcoin.cl
--- a/kernel/qubitcoin.cl
+++ b/kernel/qubitcoin.cl
@ -4,7 +4,7 @@
				@@ -4,7 +4,7 @@
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2014  phm
- * 
+ *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
@ -12,10 +12,10 @@
				@@ -12,10 +12,10 @@
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
- * 
+ *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@ -54,13 +54,13 @@ typedef long sph_s64;
				@@ -54,13 +54,13 @@ typedef long sph_s64;
 #define SPH_64_TRUE 1

 #define SPH_C32(x)    ((sph_u32)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_T32(x) (as_uint(x))
+#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
 #define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))

 #define SPH_C64(x)    ((sph_u64)(x ## UL))
-#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
+#define SPH_T64(x) (as_ulong(x))
+#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
 #define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))

 #define SPH_ECHO_64 1
@ -88,6 +88,14 @@ typedef long sph_s64;
				@@ -88,6 +88,14 @@ typedef long sph_s64;
  #define DEC32BE(x) SWAP4(*(const __global sph_u32 *) (x));
 #endif

+/* Plain shift helpers. The operand type decides whether SHR is a logical or
+ * an arithmetic shift, so pass unsigned values when a logical shift is meant. */
+#define SHL(x, n) ((x) << (n))
+#define SHR(x, n) ((x) >> (n))
+
+/* Sum over the 16-entry window q[i+0] .. q[i+15]:
+ *   - q[i+0], q[i+2], ... q[i+12] are added unchanged,
+ *   - q[i+1], q[i+3], ... q[i+13] are rotated left by 5, 11, 27, 32, 37, 43, 53,
+ *   - q[i+14] and q[i+15] are XORed with themselves shifted right by 1 and 2.
+ * The macro takes no arguments: it expands against whatever `q` and `i` are
+ * in scope at the point of use. The whole expansion is parenthesized so it
+ * stays a single operand whatever operator is written next to it. */
+#define CONST_EXP2  (q[i+0] + SPH_ROTL64(q[i+1], 5)  + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
+                     q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
+                     q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
+                     q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]))
+
+
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
 {
@ -465,4 +473,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -465,4 +473,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
  }
 }

-#endif // QUBITCOIN_CL
+#endif // QUBITCOIN_CL
--- a/kernel/sifcoin.cl
+++ b/kernel/sifcoin.cl
@ -4,7 +4,7 @@
				@@ -4,7 +4,7 @@
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2014  phm
- * 
+ *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
@ -12,10 +12,10 @@
				@@ -12,10 +12,10 @@
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
- * 
+ *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@ -54,13 +54,13 @@ typedef long sph_s64;
				@@ -54,13 +54,13 @@ typedef long sph_s64;
 #define SPH_64_TRUE 1

 #define SPH_C32(x)    ((sph_u32)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_T32(x) (as_uint(x))
+#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
 #define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))

 #define SPH_C64(x)    ((sph_u64)(x ## UL))
-#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
+#define SPH_T64(x) (as_ulong(x))
+#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
 #define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))

 #define SPH_ECHO_64 1
@ -100,6 +100,14 @@ typedef long sph_s64;
				@@ -100,6 +100,14 @@ typedef long sph_s64;
  #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x));
 #endif

+/* Plain shift helpers. The operand type decides whether SHR is a logical or
+ * an arithmetic shift, so pass unsigned values when a logical shift is meant. */
+#define SHL(x, n) ((x) << (n))
+#define SHR(x, n) ((x) >> (n))
+
+/* Sum over the 16-entry window q[i+0] .. q[i+15]:
+ *   - q[i+0], q[i+2], ... q[i+12] are added unchanged,
+ *   - q[i+1], q[i+3], ... q[i+13] are rotated left by 5, 11, 27, 32, 37, 43, 53,
+ *   - q[i+14] and q[i+15] are XORed with themselves shifted right by 1 and 2.
+ * The macro takes no arguments: it expands against whatever `q` and `i` are
+ * in scope at the point of use. The whole expansion is parenthesized so it
+ * stays a single operand whatever operator is written next to it. */
+#define CONST_EXP2  (q[i+0] + SPH_ROTL64(q[i+1], 5)  + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
+                     q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
+                     q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
+                     q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]))
+
+
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
 {
@ -164,8 +172,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -164,8 +172,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    for(unsigned u = 0; u < 16; u++)
      BMW_H[u] = BMW_IV512[u];

-    sph_u64 BMW_h1[16], BMW_h2[16];
-    sph_u64 mv[16];
+    sph_u64 mv[16],q[32];
+      sph_u64 tmp;

    mv[ 0] = SWAP8(hash.h8[0]);
    mv[ 1] = SWAP8(hash.h8[1]);
@ -183,78 +191,330 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -183,78 +191,330 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    mv[13] = 0;
    mv[14] = 0;
    mv[15] = 0x200;
-#define M(x)    (mv[x])
-#define H(x)    (BMW_H[x])
-#define dH(x)   (BMW_h2[x])
-
-    FOLDb;
-
-#undef M
-#undef H
-#undef dH
-
-#define M(x)    (BMW_h2[x])
-#define H(x)    (final_b[x])
-#define dH(x)   (BMW_h1[x])
-
-    FOLDb;
-
-#undef M
-#undef H
-#undef dH
-
-    hash.h8[0] = SWAP8(BMW_h1[8]);
-    hash.h8[1] = SWAP8(BMW_h1[9]);
-    hash.h8[2] = SWAP8(BMW_h1[10]);
-    hash.h8[3] = SWAP8(BMW_h1[11]);
-    hash.h8[4] = SWAP8(BMW_h1[12]);
-    hash.h8[5] = SWAP8(BMW_h1[13]);
-    hash.h8[6] = SWAP8(BMW_h1[14]);
-    hash.h8[7] = SWAP8(BMW_h1[15]);
+
+  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
+  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
+  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
+  q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
+  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
+  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
+  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
+  q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
+  tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
+  q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
+  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
+  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
+  q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
+  tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
+  q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
+  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
+  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
+  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
+
+#pragma unroll 2
+  for(int i=0;i<2;i++)
+  {
+  q[i+16] =
+    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 4
+  for(int i=2;i<6;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 3
+  for(int i=6;i<9;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 4
+  for(int i=9;i<13;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }
+
+#pragma unroll 3
+  for(int i=13;i<16;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }
+
+  sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
+  sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+
+  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
+  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
+  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
+  BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
+  BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
+  BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
+  BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
+  BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
+
+  BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
+  BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
+  BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
+  BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
+  BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
+  BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
+  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
+  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
+
+#pragma unroll 16
+  for(int i=0;i<16;i++)
+  {
+  mv[i] = BMW_H[i];
+  BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
  }

+  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
+  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
+  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
+  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
+  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
+  q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
+  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
+  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
+  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
+  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
+  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
+  q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
+  tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
+  q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
+  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
+  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
+  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
+  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
+  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
+  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
+  q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
+  tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
+  q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
+  tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
+  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
+  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
+  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
+
+#pragma unroll 2
+  for(int i=0;i<2;i++)
+  {
+  q[i+16] =
+    (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
+    (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
+    (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
+    (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
+    (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
+    (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
+    (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
+    (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
+    (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
+    (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
+    (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
+    (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
+    (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
+    (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
+    (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
+    (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 4
+  for(int i=2;i<6;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 3
+  for(int i=6;i<9;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
+  }
+
+#pragma unroll 4
+  for(int i=9;i<13;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }
+
+#pragma unroll 3
+  for(int i=13;i<16;i++)
+  {
+  q[i+16] = CONST_EXP2 +
+    (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
+    SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
+  }
+
+  XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
+  XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+
+  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
+  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
+  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
+  BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
+  BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
+  BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
+  BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
+  BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
+
+  BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
+  BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
+  BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
+  BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
+  BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
+  BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
+  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
+  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
+
+  hash.h8[0] = SWAP8(BMW_H[8]);
+  hash.h8[1] = SWAP8(BMW_H[9]);
+  hash.h8[2] = SWAP8(BMW_H[10]);
+  hash.h8[3] = SWAP8(BMW_H[11]);
+  hash.h8[4] = SWAP8(BMW_H[12]);
+  hash.h8[5] = SWAP8(BMW_H[13]);
+  hash.h8[6] = SWAP8(BMW_H[14]);
+  hash.h8[7] = SWAP8(BMW_H[15]);
+  }
  // groestl
  {
-    sph_u64 H[16];
-    for (unsigned int u = 0; u < 15; u ++)
-      H[u] = 0;
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+  __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
+  __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
+#else
+  __local sph_u64 T0_C[256], T4_C[256];
+#endif
+  int init = get_local_id(0);
+  int step = get_local_size(0);
+
+  for (int i = init; i < 256; i += step)
+  {
+    T0_C[i] = T0[i];
+    T4_C[i] = T4[i];
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+    T1_C[i] = T1[i];
+    T2_C[i] = T2[i];
+    T3_C[i] = T3[i];
+    T5_C[i] = T5[i];
+    T6_C[i] = T6[i];
+    T7_C[i] = T7[i];
+#endif
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);    // groestl
+#define T0 T0_C
+#define T1 T1_C
+#define T2 T2_C
+#define T3 T3_C
+#define T4 T4_C
+#define T5 T5_C
+#define T6 T6_C
+#define T7 T7_C
+
+
+  sph_u64 H[16];
+//#pragma unroll 15
+  for (unsigned int u = 0; u < 15; u ++)
+    H[u] = 0;
 #if USE_LE
-    H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
+  H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
 #else
-    H[15] = (sph_u64)512;
+  H[15] = (sph_u64)512;
 #endif

-    sph_u64 g[16], m[16];
-    m[0] = DEC64E(hash.h8[0]);
-    m[1] = DEC64E(hash.h8[1]);
-    m[2] = DEC64E(hash.h8[2]);
-    m[3] = DEC64E(hash.h8[3]);
-    m[4] = DEC64E(hash.h8[4]);
-    m[5] = DEC64E(hash.h8[5]);
-    m[6] = DEC64E(hash.h8[6]);
-    m[7] = DEC64E(hash.h8[7]);
-    for (unsigned int u = 0; u < 16; u ++)
-      g[u] = m[u] ^ H[u];
-    m[8] = 0x80; g[8] = m[8] ^ H[8];
-    m[9] = 0; g[9] = m[9] ^ H[9];
-    m[10] = 0; g[10] = m[10] ^ H[10];
-    m[11] = 0; g[11] = m[11] ^ H[11];
-    m[12] = 0; g[12] = m[12] ^ H[12];
-    m[13] = 0; g[13] = m[13] ^ H[13];
-    m[14] = 0; g[14] = m[14] ^ H[14];
-    m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
-    PERM_BIG_P(g);
-    PERM_BIG_Q(m);
-    for (unsigned int u = 0; u < 16; u ++)
-      H[u] ^= g[u] ^ m[u];
-    sph_u64 xH[16];
-    for (unsigned int u = 0; u < 16; u ++)
-      xH[u] = H[u];
-    PERM_BIG_P(xH);
-    for (unsigned int u = 0; u < 16; u ++)
-      H[u] ^= xH[u];
-    for (unsigned int u = 0; u < 8; u ++)
-      hash.h8[u] = DEC64E(H[u + 8]);
+  sph_u64 g[16], m[16];
+  m[0] = DEC64E(hash.h8[0]);
+  m[1] = DEC64E(hash.h8[1]);
+  m[2] = DEC64E(hash.h8[2]);
+  m[3] = DEC64E(hash.h8[3]);
+  m[4] = DEC64E(hash.h8[4]);
+  m[5] = DEC64E(hash.h8[5]);
+  m[6] = DEC64E(hash.h8[6]);
+  m[7] = DEC64E(hash.h8[7]);
+
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    g[u] = m[u] ^ H[u];
+  m[8] = 0x80; g[8] = m[8] ^ H[8];
+  m[9] = 0; g[9] = m[9] ^ H[9];
+  m[10] = 0; g[10] = m[10] ^ H[10];
+  m[11] = 0; g[11] = m[11] ^ H[11];
+  m[12] = 0; g[12] = m[12] ^ H[12];
+  m[13] = 0; g[13] = m[13] ^ H[13];
+  m[14] = 0; g[14] = m[14] ^ H[14];
+  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
+  PERM_BIG_P(g);
+  PERM_BIG_Q(m);
+
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    H[u] ^= g[u] ^ m[u];
+  sph_u64 xH[16];
+
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    xH[u] = H[u];
+  PERM_BIG_P(xH);
+
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    H[u] ^= xH[u];
+
+//#pragma unroll 8
+  for (unsigned int u = 0; u < 8; u ++)
+    hash.h8[u] = DEC64E(H[u + 8]);
+    barrier(CLK_GLOBAL_MEM_FENCE);
  }

  // jh
@ -374,4 +634,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -374,4 +634,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    output[output[0xFF]++] = SWAP4(gid);
 }

-#endif // SIFCOIN_CL
+#endif // SIFCOIN_CL
--- a/kernel/talkcoin-mod.cl
+++ b/kernel/talkcoin-mod.cl
@ -68,10 +68,7 @@ typedef int sph_s32;
				@@ -68,10 +68,7 @@ typedef int sph_s32;
 #define SPH_JH_64 1
 #define SPH_KECCAK_64 1
 #define SPH_KECCAK_NOCOPY 0
-
-#ifndef SPH_COMPACT_BLAKE_64
-  #define SPH_COMPACT_BLAKE_64 0
-#endif
+#define SPH_COMPACT_BLAKE_64 0
 #ifndef SPH_KECCAK_UNROLL
  #define SPH_KECCAK_UNROLL   0
 #endif
@ -162,44 +159,48 @@ __kernel void search1(__global hash_t* hashes)
				@@ -162,44 +159,48 @@ __kernel void search1(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
-
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+  __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
+  __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
+#else
+  __local sph_u64 T0_C[256], T4_C[256];
+#endif
  int init = get_local_id(0);
  int step = get_local_size(0);

  for (int i = init; i < 256; i += step)
  {
-  T0_L[i] = T0[i];
-  T1_L[i] = T1[i];
-  T2_L[i] = T2[i];
-  T3_L[i] = T3[i];
-  T4_L[i] = T4[i];
-  T5_L[i] = T5[i];
-  T6_L[i] = T6[i];
-  T7_L[i] = T7[i];
+    T0_C[i] = T0[i];
+    T4_C[i] = T4[i];
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+    T1_C[i] = T1[i];
+    T2_C[i] = T2[i];
+    T3_C[i] = T3[i];
+    T5_C[i] = T5[i];
+    T6_C[i] = T6[i];
+    T7_C[i] = T7[i];
+#endif
  }
+  barrier(CLK_LOCAL_MEM_FENCE);    // groestl
+#define T0 T0_C
+#define T1 T1_C
+#define T2 T2_C
+#define T3 T3_C
+#define T4 T4_C
+#define T5 T5_C
+#define T6 T6_C
+#define T7 T7_C

-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  #define T0 T0_L
-  #define T1 T1_L
-  #define T2 T2_L
-  #define T3 T3_L
-  #define T4 T4_L
-  #define T5 T5_L
-  #define T6 T6_L
-  #define T7 T7_L

  sph_u64 H[16];
-
+//#pragma unroll 15
  for (unsigned int u = 0; u < 15; u ++)
-  H[u] = 0;
-
-  #if USE_LE
+    H[u] = 0;
+#if USE_LE
  H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
-  #else
+#else
  H[15] = (sph_u64)512;
-  #endif
+#endif

  sph_u64 g[16], m[16];
  m[0] = DEC64E(hash->h8[0]);
@ -211,9 +212,9 @@ __kernel void search1(__global hash_t* hashes)
				@@ -211,9 +212,9 @@ __kernel void search1(__global hash_t* hashes)
  m[6] = DEC64E(hash->h8[6]);
  m[7] = DEC64E(hash->h8[7]);

+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
-  g[u] = m[u] ^ H[u];
-
+    g[u] = m[u] ^ H[u];
  m[8] = 0x80; g[8] = m[8] ^ H[8];
  m[9] = 0; g[9] = m[9] ^ H[9];
  m[10] = 0; g[10] = m[10] ^ H[10];
@ -222,27 +223,28 @@ __kernel void search1(__global hash_t* hashes)
				@@ -222,27 +223,28 @@ __kernel void search1(__global hash_t* hashes)
  m[13] = 0; g[13] = m[13] ^ H[13];
  m[14] = 0; g[14] = m[14] ^ H[14];
  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
-
  PERM_BIG_P(g);
  PERM_BIG_Q(m);

+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
-  H[u] ^= g[u] ^ m[u];
-
+    H[u] ^= g[u] ^ m[u];
  sph_u64 xH[16];

+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
-  xH[u] = H[u];
-
+    xH[u] = H[u];
  PERM_BIG_P(xH);

+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
-  H[u] ^= xH[u];
+    H[u] ^= xH[u];

+//#pragma unroll 8
  for (unsigned int u = 0; u < 8; u ++)
-  hash->h8[u] = DEC64E(H[u + 8]);
+    hash->h8[u] = DEC64E(H[u + 8]);
+    barrier(CLK_GLOBAL_MEM_FENCE);

-  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 // jh
@ -398,4 +400,4 @@ __kernel void search4(__global hash_t* hashes, __global uint* output, const ulon
				@@ -398,4 +400,4 @@ __kernel void search4(__global hash_t* hashes, __global uint* output, const ulon
  output[atomic_inc(output+0xFF)] = SWAP4(gid);
 }

-#endif // TALKCOIN_MOD_CL
+#endif // TALKCOIN_MOD_CL
--- a/kernel/twecoin.cl
+++ b/kernel/twecoin.cl
@ -20,13 +20,13 @@ typedef long sph_s64;
				@@ -20,13 +20,13 @@ typedef long sph_s64;
 #define SPH_64_TRUE 1

 #define SPH_C32(x)    ((sph_u32)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_T32(x) (as_uint(x))
+#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
 #define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))

 #define SPH_C64(x)    ((sph_u64)(x ## UL))
-#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
+#define SPH_T64(x) (as_ulong(x))
+#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
 #define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))

 #define SPH_HAMSI_EXPAND_SMALL 1
@ -47,6 +47,14 @@ typedef long sph_s64;
				@@ -47,6 +47,14 @@ typedef long sph_s64;

 #define sph_bswap32(x) SWAP4(x)

+/* Plain shift helpers. The operand type decides whether SHR is a logical or
+ * an arithmetic shift, so pass unsigned values when a logical shift is meant. */
+#define SHL(x, n) ((x) << (n))
+#define SHR(x, n) ((x) >> (n))
+
+/* Sum over the 16-entry window q[i+0] .. q[i+15]:
+ *   - q[i+0], q[i+2], ... q[i+12] are added unchanged,
+ *   - q[i+1], q[i+3], ... q[i+13] are rotated left by 5, 11, 27, 32, 37, 43, 53,
+ *   - q[i+14] and q[i+15] are XORed with themselves shifted right by 1 and 2.
+ * The macro takes no arguments: it expands against whatever `q` and `i` are
+ * in scope at the point of use. The whole expansion is parenthesized so it
+ * stays a single operand whatever operator is written next to it. */
+#define CONST_EXP2  (q[i+0] + SPH_ROTL64(q[i+1], 5)  + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
+                     q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
+                     q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
+                     q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]))
+
+
 static void sph_enc32be(void *dst, sph_u32 val)
 {
 #if defined SPH_UPTR
@ -425,4 +433,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
				@@ -425,4 +433,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
    if (result)
      output[output[0xFF]++] = SWAP4(gid);
  }
-}
+}
--- a/kernel/whirlcoin.cl
+++ b/kernel/whirlcoin.cl
--- a/kernel/x14.cl
+++ b/kernel/x14.cl
@ -463,69 +463,92 @@ __kernel void search2(__global hash_t* hashes)
				@@ -463,69 +463,92 @@ __kernel void search2(__global hash_t* hashes)
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
-
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+  __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
+  __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
+#else
+  __local sph_u64 T0_C[256], T4_C[256];
+#endif
  int init = get_local_id(0);
  int step = get_local_size(0);

  for (int i = init; i < 256; i += step)
  {
-    T0_L[i] = T0[i];
-    T4_L[i] = T4[i];
-    T1_L[i] = T1[i];
-    T2_L[i] = T2[i];
-    T3_L[i] = T3[i];
-    T5_L[i] = T5[i];
-    T6_L[i] = T6[i];
-    T7_L[i] = T7[i];
+    T0_C[i] = T0[i];
+    T4_C[i] = T4[i];
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+    T1_C[i] = T1[i];
+    T2_C[i] = T2[i];
+    T3_C[i] = T3[i];
+    T5_C[i] = T5[i];
+    T6_C[i] = T6[i];
+    T7_C[i] = T7[i];
+#endif
  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  #define T0 T0_L
-  #define T1 T1_L
-  #define T2 T2_L
-  #define T3 T3_L
-  #define T4 T4_L
-  #define T5 T5_L
-  #define T6 T6_L
-  #define T7 T7_L
-
-  // groestl
-  sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
+  barrier(CLK_LOCAL_MEM_FENCE);    // groestl
+#define T0 T0_C
+#define T1 T1_C
+#define T2 T2_C
+#define T3 T3_C
+#define T4 T4_C
+#define T5 T5_C
+#define T6 T6_C
+#define T7 T7_C
+
+
+  sph_u64 H[16];
+//#pragma unroll 15
+  for (unsigned int u = 0; u < 15; u ++)
+    H[u] = 0;
+#if USE_LE
+  H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
+#else
+  H[15] = (sph_u64)512;
+#endif

  sph_u64 g[16], m[16];
-  g[0] = m[0] = DEC64E(hash->h8[0]);
-  g[1] = m[1] = DEC64E(hash->h8[1]);
-  g[2] = m[2] = DEC64E(hash->h8[2]);
-  g[3] = m[3] = DEC64E(hash->h8[3]);
-  g[4] = m[4] = DEC64E(hash->h8[4]);
-  g[5] = m[5] = DEC64E(hash->h8[5]);
-  g[6] = m[6] = DEC64E(hash->h8[6]);
-  g[7] = m[7] = DEC64E(hash->h8[7]);
-  g[8] = m[8] = 0x80;
-  g[9] = m[9] = 0;
-  g[10] = m[10] = 0;
-  g[11] = m[11] = 0;
-  g[12] = m[12] = 0;
-  g[13] = m[13] = 0;
-  g[14] = m[14] = 0;
-  g[15] = 0x102000000000000;
-  m[15] = 0x100000000000000;
-
+  m[0] = DEC64E(hash->h8[0]);
+  m[1] = DEC64E(hash->h8[1]);
+  m[2] = DEC64E(hash->h8[2]);
+  m[3] = DEC64E(hash->h8[3]);
+  m[4] = DEC64E(hash->h8[4]);
+  m[5] = DEC64E(hash->h8[5]);
+  m[6] = DEC64E(hash->h8[6]);
+  m[7] = DEC64E(hash->h8[7]);
+
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    g[u] = m[u] ^ H[u];
+  m[8] = 0x80; g[8] = m[8] ^ H[8];
+  m[9] = 0; g[9] = m[9] ^ H[9];
+  m[10] = 0; g[10] = m[10] ^ H[10];
+  m[11] = 0; g[11] = m[11] ^ H[11];
+  m[12] = 0; g[12] = m[12] ^ H[12];
+  m[13] = 0; g[13] = m[13] ^ H[13];
+  m[14] = 0; g[14] = m[14] ^ H[14];
+  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
  PERM_BIG_P(g);
  PERM_BIG_Q(m);

-  sph_u64 xH[16];
+//#pragma unroll 16
  for (unsigned int u = 0; u < 16; u ++)
-    xH[u] = H[u] ^= g[u] ^ m[u];
+    H[u] ^= g[u] ^ m[u];
+  sph_u64 xH[16];

+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    xH[u] = H[u];
  PERM_BIG_P(xH);

-  for (unsigned int u = 8; u < 16; u ++)
-    hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
+//#pragma unroll 16
+  for (unsigned int u = 0; u < 16; u ++)
+    H[u] ^= xH[u];
+
+//#pragma unroll 8
+  for (unsigned int u = 0; u < 8; u ++)
+    hash->h8[u] = DEC64E(H[u + 8]);
+    barrier(CLK_GLOBAL_MEM_FENCE);

-  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -846,7 +869,7 @@ __kernel void search8(__global hash_t* hashes)
				@@ -846,7 +869,7 @@ __kernel void search8(__global hash_t* hashes)
  sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
  sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;

-  sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
+  sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;

  rk00 = hash->h4[0];
  rk01 = hash->h4[1];
--- a/logging.c
+++ b/logging.c
@ -59,12 +59,20 @@ void applogsiz(int prio, int size, const char* fmt, ...)
				@@ -59,12 +59,20 @@ void applogsiz(int prio, int size, const char* fmt, ...)
 /* high-level logging function, based on global opt_log_level */
 void vapplogsiz(int prio, int size, const char* fmt, va_list args)
 {
+  /* Formats fmt/args into a zero-filled scratch buffer of size + 1 bytes
+   * (vsnprintf keeps at most size - 1 characters) and hands the result to
+   * _applog(). LOG_DEBUG messages are only forwarded when opt_debug is set. */
-  if (opt_debug || prio != LOG_DEBUG) {
+  if ((opt_debug || prio != LOG_DEBUG)) {
     char *tmp42 = (char *)calloc(size + 1, 1);
+
+    /* Allocation failed: drop the message rather than let vsnprintf()
+     * write through a NULL pointer. */
+    if (!tmp42) {
+      return;
+    }
+
     vsnprintf(tmp42, size, fmt, args);
     _applog(prio, tmp42, false);
     free(tmp42);
   }
+#ifdef DEV_DEBUG_MODE
+  /* Only reached for LOG_DEBUG messages while opt_debug is off: send them
+   * to __debug() instead of the regular log path. */
+  else if(prio == LOG_DEBUG) {
+    char *tmp42 = (char *)calloc(size + 1, 1);
+
+    /* Same out-of-memory guard as above. */
+    if (!tmp42) {
+      return;
+    }
+
+    vsnprintf(tmp42, size, fmt, args);
+    __debug("", tmp42);
+    free(tmp42);
+  }
+#endif
 }

 /*
@ -80,6 +88,13 @@ void _applog(int prio, const char *str, bool force)
				@@ -80,6 +88,13 @@ void _applog(int prio, const char *str, bool force)
  if (0) {}
 #endif
  else {
+
+#ifdef DEV_DEBUG_MODE
+    if(prio == LOG_DEBUG) {
+      __debug("", str);
+    }
+#endif
+
    bool write_console = opt_debug_console || (opt_verbose && prio != LOG_DEBUG) || prio <= opt_log_level;
    bool write_stderr = !isatty(fileno((FILE *)stderr));
    if (!(write_console || write_stderr))
--- a/miner.h
+++ b/miner.h
@ -266,6 +266,11 @@ DRIVER_PARSE_COMMANDS(DRIVER_PROTOTYPE)
				@@ -266,6 +266,11 @@ DRIVER_PARSE_COMMANDS(DRIVER_PROTOTYPE)
  #define strtobool(str) ((str && (!strcasecmp(str, "true") || !strcasecmp(str, "yes") || !strcasecmp(str, "1")))?true:false)
 #endif

+extern int opt_remoteconf_retry;
+extern int opt_remoteconf_wait;
+extern bool opt_remoteconf_usecache;
+
+
 enum alive {
  LIFE_WELL,
  LIFE_SICK,
@ -1025,6 +1030,7 @@ extern char *sgminer_path;
				@@ -1025,6 +1030,7 @@ extern char *sgminer_path;
 extern int opt_shares;
 extern bool opt_fail_only;
 extern int opt_fail_switch_delay;
+extern int opt_watchpool_refresh;
 extern bool opt_autofan;
 extern bool opt_autoengine;
 extern bool use_curses;
@ -1099,8 +1105,8 @@ extern pthread_cond_t restart_cond;
				@@ -1099,8 +1105,8 @@ extern pthread_cond_t restart_cond;

 extern void clear_stratum_shares(struct pool *pool);
 extern void clear_pool_work(struct pool *pool);
-extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2);
-extern void set_target_neoscrypt(unsigned char *target, double diff);
+extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2, const int thr_id);
+extern void set_target_neoscrypt(unsigned char *target, double diff, const int thr_id);

 extern void kill_work(void);

@ -1274,6 +1280,7 @@ struct pool {
				@@ -1274,6 +1280,7 @@ struct pool {
  bool remove_at_start;
  bool removed;
  bool lp_started;
+  bool backup;

  char *hdr_path;
  char *lp_url;
@ -1481,7 +1488,13 @@ extern void _wlogprint(const char *str);
				@@ -1481,7 +1488,13 @@ extern void _wlogprint(const char *str);
 extern int curses_int(const char *query);
 extern char *curses_input(const char *query);
 extern void kill_work(void);
-extern void switch_pools(struct pool *selected);
+
+/* Compatibility wrapper: existing call sites keep writing switch_pools(pool);
+ * the macro forwards to __switch_pools() with saveprio set to true.
+ * Uses the <stdbool.h> constant `true` to match the bool parameter (TRUE is
+ * not a standard C identifier), and parenthesizes the macro argument. */
+#ifndef switch_pools
+  #define switch_pools(p) __switch_pools((p), true)
+#endif
+extern void __switch_pools(struct pool *selected, bool saveprio);
+
 extern void discard_work(struct work *work);
 extern void remove_pool(struct pool *pool);
 //extern void write_config(FILE *fcfg);
--- a/ocl.c
+++ b/ocl.c
@ -299,8 +299,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
				@@ -299,8 +299,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
  char filename[255];
  char strbuf[32];

-  sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name));
+  sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile)?cgpu->algorithm.kernelfile:cgpu->algorithm.name));
  strcpy(filename, strbuf);
+
  applog(LOG_DEBUG, "Using source file %s", filename);

  /* For some reason 2 vectors is still better even if the card says
@ -347,54 +348,111 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
				@@ -347,54 +348,111 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg

  // neoscrypt calculates TC differently
  if (!safe_cmp(cgpu->algorithm.name, "neoscrypt")) {
-    int max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
-    size_t glob_thread_count = 1UL << max_int;
-
-    // if TC is entered by user, use that value... otherwise use default
-    cgpu->thread_concurrency = ((cgpu->opt_tc) ? cgpu->opt_tc : ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count));
+    size_t glob_thread_count;
+    long max_int;
+    unsigned char type = 0;

-    // if TC * scratchbuf size is too big for memory... reduce to max
-    if (((uint64_t)cgpu->thread_concurrency * NEOSCRYPT_SCRATCHBUF_SIZE) >(uint64_t)cgpu->max_alloc) {
-      /* Selected intensity will not run on this GPU. Not enough memory.
-      * Adapt the memory setting. */
-      glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
+    // determine which intensity type to use
+    // raw intensity is the same as TC so use either or setting...
+    if (cgpu->rawintensity > 0 || cgpu->opt_tc) {

-      /* Find highest significant bit in glob_thread_count, which gives
-      * the intensity. */
-      while (max_int && ((1U << max_int) & glob_thread_count) == 0) {
-        --max_int;
+      if (cgpu->opt_tc) {
+        glob_thread_count = cgpu->rawintensity = cgpu->opt_tc;
      }
-
-      /* Check if max_intensity is >0. */
-      if (max_int < MIN_INTENSITY) {
-        applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
-        max_int = MIN_INTENSITY;
+      else {
+        glob_thread_count = cgpu->rawintensity;
      }

-      cgpu->intensity = max_int;
-      cgpu->thread_concurrency = 1U << max_int;
+      max_int = glob_thread_count;
+      type = 2;
+    }
+    else if (cgpu->xintensity > 0) {
+      glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift)?(1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)):cgpu->xintensity);
+      max_int = cgpu->xintensity;
+      type = 1;
+    }
+    else {
+      glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity);
+      max_int = ((cgpu->dynamic)?MAX_INTENSITY:cgpu->intensity);
+    }
+
+    glob_thread_count = ((glob_thread_count < cgpu->work_size)?cgpu->work_size:glob_thread_count);
+
+    // if TC * scratchbuf size is too big for memory... reduce to max
+    if ((glob_thread_count * NEOSCRYPT_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) {
+
+      /* Selected intensity will not run on this GPU. Not enough memory.
+       * Adapt the memory setting. */
+      // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc
+      switch (type) {
+        //raw intensity
+        case 2:
+          while ((glob_thread_count * NEOSCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
+            --glob_thread_count;
+          }
+
+          max_int = glob_thread_count;
+          cgpu->rawintensity = glob_thread_count;
+          break;
+
+        //x intensity
+        case 1:
+          glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
+          max_int = glob_thread_count / clState->compute_shaders;
+
+          while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) {
+            --max_int;
+          }
+
+          /* Check if max_intensity is >0. */
+          if (max_int < MIN_XINTENSITY) {
+            applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu);
+            max_int = MIN_XINTENSITY;
+          }
+
+          cgpu->xintensity = max_int;
+          glob_thread_count = clState->compute_shaders * (1UL << max_int);
+          break;
+
+        default:
+          glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
+          while (max_int && ((1UL << max_int) & glob_thread_count) == 0) {
+            --max_int;
+          }
+
+          /* Check if max_intensity is >0. */
+          if (max_int < MIN_INTENSITY) {
+            applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
+            max_int = MIN_INTENSITY;
+          }
+
+          cgpu->intensity = max_int;
+          glob_thread_count = 1UL << max_int;
+          break;
+      }
    }

+    // TC is glob thread count
+    cgpu->thread_concurrency = glob_thread_count;
+
    applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));

-  }
-  else if (!cgpu->opt_tc) {
+  } else if (!cgpu->opt_tc) {
    unsigned int sixtyfours;

    sixtyfours =  cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1;
    cgpu->thread_concurrency = sixtyfours * 64;
    if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
      cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
+
      if (cgpu->thread_concurrency > cgpu->shaders * 5) {
        cgpu->thread_concurrency = cgpu->shaders * 5;
      }
    }
    applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency));
+  } else {
+      cgpu->thread_concurrency = cgpu->opt_tc;
  }
-  else {
-    cgpu->thread_concurrency = cgpu->opt_tc;
-  }
-

  cl_uint slot, cpnd;

@ -420,7 +478,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
				@@ -420,7 +478,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
  build_data->opencl_version = get_opencl_version(devices[gpu]);
  build_data->patch_bfi = needs_bfi_patch(build_data);

-  strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name));
+  strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile)?cgpu->algorithm.kernelfile:cgpu->algorithm.name));
  strcat(build_data->binary_filename, name);
  if (clState->goffset)
    strcat(build_data->binary_filename, "g");
@ -491,20 +549,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
				@@ -491,20 +549,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
      bufsize = NEOSCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency;

      /* This is the input buffer. For neoscrypt this is guaranteed to be
-      * 80 bytes only. */
+       * 80 bytes only. */
      readbufsize = 80;

      applog(LOG_DEBUG, "Neoscrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
-      // scrypt/n-scrypt
-    }
-    else {
+    // scrypt/n-scrypt
+    } else {
      size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0));
      bufsize = 128 * ipt * cgpu->thread_concurrency;
      applog(LOG_DEBUG, "Scrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
    }
-  }
-  else {
-    bufsize = (size_t)algorithm->rw_buffer_size;
+  } else {
+    bufsize = (size_t) algorithm->rw_buffer_size;
    applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
  }

@ -536,10 +592,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
				@@ -536,10 +592,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
    applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
    return NULL;
  }
-  
+
  applog(LOG_DEBUG, "Using output buffer sized %lu", BUFFERSIZE);
  clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
-
  if (status != CL_SUCCESS) {
    applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
    return NULL;
--- a/pool.c
+++ b/pool.c
@ -33,18 +33,21 @@
				@@ -33,18 +33,21 @@
 #include <string.h>

 char* get_pool_name(struct pool *pool) { /* Returns the name string for a pool: a fixed placeholder, the socket URL, or pool->name. */
-    if (opt_incognito) 
-	return "<pool>";
+  if (opt_incognito) { /* opt_incognito set: return a fixed placeholder instead of any pool field */
+    return "<pool>";
+  }

-    if (strcmp(pool->name, "") == 0)
-	return pool->sockaddr_url;
+  if (empty_string(pool->name)) { /* no usable name: fall back to the socket URL (empty_string() presumably also treats NULL as empty -- TODO confirm) */
+    return pool->sockaddr_url;
+  }

-    return pool->name;
+  return pool->name;
 }

 char* get_pool_user(struct pool *pool) { /* Returns the user string for a pool: a fixed placeholder or pool->rpc_user. */
-    if (opt_incognito)
-	return "<user>";
+  if (opt_incognito) { /* opt_incognito set: return a fixed placeholder instead of the RPC user */
+    return "<user>";
+  }

-    return pool->rpc_user;
+  return pool->rpc_user;
 }
--- a/sgminer.c
+++ b/sgminer.c
--- a/util.c
+++ b/util.c
@ -1444,9 +1444,9 @@ char *recv_line(struct pool *pool)
				@@ -1444,9 +1444,9 @@ char *recv_line(struct pool *pool)
  }

  buflen = strlen(pool->sockbuf);
-  tok = strtok(pool->sockbuf, "\n");
-  if (!tok) {
-    applog(LOG_DEBUG, "Failed to parse a \\n terminated string in recv_line");
+
+  if ((tok = strtok(pool->sockbuf, "\n")) == NULL) {
+    applog(LOG_DEBUG, "Failed to parse a \\n terminated string in recv_line: buffer = %s", pool->sockbuf);
    goto out;
  }
  sret = strdup(tok);
@ -1675,12 +1675,16 @@ static bool parse_diff(struct pool *pool, json_t *val)
				@@ -1675,12 +1675,16 @@ static bool parse_diff(struct pool *pool, json_t *val)
  if (old_diff != diff) {
    int idiff = diff;

-    if ((double)idiff == diff)
+    if ((double)idiff == diff) {
      applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %d", get_pool_name(pool), idiff);
-    else
+    }
+    else {
      applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %.3f", get_pool_name(pool), diff);
-  } else
+    }
+  }
+  else {
    applog(LOG_DEBUG, "%s difficulty set to %f", get_pool_name(pool), diff);
+  }

  return true;
 }
@ -1806,83 +1810,80 @@ bool parse_method(struct pool *pool, char *s)
				@@ -1806,83 +1810,80 @@ bool parse_method(struct pool *pool, char *s)
  bool ret = false;
  char *buf;

-  if (!s)
+  if (!s) {
    return ret;
+  }

-  val = JSON_LOADS(s, &err);
-  if (!val) {
+  if (!(val = JSON_LOADS(s, &err))) {
    applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
    return ret;
  }

-  method = json_object_get(val, "method");
-  if (!method) {
-    json_decref(val);
-    return ret;
+  if (!(method = json_object_get(val, "method"))) {
+    goto done;
  }
+
  err_val = json_object_get(val, "error");
  params = json_object_get(val, "params");

  if (err_val && !json_is_null(err_val)) {
    char *ss;

-    if (err_val)
+    if (err_val) {
      ss = json_dumps(err_val, JSON_INDENT(3));
-    else
+    }
+    else {
      ss = strdup("(unknown reason)");
+    }

    applog(LOG_INFO, "JSON-RPC method decode failed: %s", ss);

-    json_decref(val);
    free(ss);
-
-    return ret;
+    goto done;
  }

  buf = (char *)json_string_value(method);
  if (!buf) {
-    json_decref(val);
-    return ret;
+    goto done;
  }

  if (!strncasecmp(buf, "mining.notify", 13)) {
-    if (parse_notify(pool, params))
+    if (parse_notify(pool, params)) {
      pool->stratum_notify = ret = true;
-    else
+    }
+    else {
      pool->stratum_notify = ret = false;
-    json_decref(val);
-    return ret;
+    }
+
+    goto done;
  }

  if (!strncasecmp(buf, "mining.set_difficulty", 21) && parse_diff(pool, params)) {
    ret = true;
-    json_decref(val);
-    return ret;
+    goto done;
  }

  if (!strncasecmp(buf, "mining.set_extranonce", 21) && parse_extranonce(pool, params)) {
    ret = true;
-    json_decref(val);
-    return ret;
+    goto done;
  }

  if (!strncasecmp(buf, "client.reconnect", 16) && parse_reconnect(pool, params)) {
    ret = true;
-    json_decref(val);
-    return ret;
+    goto done;
  }

  if (!strncasecmp(buf, "client.get_version", 18) && send_version(pool, val)) {
    ret = true;
-    json_decref(val);
-    return ret;
+    goto done;
  }

  if (!strncasecmp(buf, "client.show_message", 19) && show_message(pool, params)) {
    ret = true;
-    json_decref(val);
-    return ret;
+    goto done;
  }
+
+done:
  json_decref(val);
  return ret;
 }
@ -1894,11 +1895,11 @@ bool subscribe_extranonce(struct pool *pool)
				@@ -1894,11 +1895,11 @@ bool subscribe_extranonce(struct pool *pool)
  json_error_t err;
  bool ret = false;

-  sprintf(s, "{\"id\": %d, \"method\": \"mining.extranonce.subscribe\", \"params\": []}",
-    swork_id++);
+  sprintf(s, "{\"id\": %d, \"method\": \"mining.extranonce.subscribe\", \"params\": []}", swork_id++);

-  if (!stratum_send(pool, s, strlen(s)))
+  if (!stratum_send(pool, s, strlen(s))) {
    return ret;
+  }

  /* Parse all data in the queue and anything left should be the response */
  while (42) {
@ -1910,12 +1911,15 @@ bool subscribe_extranonce(struct pool *pool)
				@@ -1910,12 +1911,15 @@ bool subscribe_extranonce(struct pool *pool)
    }

    sret = recv_line(pool);
-    if (!sret)
+    if (!sret) {
      return ret;
-    if (parse_method(pool, sret))
+    }
+    else if (parse_method(pool, sret)) {
      free(sret);
-    else
+    }
+    else {
      break;
+    }
  }

  val = JSON_LOADS(sret, &err);
@ -1968,18 +1972,23 @@ bool auth_stratum(struct pool *pool)
				@@ -1968,18 +1972,23 @@ bool auth_stratum(struct pool *pool)
  sprintf(s, "{\"id\": %d, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}",
    swork_id++, pool->rpc_user, pool->rpc_pass);

-  if (!stratum_send(pool, s, strlen(s)))
+  if (!stratum_send(pool, s, strlen(s))) {
    return ret;
+  }

  /* Parse all data in the queue and anything left should be auth */
  while (42) {
    sret = recv_line(pool);
-    if (!sret)
+
+    if (!sret) {
      return ret;
-    if (parse_method(pool, sret))
+    }
+    else if (parse_method(pool, sret)) {
      free(sret);
-    else
+    }
+    else {
      break;
+    }
  }

  val = JSON_LOADS(sret, &err);
--- a/winbuild/sgminer.vcxproj
+++ b/winbuild/sgminer.vcxproj
@ -1,4 +1,4 @@
				@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@ -267,6 +267,7 @@
				@@ -267,6 +267,7 @@
    <ClCompile Include="..\algorithm\talkcoin.c" />
    <ClCompile Include="..\algorithm\x14.c" />
    <ClCompile Include="..\algorithm\fresh.c" />
+    <ClCompile Include="..\algorithm\whirlcoin.c" />
    <ClCompile Include="..\api.c" />
    <ClCompile Include="..\ccan\opt\helpers.c" />
    <ClCompile Include="..\ccan\opt\opt.c" />
@ -327,6 +328,7 @@
				@@ -327,6 +328,7 @@
    <ClInclude Include="..\algorithm\talkcoin.h" />
    <ClInclude Include="..\algorithm\x14.h" />
    <ClInclude Include="..\algorithm\fresh.h" />
+    <ClInclude Include="..\algorithm\whirlcoin.h" />
    <ClInclude Include="..\api.h" />
    <ClInclude Include="..\arg-nonnull.h" />
    <ClInclude Include="..\bench_block.h" />
--- a/winbuild/sgminer.vcxproj.filters
+++ b/winbuild/sgminer.vcxproj.filters
@ -1,4 +1,4 @@
				@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <Filter Include="Source Files">
@ -185,6 +185,9 @@
				@@ -185,6 +185,9 @@
    <ClCompile Include="..\algorithm\fresh.c">
      <Filter>Source Files\algorithm</Filter>
    </ClCompile>
+    <ClCompile Include="..\algorithm\whirlcoin.c">
+      <Filter>Source Files\algorithm</Filter>
+    </ClCompile>
    <ClCompile Include="..\sph\shabal.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
@ -373,6 +376,9 @@
				@@ -373,6 +376,9 @@
    <ClInclude Include="..\algorithm\fresh.h">
      <Filter>Header Files\algorithm</Filter>
    </ClInclude>
+    <ClInclude Include="..\algorithm\whirlcoin.h">
+      <Filter>Header Files\algorithm</Filter>
+    </ClInclude>
    <ClInclude Include="..\sph\sph_whirlpool.h">
      <Filter>Header Files\sph</Filter>
    </ClInclude>