diff --git a/Makefile.am b/Makefile.am index ac2ecdad..c36ea1db 100644 --- a/Makefile.am +++ b/Makefile.am @@ -73,10 +73,12 @@ sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h sgminer_SOURCES += algorithm/whirlpoolx.c algorithm/whirlpoolx.h sgminer_SOURCES += algorithm/lyra2re.c algorithm/lyra2re.h algorithm/lyra2.c algorithm/lyra2.h algorithm/sponge.c algorithm/sponge.h -sgminer_SOURCES += algorithm/lyra2re_old.c algorithm/lyra2re_old.h +sgminer_SOURCES += algorithm/lyra2rev2.c algorithm/lyra2rev2.h sgminer_SOURCES += algorithm/pluck.c algorithm/pluck.h sgminer_SOURCES += algorithm/credits.c algorithm/credits.h sgminer_SOURCES += algorithm/yescrypt.h algorithm/yescrypt.c algorithm/yescrypt_core.h algorithm/yescrypt-opt.c algorithm/yescryptcommon.c algorithm/sysendian.h +sgminer_SOURCES += algorithm/blake256.c algorithm/blake256.h +sgminer_SOURCES += algorithm/blakecoin.c algorithm/blakecoin.h bin_SCRIPTS = $(top_srcdir)/kernel/*.cl diff --git a/algorithm.c b/algorithm.c index 6acab924..8d436e17 100644 --- a/algorithm.c +++ b/algorithm.c @@ -33,10 +33,12 @@ #include "algorithm/neoscrypt.h" #include "algorithm/whirlpoolx.h" #include "algorithm/lyra2re.h" -#include "algorithm/lyra2re_old.h" +#include "algorithm/lyra2rev2.h" #include "algorithm/pluck.h" #include "algorithm/yescrypt.h" #include "algorithm/credits.h" +#include "algorithm/blake256.h" +#include "algorithm/blakecoin.h" #include "compat.h" @@ -62,10 +64,13 @@ const char *algorithm_type_str[] = { "Neoscrypt", "WhirlpoolX", "Lyra2RE", - "Lyra2REv2" + "Lyra2REV2", - "Pluck" + "Pluck", "Yescrypt", - "Yescrypt-multi" + "Yescrypt-multi", + "Blakecoin", + "Blake", + "Vanilla" }; void sha256(const unsigned char *message, unsigned int len, unsigned char *digest) @@ -798,7 +803,7 @@ static cl_int queue_whirlpoolx_kernel(struct __clState *clState, struct _dev_blk return status; } -static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +static cl_int queue_lyra2re_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel; unsigned int num; @@ -842,7 +847,7 @@ static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ct return status; } -static cl_int queue_lyra2REv2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +static cl_int queue_lyra2rev2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel; unsigned int num; @@ -915,10 +920,38 @@ static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_un return status; } +static cl_int queue_blake_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel = &clState->kernel; + unsigned int num = 0; + cl_int status = 0; + cl_ulong le_target; + + le_target = *(cl_ulong *)(blk->work->device_target + 24); + flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); + + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(blk->work->blk.ctx_a); + CL_SET_ARG(blk->work->blk.ctx_b); + CL_SET_ARG(blk->work->blk.ctx_c); + CL_SET_ARG(blk->work->blk.ctx_d); + CL_SET_ARG(blk->work->blk.ctx_e); + CL_SET_ARG(blk->work->blk.ctx_f); + CL_SET_ARG(blk->work->blk.ctx_g); + CL_SET_ARG(blk->work->blk.ctx_h); + + CL_SET_ARG(blk->work->blk.cty_a); + 
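// Argument order mirrors the search() kernel (see kernel/blake256r14.cl below): ctx_a..ctx_h hold the blake midstate of the first 64-byte block, computed CPU-side by precalc_hash_blake256/precalc_hash_blakecoin, and cty_a..cty_c carry the last 12 bytes of the 80-byte header; the kernel derives the nonce itself from get_global_id(0). + 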
CL_SET_ARG(blk->work->blk.cty_b); + CL_SET_ARG(blk->work->blk.cty_c); + + return status; +} + static algorithm_settings_t algos[] = { // kernels starting from this will have difficulty calculated by using litecoin algorithm #define A_SCRYPT(a) \ - { a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options } + { a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, NULL, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options } A_SCRYPT("ckolivas"), A_SCRYPT("alexkarnew"), A_SCRYPT("alexkarnold"), @@ -929,36 +962,33 @@ static algorithm_settings_t algos[] = { #undef A_SCRYPT #define A_NEOSCRYPT(a) \ - { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options } + { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, NULL, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options } A_NEOSCRYPT("neoscrypt"), #undef A_NEOSCRYPT #define A_PLUCK(a) \ - { a, ALGO_PLUCK, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, pluck_regenhash, queue_pluck_kernel, gen_hash, append_neoscrypt_compiler_options } + { a, ALGO_PLUCK, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, pluck_regenhash, NULL, queue_pluck_kernel, gen_hash, append_neoscrypt_compiler_options } A_PLUCK("pluck"), #undef A_PLUCK #define A_CREDITS(a) \ - { a, ALGO_CRE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, credits_regenhash, queue_credits_kernel, gen_hash, NULL} + { a, ALGO_CRE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, credits_regenhash, NULL, queue_credits_kernel, gen_hash, NULL} A_CREDITS("credits"), #undef A_CREDITS - - #define A_YESCRYPT(a) \ - { a, ALGO_YESCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, queue_yescrypt_kernel, gen_hash, append_neoscrypt_compiler_options} + { a, ALGO_YESCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, NULL, queue_yescrypt_kernel, gen_hash, append_neoscrypt_compiler_options} A_YESCRYPT("yescrypt"), #undef A_YESCRYPT #define A_YESCRYPT_MULTI(a) \ - { a, ALGO_YESCRYPT_MULTI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 6,-1,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , yescrypt_regenhash, queue_yescrypt_multikernel, gen_hash, append_neoscrypt_compiler_options} + { a, ALGO_YESCRYPT_MULTI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 6,-1,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , yescrypt_regenhash, NULL, queue_yescrypt_multikernel, gen_hash, append_neoscrypt_compiler_options} A_YESCRYPT_MULTI("yescrypt-multi"), #undef A_YESCRYPT_MULTI - // kernels starting from this will have difficulty calculated by using quarkcoin algorithm #define A_QUARK(a, b) \ - { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 
0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } + { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options } A_QUARK("quarkcoin", quarkcoin_regenhash), A_QUARK("qubitcoin", qubitcoin_regenhash), A_QUARK("animecoin", animecoin_regenhash), @@ -967,46 +997,48 @@ static algorithm_settings_t algos[] = { // kernels starting from this will have difficulty calculated by using bitcoin algorithm #define A_DARK(a, b) \ - { a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } + { a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options } A_DARK("darkcoin", darkcoin_regenhash), A_DARK("inkcoin", inkcoin_regenhash), A_DARK("myriadcoin-groestl", myriadcoin_groestl_regenhash), #undef A_DARK - { "twecoin", ALGO_TWE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, twecoin_regenhash, queue_sph_kernel, sha256, NULL }, - { "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL }, - - { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, + { "twecoin", ALGO_TWE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, twecoin_regenhash, NULL, queue_sph_kernel, sha256, NULL }, + { "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, NULL, queue_maxcoin_kernel, sha256, NULL }, - { "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options }, - { "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options }, - { "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options }, + { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, NULL, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, - { "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options }, - { "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options }, + { "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, NULL, queue_sph_kernel, gen_hash, append_x13_compiler_options }, + { "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, NULL, 
queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options }, + { "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, NULL, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options }, - { "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_x13_compiler_options }, - { "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options }, + { "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, NULL, queue_x14_kernel, gen_hash, append_x13_compiler_options }, + { "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, NULL, queue_x14_old_kernel, gen_hash, append_x13_compiler_options }, - { "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, + { "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, NULL, queue_bitblock_kernel, gen_hash, append_x13_compiler_options }, + { "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, NULL, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options }, - { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL }, + { "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, NULL, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, - { "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2reold_regenhash, queue_lyra2RE_kernel, gen_hash, NULL }, - - { "lyra2rev2", ALGO_LYRA2REv2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2re_regenhash, queue_lyra2REv2_kernel, gen_hash, append_neoscrypt_compiler_options }, + { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, NULL, queue_fresh_kernel, gen_hash, NULL }, + { "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2re_regenhash, precalc_hash_blake256, queue_lyra2re_kernel, gen_hash, NULL }, + { "lyra2rev2", ALGO_LYRA2REV2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2rev2_regenhash, precalc_hash_blake256, queue_lyra2rev2_kernel, gen_hash, append_neoscrypt_compiler_options }, // kernels starting from this will have difficulty calculated by using fuguecoin algorithm #define A_FUGUE(a, b, c) \ - { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, c, NULL } + { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, queue_sph_kernel, c, NULL } A_FUGUE("fuguecoin", fuguecoin_regenhash, sha256), A_FUGUE("groestlcoin", groestlcoin_regenhash, sha256), A_FUGUE("diamond", groestlcoin_regenhash, gen_hash), #undef A_FUGUE - { 
"whirlcoin", ALGO_WHIRL, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 3, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, whirlcoin_regenhash, queue_whirlcoin_kernel, sha256, NULL }, - { "whirlpoolx", ALGO_WHIRLPOOLX, "", 1, 1, 1, 0, 0, 0xFFU, 0xFFFFULL, 0x0000FFFFUL, 0, 0, 0, whirlpoolx_regenhash, queue_whirlpoolx_kernel, gen_hash, NULL }, + { "whirlcoin", ALGO_WHIRL, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 3, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, whirlcoin_regenhash, NULL, queue_whirlcoin_kernel, sha256, NULL }, + { "whirlpoolx", ALGO_WHIRLPOOLX, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000FFFFUL, 0, 0, 0, whirlpoolx_regenhash, NULL, queue_whirlpoolx_kernel, gen_hash, NULL }, + + { "blake256r8", ALGO_BLAKECOIN, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x000000ffUL, 0, 128, 0, blakecoin_regenhash, precalc_hash_blakecoin, queue_blake_kernel, sha256, NULL }, + { "blake256r14", ALGO_BLAKE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x00000000UL, 0, 128, 0, blake256_regenhash, precalc_hash_blake256, queue_blake_kernel, gen_hash, NULL }, + { "vanilla", ALGO_VANILLA, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x000000ffUL, 0, 128, 0, blakecoin_regenhash, precalc_hash_blakecoin, queue_blake_kernel, gen_hash, NULL }, // Terminator (do not remove) { NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL } @@ -1019,7 +1051,7 @@ void copy_algorithm_settings(algorithm_t* dest, const char* algo) // Find algorithm settings and copy for (src = algos; src->name; src++) { - if (strcmp(src->name, algo) == 0) + if (strcasecmp(src->name, algo) == 0) { strcpy(dest->name, src->name); dest->kernelfile = src->kernelfile; @@ -1037,6 +1069,7 @@ void copy_algorithm_settings(algorithm_t* dest, const char* algo) dest->rw_buffer_size = src->rw_buffer_size; dest->cq_properties = src->cq_properties; dest->regenhash = src->regenhash; + dest->precalc_hash = src->precalc_hash; dest->queue_kernel = src->queue_kernel; dest->gen_hash = src->gen_hash; dest->set_compile_options = src->set_compile_options; @@ -1081,6 +1114,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa ALGO_ALIAS("whirlpool", "whirlcoin"); ALGO_ALIAS("lyra2", "lyra2re"); ALGO_ALIAS("lyra2v2", "lyra2rev2"); + ALGO_ALIAS("blakecoin", "blake256r8"); + ALGO_ALIAS("blake", "blake256r14"); #undef ALGO_ALIAS #undef ALGO_ALIAS_NF diff --git a/algorithm.h b/algorithm.h index 8b7185a4..afbd2f7a 100644 --- a/algorithm.h +++ b/algorithm.h @@ -30,10 +30,13 @@ typedef enum { ALGO_NEOSCRYPT, ALGO_WHIRLPOOLX, ALGO_LYRA2RE, - ALGO_LYRA2REv2, + ALGO_LYRA2REV2, ALGO_PLUCK, ALGO_YESCRYPT, ALGO_YESCRYPT_MULTI, + ALGO_BLAKECOIN, + ALGO_BLAKE, + ALGO_VANILLA } algorithm_type_t; extern const char *algorithm_type_str[]; @@ -67,6 +70,7 @@ typedef struct _algorithm_t { long rw_buffer_size; cl_command_queue_properties cq_properties; void(*regenhash)(struct work *); + void(*precalc_hash)(struct _dev_blk_ctx *, uint32_t *, uint32_t *); cl_int(*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint); void(*gen_hash)(const unsigned char *, unsigned int, unsigned char *); void(*set_compile_options)(struct _build_kernel_data *, struct cgpu_info *, struct _algorithm_t *); @@ -89,6 +93,7 @@ typedef struct _algorithm_settings_t long rw_buffer_size; cl_command_queue_properties cq_properties; void (*regenhash)(struct work *); + void (*precalc_hash)(struct _dev_blk_ctx *, uint32_t *, uint32_t *); cl_int (*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint); void (*gen_hash)(const unsigned char *, 
unsigned int, unsigned char *); void (*set_compile_options)(build_kernel_data *, struct cgpu_info *, algorithm_t *); diff --git a/algorithm/blake256.c b/algorithm/blake256.c new file mode 100644 index 00000000..ddd86122 --- /dev/null +++ b/algorithm/blake256.c @@ -0,0 +1,163 @@ +/* + * BLAKE implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + * + * Modified for more speed by BlueDragon747 for the Blakecoin project + */ + +#include +#include +#include +#include + +#include "sph/sph_blake.h" +#include "algorithm/blake256.h" + +/* +* Encode a length len/4 vector of (uint32_t) into a length len vector of +* (unsigned char) in big-endian form. Assumes len is a multiple of 4. 
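+* (As used in this file, both src and dst are in fact uint32_t arrays and len counts 32-bit words; each word is byte-swapped to big-endian with htobe32.)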
+*/ +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + +static const uint32_t diff1targ_blake256 = 0x000000ff; + +inline void blake256hash(void *state, const void *input) +{ + sph_blake256_context ctx_blake; + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, state); +} + +void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *pdata) +{ + sph_blake256_context ctx_blake; + uint32_t data[16]; + + be32enc_vect(data, (const uint32_t *)pdata, 16); + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, data, 64); + + blk->ctx_a = ctx_blake.H[0]; + blk->ctx_b = ctx_blake.H[1]; + blk->ctx_c = ctx_blake.H[2]; + blk->ctx_d = ctx_blake.H[3]; + blk->ctx_e = ctx_blake.H[4]; + blk->ctx_f = ctx_blake.H[5]; + blk->ctx_g = ctx_blake.H[6]; + blk->ctx_h = ctx_blake.H[7]; + + blk->cty_a = pdata[16]; + blk->cty_b = pdata[17]; + blk->cty_c = pdata[18]; +} + + +static const uint32_t diff1targ = 0x0000ffff; + +/* Used externally as confirmation of correct OCL code */ +int blake256_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + blake256hash(ohash, data); + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + if (tmp_hash7 > diff1targ) + return -1; + if (tmp_hash7 > Htarg) + return 0; + return 1; +} + +void blake256_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + blake256hash(ohash, data); +} + +bool scanhash_blake256(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while(1) { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + blake256hash(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", + (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) { + *last_nonce = n; + break; + } + } + + return ret; +} diff --git a/algorithm/blake256.h b/algorithm/blake256.h new file mode 100644 index 00000000..f021585d --- /dev/null +++ b/algorithm/blake256.h @@ -0,0 +1,10 @@ +#ifndef BLAKE256_H +#define BLAKE256_H + +#include "miner.h" + +extern int blake256_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +extern void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *pdata); +extern void blake256_regenhash(struct work *work); + +#endif /* BLAKE256_H */ \ No newline at end of file diff --git a/algorithm/blakecoin.c b/algorithm/blakecoin.c new file mode 
100644 index 00000000..6a118cef --- /dev/null +++ b/algorithm/blakecoin.c @@ -0,0 +1,163 @@ +/* + * BLAKE implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + * + * Modified for more speed by BlueDragon747 for the Blakecoin project + */ + +#include +#include +#include +#include + +#include "sph/sph_blake.h" +#include "algorithm/blakecoin.h" + +/* +* Encode a length len/4 vector of (uint32_t) into a length len vector of +* (unsigned char) in big-endian form. Assumes len is a multiple of 4. 
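+* (As in blake256.c: src and dst are uint32_t arrays, len counts 32-bit words, each byte-swapped via htobe32.)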
+*/ +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + +static const uint32_t diff1targ_blake256 = 0x000000ff; + +inline void blakecoinhash(void *state, const void *input) +{ + sph_blake256_context ctx_blake; + sph_blake256_init(&ctx_blake); + sph_blake256r8(&ctx_blake, input, 80); + sph_blake256r8_close(&ctx_blake, state); +} + +void precalc_hash_blakecoin(dev_blk_ctx *blk, uint32_t *state, uint32_t *pdata) +{ + sph_blake256_context ctx_blake; + uint32_t data[16]; + + be32enc_vect(data, (const uint32_t *)pdata, 16); + + sph_blake256_init(&ctx_blake); + sph_blake256r8(&ctx_blake, data, 64); + + blk->ctx_a = ctx_blake.H[0]; + blk->ctx_b = ctx_blake.H[1]; + blk->ctx_c = ctx_blake.H[2]; + blk->ctx_d = ctx_blake.H[3]; + blk->ctx_e = ctx_blake.H[4]; + blk->ctx_f = ctx_blake.H[5]; + blk->ctx_g = ctx_blake.H[6]; + blk->ctx_h = ctx_blake.H[7]; + + blk->cty_a = pdata[16]; + blk->cty_b = pdata[17]; + blk->cty_c = pdata[18]; +} + + +static const uint32_t diff1targ = 0x0000ffff; + +/* Used externally as confirmation of correct OCL code */ +int blakecoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + blakecoinhash(ohash, data); + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + if (tmp_hash7 > diff1targ) + return -1; + if (tmp_hash7 > Htarg) + return 0; + return 1; +} + +void blakecoin_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + blakecoinhash(ohash, data); +} + +bool scanhash_blakecoin(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while(1) { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + blakecoinhash(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", + (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) { + *last_nonce = n; + break; + } + } + + return ret; +} diff --git a/algorithm/blakecoin.h b/algorithm/blakecoin.h new file mode 100644 index 00000000..dcaeff71 --- /dev/null +++ b/algorithm/blakecoin.h @@ -0,0 +1,10 @@ +#ifndef BLAKECOIN_H +#define BLAKECOIN_H + +#include "miner.h" + +extern int blakecoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +extern void precalc_hash_blakecoin(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); +extern void blakecoin_regenhash(struct work *work); + +#endif /* BLAKECOIN_H */ \ No newline at end of file diff --git a/algorithm/lyra2.c b/algorithm/lyra2.c 
index 42640e76..865d8e17 100644 --- a/algorithm/lyra2.c +++ b/algorithm/lyra2.c @@ -61,16 +61,18 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); - uint64_t *wholeMatrix = malloc(i); + uint64_t *wholeMatrix = (uint64_t*)malloc(i); if (wholeMatrix == NULL) { return -1; } memset(wholeMatrix, 0, i); //Allocates pointers to each row of the matrix - uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*)); + uint64_t **memMatrix = (uint64_t**)malloc(nRows * sizeof (uint64_t*)); if (memMatrix == NULL) { return -1; } @@ -122,7 +124,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //======================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t *state = malloc(16 * sizeof (uint64_t)); + uint64_t *state = (uint64_t*)malloc(16 * sizeof (uint64_t)); if (state == NULL) { return -1; } @@ -134,7 +136,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * ptrWord = wholeMatrix; for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) } //Initializes M[0] and M[1] @@ -196,7 +198,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * absorbBlock(state, memMatrix[rowa]); //Squeezes the key - squeeze(state, K, kLen); + squeeze(state, (unsigned char*)K, kLen); //==========================================================================/ //========================= Freeing the memory =============================// diff --git a/algorithm/lyra2re.c b/algorithm/lyra2re.c index cfc5adbb..61f2b34f 100644 --- a/algorithm/lyra2re.c +++ b/algorithm/lyra2re.c @@ -36,8 +36,6 @@ #include "sph/sph_groestl.h" #include "sph/sph_skein.h" #include "sph/sph_keccak.h" -#include "sph/sph_bmw.h" -#include "sph/sph_cubehash.h" #include "lyra2.h" /* @@ -57,10 +55,9 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) inline void lyra2rehash(void *state, const void *input) { sph_blake256_context ctx_blake; - sph_bmw256_context ctx_bmw; + sph_groestl256_context ctx_groestl; sph_keccak256_context ctx_keccak; sph_skein256_context ctx_skein; - sph_cubehash256_context ctx_cube; uint32_t hashA[8], hashB[8]; @@ -72,23 +69,17 @@ inline void lyra2rehash(void *state, const void *input) sph_keccak256 (&ctx_keccak,hashA, 32); sph_keccak256_close(&ctx_keccak, hashB); - sph_cubehash256_init(&ctx_cube); - sph_cubehash256(&ctx_cube, hashB, 32); - sph_cubehash256_close(&ctx_cube, hashA); + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); - LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); sph_skein256_init(&ctx_skein); - sph_skein256 (&ctx_skein, hashB, 32); - sph_skein256_close(&ctx_skein, hashA); + sph_skein256 (&ctx_skein, hashA, 32); + sph_skein256_close(&ctx_skein, hashB); - sph_cubehash256_init(&ctx_cube); - sph_cubehash256(&ctx_cube, hashA, 32); - sph_cubehash256_close(&ctx_cube, hashB); - sph_bmw256_init(&ctx_bmw); - 
sph_bmw256 (&ctx_bmw, hashB, 32); - sph_bmw256_close(&ctx_bmw, hashA); + sph_groestl256_init(&ctx_groestl); + sph_groestl256 (&ctx_groestl, hashB, 32); + sph_groestl256_close(&ctx_groestl, hashA); memcpy(state, hashA, 32); } diff --git a/algorithm/lyra2re.h b/algorithm/lyra2re.h index 8bc52ac4..8a58e747 100644 --- a/algorithm/lyra2re.h +++ b/algorithm/lyra2re.h @@ -2,8 +2,6 @@ #define LYRA2RE_H #include "miner.h" -#define LYRA_SCRATCHBUF_SIZE (1536) // matrix size [12][4][4] uint64_t or equivalent -#define LYRA_SECBUF_SIZE (4) // (not used) extern int lyra2re_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); diff --git a/algorithm/lyra2re_old.h b/algorithm/lyra2re_old.h deleted file mode 100644 index 0788dfb3..00000000 --- a/algorithm/lyra2re_old.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef LYRA2REOLD_H -#define LYRA2REOLD_H - -#include "miner.h" - -extern int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, - uint32_t nonce); -extern void lyra2reold_regenhash(struct work *work); - -#endif /* LYRA2RE_H */ diff --git a/algorithm/lyra2re_old.c b/algorithm/lyra2rev2.c similarity index 77% rename from algorithm/lyra2re_old.c rename to algorithm/lyra2rev2.c index 31a0a1ab..aea0082a 100644 --- a/algorithm/lyra2re_old.c +++ b/algorithm/lyra2rev2.c @@ -36,6 +36,8 @@ #include "sph/sph_groestl.h" #include "sph/sph_skein.h" #include "sph/sph_keccak.h" +#include "sph/sph_bmw.h" +#include "sph/sph_cubehash.h" #include "lyra2.h" /* @@ -52,13 +54,13 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) } -inline void lyra2rehash_old(void *state, const void *input) +inline void lyra2rev2hash(void *state, const void *input) { sph_blake256_context ctx_blake; - sph_groestl256_context ctx_groestl; + sph_bmw256_context ctx_bmw; sph_keccak256_context ctx_keccak; sph_skein256_context ctx_skein; - + sph_cubehash256_context ctx_cube; uint32_t hashA[8], hashB[8]; sph_blake256_init(&ctx_blake); @@ -69,32 +71,41 @@ inline void lyra2rehash_old(void *state, const void *input) sph_keccak256 (&ctx_keccak,hashA, 32); sph_keccak256_close(&ctx_keccak, hashB); - LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + + LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_skein256_init(&ctx_skein); + sph_skein256 (&ctx_skein, hashB, 32); + sph_skein256_close(&ctx_skein, hashA); - sph_skein256_init(&ctx_skein); - sph_skein256 (&ctx_skein, hashA, 32); - sph_skein256_close(&ctx_skein, hashB); + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); + sph_bmw256_init(&ctx_bmw); + sph_bmw256 (&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); - sph_groestl256_init(&ctx_groestl); - sph_groestl256 (&ctx_groestl, hashB, 32); - sph_groestl256_close(&ctx_groestl, hashA); +//printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]); - memcpy(state, hashA, 32); + memcpy(state, hashA, 32); } static const uint32_t diff1targ = 0x0000ffff; /* Used externally as confirmation of correct OCL code */ -int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +int lyra2rev2_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) { uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); uint32_t data[20], ohash[8]; be32enc_vect(data, (const uint32_t *)pdata, 19); data[19] = htobe32(nonce); - lyra2rehash_old(ohash, data); + 
lyra2rev2hash(ohash, data); tmp_hash7 = be32toh(ohash[7]); applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", @@ -108,7 +119,7 @@ int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t return 1; } -void lyra2reold_regenhash(struct work *work) +void lyra2rev2_regenhash(struct work *work) { uint32_t data[20]; uint32_t *nonce = (uint32_t *)(work->data + 76); @@ -116,10 +127,10 @@ void lyra2reold_regenhash(struct work *work) be32enc_vect(data, (const uint32_t *)work->data, 19); data[19] = htobe32(*nonce); - lyra2rehash_old(ohash, data); + lyra2rev2hash(ohash, data); } -bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, +bool scanhash_lyra2rev2(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, unsigned char *pdata, unsigned char __maybe_unused *phash1, unsigned char __maybe_unused *phash, const unsigned char *ptarget, uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) @@ -137,7 +148,7 @@ bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unuse *nonce = ++n; data[19] = (n); - lyra2rehash_old(ostate, data); + lyra2rev2hash(ostate, data); tmp_hash7 = (ostate[7]); applog(LOG_INFO, "data7 %08lx", diff --git a/algorithm/lyra2rev2.h b/algorithm/lyra2rev2.h new file mode 100644 index 00000000..1a31f76f --- /dev/null +++ b/algorithm/lyra2rev2.h @@ -0,0 +1,11 @@ +#ifndef LYRA2REV2_H +#define LYRA2REV2_H + +#include "miner.h" +#define LYRA_SCRATCHBUF_SIZE (1536) // matrix size [12][4][4] uint64_t or equivalent +#define LYRA_SECBUF_SIZE (4) // (not used) +extern int lyra2rev2_test(unsigned char *pdata, const unsigned char *ptarget, + uint32_t nonce); +extern void lyra2rev2_regenhash(struct work *work); + +#endif /* LYRA2REV2_H */ diff --git a/algorithm/yescrypt-opt.c b/algorithm/yescrypt-opt.c index b54be469..6adef7e7 100644 --- a/algorithm/yescrypt-opt.c +++ b/algorithm/yescrypt-opt.c @@ -99,7 +99,7 @@ alloc_region(yescrypt_region_t * region, size_t size) if (size + 63 < size) { errno = ENOMEM; } - else if ((base = malloc(size + 63)) != NULL) { + else if ((base = (uint8_t *)malloc(size + 63)) != NULL) { aligned = base + 63; aligned -= (uintptr_t)aligned & 63; } @@ -520,7 +520,7 @@ smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, uint64_t * XY, uint64_t * S) { void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = (S ? blockmix_pwxform : blockmix_salsa8); - const uint64_t * VROM = shared->shared1.aligned; + const uint64_t * VROM = (uint64_t *)shared->shared1.aligned; uint32_t VROM_mask = shared->mask1; size_t s = 16 * r; uint64_t * X = V; @@ -671,7 +671,7 @@ smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = (S ? blockmix_pwxform : blockmix_salsa8); - const uint64_t * VROM = shared->shared1.aligned; + const uint64_t * VROM = (uint64_t *)shared->shared1.aligned; uint32_t VROM_mask = shared->mask1 | 1; size_t s = 16 * r; yescrypt_flags_t rw = flags & YESCRYPT_RW; @@ -835,7 +835,7 @@ smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, uint64_t * Sp = S ? 
&S[i * S_SIZE_ALL] : S; if (Sp) - smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM,Sp, NROM, shared, XYp, NULL); + smix1(Bp, 1, S_SIZE_ALL / 16, (yescrypt_flags_t)flags & ~YESCRYPT_PWXFORM,Sp, NROM, shared, XYp, NULL); diff --git a/api.c b/api.c index 08701b8f..1efa3004 100644 --- a/api.c +++ b/api.c @@ -1334,7 +1334,7 @@ static void apiversion(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __m io_open = io_add(io_data, isjson ? COMSTR JSON_VERSION : _VERSION COMSTR); root = api_add_string(root, "Miner", PACKAGE " " VERSION, false); - root = api_add_string(root, "CGMiner", CGMINER_VERSION, false); + root = api_add_string(root, "SGMiner", CGMINER_VERSION, false); root = api_add_const(root, "API", APIVERSION, false); root = print_data(root, buf, isjson, false); diff --git a/configure.ac b/configure.ac index 681c2b9c..659ad843 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_maj], [5]) -m4_define([v_min], [2]) +m4_define([v_min], [3]) m4_define([v_mic], [0]) m4_define([v_rev], [nicehash]) ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## diff --git a/driver-opencl.c b/driver-opencl.c index 0e45e555..0e368782 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1366,13 +1366,8 @@ static bool opencl_thread_init(struct thr_info *thr) static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work) { - if (work->pool->algorithm.type == ALGO_LYRA2RE || work->pool->algorithm.type == ALGO_LYRA2REv2) { - work->blk.work = work; - precalc_hash_blake256(&work->blk, 0, (uint32_t *)(work->data)); - } - else { - work->blk.work = work; - } + work->blk.work = work; + if (work->pool->algorithm.precalc_hash) work->pool->algorithm.precalc_hash(&work->blk, 0, (uint32_t *)(work->data)); thr->pool_no = work->pool->pool_no; return true; } diff --git a/findnonce.c b/findnonce.c index 8858cfa6..72dcaaef 100644 --- a/findnonce.c +++ b/findnonce.c @@ -234,136 +234,3 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res) free(pcd); } } - -// BLAKE 256 14 rounds (standard) - -typedef struct -{ - uint32_t h[8]; - uint32_t t; -} blake_state256; - -#define NB_ROUNDS32 14 - -const uint8_t blake_sigma[][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; - -const uint32_t blake_u256[16] = -{ - 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, - 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, - 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c, - 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 
0xb5470917 -}; - -#define ROT32(x,n) (((x)<<(32-n))|( (x)>>(n))) -//#define ROT32(x,n) (rotate((uint)x, (uint)32-n)) -#define ADD32(x,y) ((uint32_t)((x) + (y))) -#define XOR32(x,y) ((uint32_t)((x) ^ (y))) - -#define G(a,b,c,d,i) \ -do { \ - v[a] += XOR32(m[blake_sigma[r][i]], blake_u256[blake_sigma[r][i + 1]]) + v[b]; \ - v[d] = ROT32(XOR32(v[d], v[a]), 16); \ - v[c] += v[d]; \ - v[b] = ROT32(XOR32(v[b], v[c]), 12); \ - v[a] += XOR32(m[blake_sigma[r][i + 1]], blake_u256[blake_sigma[r][i]]) + v[b]; \ - v[d] = ROT32(XOR32(v[d], v[a]), 8); \ - v[c] += v[d]; \ - v[b] = ROT32(XOR32(v[b], v[c]), 7); \ -} while (0) - -// compress a block -void blake256_compress_block(blake_state256 *S, uint32_t *m) -{ - uint32_t v[16]; - int i, r; - for (i = 0; i < 8; ++i) v[i] = S->h[i]; - - v[8] = blake_u256[0]; - v[9] = blake_u256[1]; - v[10] = blake_u256[2]; - v[11] = blake_u256[3]; - v[12] = blake_u256[4]; - v[13] = blake_u256[5]; - v[14] = blake_u256[6]; - v[15] = blake_u256[7]; - - v[12] ^= S->t; - v[13] ^= S->t; - - for (r = 0; r < NB_ROUNDS32; ++r) - { - /* column step */ - G(0, 4, 8, 12, 0); - G(1, 5, 9, 13, 2); - G(2, 6, 10, 14, 4); - G(3, 7, 11, 15, 6); - /* diagonal step */ - G(0, 5, 10, 15, 8); - G(1, 6, 11, 12, 10); - G(2, 7, 8, 13, 12); - G(3, 4, 9, 14, 14); - } - - for (i = 0; i < 16; ++i) S->h[i & 7] ^= v[i]; -} - -void blake256_init(blake_state256 *S) -{ - S->h[0] = 0x6a09e667; - S->h[1] = 0xbb67ae85; - S->h[2] = 0x3c6ef372; - S->h[3] = 0xa54ff53a; - S->h[4] = 0x510e527f; - S->h[5] = 0x9b05688c; - S->h[6] = 0x1f83d9ab; - S->h[7] = 0x5be0cd19; - S->t = 0; -} - -void blake256_update(blake_state256 *S, const uint32_t *in) -{ - uint32_t m[16]; - int i; - S->t = 512; - for (i = 0; i < 16; ++i) m[i] = in[i]; - blake256_compress_block(S, m); -} - -void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) -{ - blake_state256 S; - blake256_init(&S); - blake256_update(&S, data); - - blk->ctx_a = S.h[0]; - blk->ctx_b = S.h[1]; - blk->ctx_c = S.h[2]; - blk->ctx_d = S.h[3]; - blk->ctx_e = S.h[4]; - blk->ctx_f = S.h[5]; - blk->ctx_g = S.h[6]; - blk->ctx_h = S.h[7]; - - blk->cty_a = data[16]; - blk->cty_b = data[17]; - blk->cty_c = data[18]; -} diff --git a/findnonce.h b/findnonce.h index 49b1aa9a..9376a57b 100644 --- a/findnonce.h +++ b/findnonce.h @@ -10,6 +10,5 @@ extern void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); extern void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res); -extern void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); #endif /*FINDNONCE_H*/ diff --git a/kernel/blake256r14.cl b/kernel/blake256r14.cl new file mode 100644 index 00000000..e94b4d7c --- /dev/null +++ b/kernel/blake256r14.cl @@ -0,0 +1,157 @@ +// (c) 2013 originally written by smolen, modified by kr105 + +#define SPH_ROTR32(v,n) rotate((uint)(v),(uint)(32-(n))) + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + volatile __global uint * restrict output, + // precalc hash from first part of message + const uint h0, + const uint h1, + const uint h2, + const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) +{ + uint M0, M1, M2, M3, M4, M5, M6, M7; + uint M8, M9, MA, MB, MC, MD, ME, MF; + uint V0, V1, V2, V3, V4, V5, V6, V7; + uint V8, V9, VA, VB, VC, VD, VE, VF; + uint pre7; + uint nonce = get_global_id(0); + + V0 = h0; + V1 = h1; + V2 = h2; + V3 = h3; + V4 = h4; + V5 = h5; + V6 = 
h6; + pre7 = V7 = h7; + M0 = in16; + M1 = in17; + M2 = in18; + M3 = nonce; + + V8 = 0x243F6A88UL; + V9 = 0x85A308D3UL; + VA = 0x13198A2EUL; + VB = 0x03707344UL; + VC = 640 ^ 0xA4093822UL; + VD = 640 ^ 0x299F31D0UL; + VE = 0x082EFA98UL; + VF = 0xEC4E6C89UL; + + M4 = 0x80000000; + M5 = 0; + M6 = 0; + M7 = 0; + M8 = 0; + M9 = 0; + MA = 0; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 640; + + V0 = (V0 + V4 + (M0 ^ 0x85A308D3UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M1 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M2 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M3 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M4 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M5 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M6 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M7 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M8 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M9 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MA ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MB ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MC ^ 0xC97C50DDUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MD ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (ME ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MF ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (ME ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MA ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M9 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MF ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MD ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M6 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M1 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MC ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M0 ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M2 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = 
SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MB ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M7 ^ 0x34E90C6CUL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M5 ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M3 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MB ^ 0x452821E6UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M8 ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (MC ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M0 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M5 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M2 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MF ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MD ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (MA ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (ME ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M7 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M1 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M9 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M4 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M7 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M9 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M3 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M1 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MD ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MC ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MB ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (ME ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M2 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M6 ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M5 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 
+ V6 + (MA ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M4 ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M0 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (MF ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M8 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M9 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M0 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M5 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M7 ^ 0x299F31D0UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M2 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M4 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MA ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MF ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (ME ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M1 ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MB ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MC ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M3 ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MD ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M2 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MC ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M6 ^ 0xBE5466CFUL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MA ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M0 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MB ^ 0x243F6A88UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M8 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M3 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M4 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MD ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M7 ^ 0x299F31D0UL)); VC = 
SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M5 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MF ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (ME ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M1 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M9 ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MC ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M5 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M1 ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MF ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (ME ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MD ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M4 ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MA ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M0 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M7 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M9 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M2 ^ 0x38D01377UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M8 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MB ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MD ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MB ^ 0xC97C50DDUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M7 ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (ME ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MC ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M1 ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M3 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M9 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M5 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M0 ^ 0x299F31D0UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA 
+ VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MF ^ 0xA4093822UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M4 ^ 0xB5470917UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M2 ^ 0xBE5466CFUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MA ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + // Constants + // 00 = 0x243F6A88UL + // 01 = 0x85A308D3UL + // 02 = 0x13198A2EUL + // 03 = 0x03707344UL + // 04 = 0xA4093822UL + // 05 = 0x299F31D0UL + // 06 = 0x082EFA98UL + // 07 = 0xEC4E6C89UL + // 08 = 0x452821E6UL + // 09 = 0x38D01377UL + // 10 = 0xBE5466CFUL + // 11 = 0x34E90C6CUL + // 12 = 0xC0AC29B7UL + // 13 = 0xC97C50DDUL + // 14 = 0x3F84D5B5UL + // 15 = 0xB5470917UL + // A=10,B=11,C=12,D=13,E=14,F=15 + + // Round 9: + // 6^15 + V0 = (V0 + V4 + (M6 ^ 0xB5470917UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MF ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; + // 14^9 + V1 = (V1 + V5 + (ME ^ 0x38D01377UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M9 ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; + // 11^3 + V2 = (V2 + V6 + (MB ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M3 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; + // 0^8 + V3 = (V3 + V7 + (M0 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M8 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; + // 12^2 + V0 = (V0 + V5 + (MC ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M2 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; + // 13^7 + V1 = (V1 + V6 + (MD ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M7 ^ 0xC97C50DDUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; + // 1^4 + V2 = (V2 + V7 + (M1 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M4 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; + // 10^5 + V3 = (V3 + V4 + (MA ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M5 ^ 0xBE5466CFUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + // Constants + // 00 = 0x243F6A88UL + // 01 = 0x85A308D3UL + // 02 = 0x13198A2EUL + // 03 = 0x03707344UL + // 04 = 0xA4093822UL + // 05 = 0x299F31D0UL + // 06 = 0x082EFA98UL + // 07 = 0xEC4E6C89UL + // 08 = 0x452821E6UL + // 09 = 0x38D01377UL + // 10 = 0xBE5466CFUL + // 11 = 0x34E90C6CUL + // 12 = 0xC0AC29B7UL + // 13 = 0xC97C50DDUL + // 14 = 0x3F84D5B5UL + // 15 = 0xB5470917UL + // A=10,B=11,C=12,D=13,E=14,F=15 + + // Round 10 + // 10^2 + V0 = (V0 + V4 + (MA ^ 0x13198A2EUL)); VC = 
SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M2 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; + // 8^4 + V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; + // 7^6 + V2 = (V2 + V6 + (M7 ^ 0x082EFA98UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M6 ^ 0xEC4E6C89UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; + // 1^5 + V3 = (V3 + V7 + (M1 ^ 0x299F31D0UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M5 ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; + // 15^11 + V0 = (V0 + V5 + (MF ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MB ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; + // 9^14 + V1 = (V1 + V6 + (M9 ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (ME ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; + // 3^12 + V2 = (V2 + V7 + (M3 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MC ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; + // 13^0 + V3 = (V3 + V4 + (MD ^ 0x243F6A88UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M0 ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + // Round 11,12,13,14 repeated from beginning again + V0 = (V0 + V4 + (M0 ^ 0x85A308D3UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M1 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M2 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M3 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M4 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M5 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M6 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M7 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M8 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M9 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MA ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MB ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MC ^ 0xC97C50DDUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MD ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (ME ^ 0xB5470917UL)); VE = 
SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MF ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (ME ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MA ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M9 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MF ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MD ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M6 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M1 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MC ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M0 ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M2 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MB ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M7 ^ 0x34E90C6CUL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M5 ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M3 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MB ^ 0x452821E6UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M8 ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (MC ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M0 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M5 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M2 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MF ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MD ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (MA ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (ME ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M7 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M1 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 
+ VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M9 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M4 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M7 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M9 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M3 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M1 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MD ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MC ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MB ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (ME ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M2 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M6 ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M5 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MA ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M4 ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M0 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (MF ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M8 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + if(pre7 ^ V7 ^ VF) + return; + output[output[0xFF]++] = nonce; +} \ No newline at end of file diff --git a/kernel/blake256r8.cl b/kernel/blake256r8.cl new file mode 100644 index 00000000..fca62fe7 --- /dev/null +++ b/kernel/blake256r8.cl @@ -0,0 +1,77 @@ +// (c) 2013 originally written by smolen, modified by kr105 + +#define SPH_ROTR32(v,n) rotate((uint)(v),(uint)(32-(n))) + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + volatile __global uint * restrict output, + // precalc hash from first part of message + const uint h0, + const uint h1, + const uint h2, + const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) +{ + uint M0, M1, M2, M3, M4, M5, M6, M7; + uint M8, M9, MA, MB, MC, MD, ME, MF; + uint V0, V1, V2, V3, V4, V5, V6, V7; + uint V8, V9, VA, VB, VC, VD, VE, VF; + uint pre7; + uint nonce = get_global_id(0); + + V0 = h0; + V1 = h1; + V2 = h2; + V3 = h3; + V4 = h4; + V5 = h5; + V6 = h6; + pre7 = V7 = h7; + M0 = in16; + M1 = in17; + M2 = in18; + M3 = nonce; + + V8 = 0x243F6A88UL; + V9 = 0x85A308D3UL; + VA = 0x13198A2EUL; + VB = 0x03707344UL; + VC = 640 ^ 0xA4093822UL; + VD = 640 ^ 0x299F31D0UL; + VE = 0x082EFA98UL; + VF = 0xEC4E6C89UL; + + M4 = 0x80000000; + M5 = 0; + M6 = 0; + M7 = 0; + M8 = 0; + M9 = 0; + MA = 0; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 640; + + V0 = (V0 + V4 + 
(M0 ^ 0x85A308D3UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M1 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M2 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M3 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M4 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M5 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M6 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M7 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M8 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M9 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MA ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MB ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MC ^ 0xC97C50DDUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MD ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (ME ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MF ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (ME ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MA ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M9 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MF ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MD ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M6 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M1 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MC ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M0 ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M2 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MB ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M7 ^ 0x34E90C6CUL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M5 ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M3 ^ 0x299F31D0UL)); VE = 
SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MB ^ 0x452821E6UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M8 ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (MC ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M0 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M5 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M2 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MF ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MD ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (MA ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (ME ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M7 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M1 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M9 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M4 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M7 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M9 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M3 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M1 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MD ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MC ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MB ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (ME ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M2 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M6 ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M5 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MA ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M4 ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M0 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (MF ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 
+ VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M8 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M9 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M0 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M5 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M7 ^ 0x299F31D0UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M2 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M4 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MA ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MF ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (ME ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M1 ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MB ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MC ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M3 ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MD ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M2 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MC ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M6 ^ 0xBE5466CFUL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MA ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M0 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MB ^ 0x243F6A88UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M8 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M3 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M4 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MD ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M7 ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M5 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MF ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (ME ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 
7);; V3 = (V3 + V4 + (M1 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M9 ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MC ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M5 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M1 ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MF ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (ME ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MD ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M4 ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MA ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M0 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M7 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M9 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M2 ^ 0x38D01377UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M8 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MB ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MD ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MB ^ 0xC97C50DDUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M7 ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (ME ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MC ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M1 ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M3 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M9 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M5 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M0 ^ 0x299F31D0UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MF ^ 0xA4093822UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M4 ^ 0xB5470917UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M6 ^ 
0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M2 ^ 0xBE5466CFUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MA ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + if(pre7 ^ V7 ^ VF) + return; + output[output[0xFF]++] = nonce; +} \ No newline at end of file diff --git a/kernel/lyra2rev2.cl b/kernel/lyra2rev2.cl index 0fe0440d..a165a751 100644 --- a/kernel/lyra2rev2.cl +++ b/kernel/lyra2rev2.cl @@ -31,8 +31,8 @@ // typedef unsigned int uint; #pragma OPENCL EXTENSION cl_amd_printf : enable -#ifndef LYRA2RE_CL -#define LYRA2RE_CL +#ifndef LYRA2REV2_CL +#define LYRA2REV2_CL #if __ENDIAN_LITTLE__ #define SPH_LITTLE_ENDIAN 1 @@ -522,4 +522,4 @@ __kernel void search6(__global uchar* hashes, __global uint* output, const ulong } -#endif // LYRA2RE_CL \ No newline at end of file +#endif // LYRA2REV2_CL \ No newline at end of file diff --git a/kernel/neoscrypt.cl b/kernel/neoscrypt.cl index 7939d7ed..9ffcad4b 100644 --- a/kernel/neoscrypt.cl +++ b/kernel/neoscrypt.cl @@ -1,9 +1,32 @@ -/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */ -/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */ +// NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 +// By Wolf (Wolf0 aka Wolf9466) // Stupid AMD compiler ignores the unroll pragma in these two + +// Tahiti 3/2, +// Hawaii 4/4 + notneededswap +// Pitcairn 3/4 + notneededswap +#if defined(__Tahiti__) +#define SALSA_SMALL_UNROLL 4 +#define CHACHA_SMALL_UNROLL 2 +//#define SWAP 1 +//#define SHITMAIN 1 +//#define WIDE_STRIPE 1 +#elif defined(__Pitcairn__) + #define SALSA_SMALL_UNROLL 3 -#define CHACHA_SMALL_UNROLL 3 +#define CHACHA_SMALL_UNROLL 2 +//#define SWAP 1 +//#define SHITMAIN 1 +//#define WIDE_STRIPE 1 + +#else +#define SALSA_SMALL_UNROLL 4 +#define CHACHA_SMALL_UNROLL 4 +//#define SWAP 1 +//#define SHITMAIN 1 +//#define WIDE_STRIPE 1 +#endif // If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted // as the unroll factor; must divide cleanly into ten. 
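// A note on the per-device tuning block above: the AMD OpenCL compiler
// predefines a macro named after the target GPU family (__Tahiti__,
// __Pitcairn__ and __Hawaii__ are the ones this file keys on), which is what
// the #if/#elif chain tests; the "3/2" / "4/4" remarks record the
// SALSA/CHACHA unroll pairs tried per device. Extending the chain for another
// part would look like this sketch (the device macro and values below are
// hypothetical placeholders, not part of this patch, and would need
// benchmarking):
//
//   #elif defined(__Bonaire__)
//   #define SALSA_SMALL_UNROLL 3
//   #define CHACHA_SMALL_UNROLL 3
//   #endif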
@@ -96,6 +119,28 @@ static const __constant uchar BLAKE2S_SIGMA[10][16] = b = rotate(b ^ c, 25U); \ } while(0) +#define BLAKE_PARALLEL_G1(idx0, a, b, c, d, key) do { \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][0]], key[BLAKE2S_SIGMA[idx0][2]], key[BLAKE2S_SIGMA[idx0][4]], key[BLAKE2S_SIGMA[idx0][6]]); \ + d = rotate(d ^ a, 16U); \ + c += d; \ + b = rotate(b ^ c, 20U); \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][1]], key[BLAKE2S_SIGMA[idx0][3]], key[BLAKE2S_SIGMA[idx0][5]], key[BLAKE2S_SIGMA[idx0][7]]); \ + d = rotate(d ^ a, 24U); \ + c += d; \ + b = rotate(b ^ c, 25U); \ +} while(0) + +#define BLAKE_PARALLEL_G2(idx0, a, b, c, d, key) do { \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][8]], key[BLAKE2S_SIGMA[idx0][10]], key[BLAKE2S_SIGMA[idx0][12]], key[BLAKE2S_SIGMA[idx0][14]]); \ + d = rotate(d ^ a, 16U); \ + c += d; \ + b = rotate(b ^ c, 20U); \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][9]], key[BLAKE2S_SIGMA[idx0][11]], key[BLAKE2S_SIGMA[idx0][13]], key[BLAKE2S_SIGMA[idx0][15]]); \ + d = rotate(d ^ a, 24U); \ + c += d; \ + b = rotate(b ^ c, 25U); \ +} while(0) + void Blake2S(uint *restrict inout, const uint *restrict inkey) { uint16 V; @@ -122,14 +167,17 @@ void Blake2S(uint *restrict inout, const uint *restrict inkey) #endif for(int x = 0; x < 10; ++x) { - BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey); + /*BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey); BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey); BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey); BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey); BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey); BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey); BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey); - BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey); + BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey);*/ + + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); } // XOR low part of state with the high part, @@ -156,14 +204,17 @@ void Blake2S(uint *restrict inout, const uint *restrict inkey) #endif for(int x = 0; x < 10; ++x) { - BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout); + /*BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout); BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout); BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout); BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout); BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout); BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout); BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout); - BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout); + BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);*/ + + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); } // XOR low part of state with high part, then with input block @@ -227,15 +278,73 @@ void fastkdf(const uchar *restrict password, const uchar *restrict salt, const u { // Make the key buffer twice the size of the key so it fits a Blake2S block // This way, we don't need a temp buffer in the Blake2S function. 
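// How BLAKE_PARALLEL_G1/G2 above vectorize Blake2S: the scalar version (the
// commented-out BLAKE_G calls) applies G to the four columns of the 4x4 state
// and then to the four diagonals, eight G calls per round. Packing each state
// row into a uint4 lets one macro invocation run all four column G's at once,
// and the diagonal pass is the same macro applied to lane-rotated rows. The
// lane mapping, spelled out for reference (m stands for the inkey/inout
// message pointer):
//
//   columns:   G(s0,s4,s8,sc) G(s1,s5,s9,sd) G(s2,s6,sa,se) G(s3,s7,sb,sf)
//              == BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, m)
//   diagonals: G(s0,s5,sa,sf) G(s1,s6,sb,sc) G(s2,s7,s8,sd) G(s3,s4,s9,se)
//              == BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, m)
//
// i.e. the b/c/d rows rotated left by 1/2/3 lanes respectively, which is
// exactly what the s5674 / sab89 / sfcde swizzle selectors encode.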
- uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 }; + uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 }; // Copy input and key to their buffers CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); // PRF - Blake2S((uint *)input, (uint *)key); + //Blake2S((uint *)input, (uint *)key); + + uint *inkey = (uint *)key, *inout = (uint *)input; + + // PRF + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. + V.lo ^= V.hi ^ tmpblock; + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); + + // Calculate the next buffer pointer bufidx = 0; @@ -284,7 +393,475 @@ void fastkdf(const uchar *restrict password, const uchar *restrict salt, const u } } -#define SALSA_CORE(state) do { \ +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +void fastkdf1(const uchar password[80], uchar output[256]) +{ + + /* WARNING! + * This algorithm uses byte-wise addressing for memory blocks. + * In other words, trying to copy an unaligned memory region + * will significantly slow down the algorithm when copying uses + * words or bigger entities. It may even corrupt the data when + * the device does not support it properly. + * Therefore use byte copying, which will not be the fastest but at + * least gets reliable results. 
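 * (Concretely: bufidx below is a byte sum of the 32-byte PRF output, so
 * A + bufidx and B + bufidx carry no alignment guarantee, and word- or
 * vector-wide copies from them would be misaligned on most iterations.)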
*/ + + // BLOCK_SIZE 64U + // FASTKDF_BUFFER_SIZE 256U + // BLAKE2S_BLOCK_SIZE 64U + // BLAKE2S_KEY_SIZE 32U + // BLAKE2S_OUT_SIZE 32U + uchar bufidx = 0; + uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) }; + uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer; + + // Initialize the password buffer + #pragma unroll 1 + for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)A)[i] = ((ulong *)password)[i % 10]; + + ((uint16 *)(B + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0]; + + // The primary iteration + #pragma unroll 1 + for(int i = 0; i < 32; ++i) + { + // Make the key buffer twice the size of the key so it fits a Blake2S block + // This way, we don't need a temp buffer in the Blake2S function. + uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 }; + + // Copy input and key to their buffers + CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); + CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); + + uint *inkey = (uint *)key, *inout = (uint *)input; + + #ifndef __Hawaii__ + + // PRF + uint4 V[4]; + uint8 tmpblock; + + tmpblock = vload8(0U, BLAKE2S_IV); + + V[0] = V[2] = tmpblock.lo; + V[1] = V[3] = tmpblock.hi; + + V[0].s0 ^= 0x01012020U; + tmpblock.lo = V[0]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inkey); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inkey); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + V[2] = vload4(0U, BLAKE2S_IV); + V[3] = vload4(1U, BLAKE2S_IV); + + tmpblock.lo = V[0]; + tmpblock.hi = V[1]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE << 1; + V[3].s2 ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inout); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inout); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + vstore4(V[0], 0, inout); + vstore4(V[1], 1, inout); + + #else + + // PRF + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. 
+ V.lo ^= V.hi ^ tmpblock; + + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); + + #endif + + // Calculate the next buffer pointer + bufidx = 0; + + for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x) + bufidx += input[x]; + + // bufidx is a uchar now - always mod 256 + //bufidx &= (FASTKDF_BUFFER_SIZE - 1); + + // Modify the salt buffer + XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE); + + if(bufidx < BLAKE2S_KEY_SIZE) + { + // Head modified, tail updated + // this was made off the original code... wtf + //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx)); + CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx); + } + else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) + { + // Tail modified, head updated + CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx)); + } + } + + // Modify and copy into the output buffer + + // Damned compiler crashes + // Fuck you, AMD + + //for(uint i = 0; i < output_len; ++i, ++bufidx) + // output[i] = B[bufidx] ^ A[i]; + + uint left = FASTKDF_BUFFER_SIZE - bufidx; + //uint left = (~bufidx) + 1 + + if(left < 256) + { + XORBytes(output, B + bufidx, A, left); + XORBytes(output + left, B, A + left, 256 - left); + } + else + { + XORBytes(output, B + bufidx, A, 256); + } +} + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +void fastkdf2(const uchar password[80], const uchar salt[256], __global uint* restrict output, const uint target) +{ + + /* WARNING! + * This algorithm uses byte-wise addressing for memory blocks. + * In other words, trying to copy an unaligned memory region + * will significantly slow down the algorithm when copying uses + * words or bigger entities. It may even corrupt the data when + * the device does not support it properly. + * Therefore use byte copying, which will not be the fastest but at + * least gets reliable results. 
*/ + + // BLOCK_SIZE 64U + // FASTKDF_BUFFER_SIZE 256U + // BLAKE2S_BLOCK_SIZE 64U + // BLAKE2S_KEY_SIZE 32U + // BLAKE2S_OUT_SIZE 32U + // salt_len == 256, output_len == 32 + uchar bufidx = 0; + uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) }; + uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer; + //uchar A[256], B[256]; + + // Initialize the password buffer + #pragma unroll 1 + for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10]; + + ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0]; + + // Initialize the salt buffer + ((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0]; + ((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1]; + + // The primary iteration + #pragma unroll 1 + for(int i = 0; i < 32; ++i) + { + // Make the key buffer twice the size of the key so it fits a Blake2S block + // This way, we don't need a temp buffer in the Blake2S function. + uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 }; + + // Copy input and key to their buffers + CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); + CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); + + uint *inkey = (uint *)key, *inout = (uint *)input; + + #ifndef __Hawaii__ + + // PRF + uint4 V[4]; + uint8 tmpblock; + + tmpblock = vload8(0U, BLAKE2S_IV); + + V[0] = V[2] = tmpblock.lo; + V[1] = V[3] = tmpblock.hi; + + V[0].s0 ^= 0x01012020U; + tmpblock.lo = V[0]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inkey); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inkey); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + V[2] = vload4(0U, BLAKE2S_IV); + V[3] = vload4(1U, BLAKE2S_IV); + + tmpblock.lo = V[0]; + tmpblock.hi = V[1]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE << 1; + V[3].s2 ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inout); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inout); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + vstore4(V[0], 0, inout); + vstore4(V[1], 1, inout); + + #else + + // PRF + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. 
+ V.lo ^= V.hi ^ tmpblock; + + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); + #endif + + // Calculate the next buffer pointer + bufidx = 0; + + for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x) + bufidx += input[x]; + + // bufidx is a uchar now - always mod 256 + //bufidx &= (FASTKDF_BUFFER_SIZE - 1); + + // Modify the salt buffer + XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE); + + if(bufidx < BLAKE2S_KEY_SIZE) + { + // Head modified, tail updated + // this was made off the original code... wtf + //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx)); + CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx); + } + else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) + { + // Tail modified, head updated + CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx)); + } + } + + // Modify and copy into the output buffer + + // Damned compiler crashes + // Fuck you, AMD + + uchar outbuf[32]; + + for(uint i = 0; i < 32; ++i, ++bufidx) + outbuf[i] = B[bufidx] ^ A[i]; + + /*uint left = FASTKDF_BUFFER_SIZE - bufidx; + //uint left = (~bufidx) + 1 + uchar outbuf[32]; + + if(left < 32) + { + XORBytes(outbuf, B + bufidx, A, left); + XORBytes(outbuf + left, B, A + left, 32 - left); + } + else + { + XORBytes(outbuf, B + bufidx, A, 32); + }*/ + + if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0); + +} + +/* + s0 s1 s2 s3 + s4 s5 s6 s7 + s8 s9 sa sb + sc sd se sf +shittify: +s0=s4 +s1=s9 +s2=se +s3=s3 +s4=s8 +s5=sd +s6=s2 +s7=s7 +s8=sc +s9=s1 +sa=s6 +sb=sb +sc=s0 +sd=s5 +se=sa +sf=sf +unshittify: +s0=sc +s1=s9 +s2=s6 +s3=s3 +s4=s0 +s5=sd +s6=sa +s7=s7 +s8=s4 +s9=s1 +sa=se +sb=sb +sc=s8 +sd=s5 +se=s2 +sf=sf + +*/ + +#define SALSA_CORE(state) do { \ + state[0] ^= rotate(state[3] + state[2], 7U); \ + state[1] ^= rotate(state[0] + state[3], 9U); \ + state[2] ^= rotate(state[1] + state[0], 13U); \ + state[3] ^= rotate(state[2] + state[1], 18U); \ + state[2] ^= rotate(state[3].wxyz + state[0].zwxy, 7U); \ + state[1] ^= rotate(state[2].wxyz + state[3].zwxy, 9U); \ + state[0] ^= rotate(state[1].wxyz + state[2].zwxy, 13U); \ + state[3] ^= rotate(state[0].wxyz + state[1].zwxy, 18U); \ +} while(0) + +#define SALSA_CORE_SCALAR(state) do { \ state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \ state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \ state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \ @@ -295,10 +872,18 @@ void fastkdf(const uchar 
*restrict password, const uchar *restrict salt, const u state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \ } while(0) -uint16 salsa_small_scalar_rnd(uint16 X) +uint16 salsa_small_parallel_rnd(uint16 X) { - uint16 st = X; - +#ifndef SHITMAIN + uint4 st[4] = { (uint4)(X.s4, X.s9, X.se, X.s3), + (uint4)(X.s8, X.sd, X.s2, X.s7), + (uint4)(X.sc, X.s1, X.s6, X.sb), + (uint4)(X.s0, X.s5, X.sa, X.sf) }; +#else + uint4 st[4]; + ((uint16 *)st)[0] = X; +#endif + #if SALSA_SMALL_UNROLL == 1 for(int i = 0; i < 10; ++i) @@ -335,7 +920,7 @@ uint16 salsa_small_scalar_rnd(uint16 X) SALSA_CORE(st); } - #else + #elif SALSA_SMALL_UNROLL == 5 for(int i = 0; i < 2; ++i) { @@ -346,26 +931,114 @@ uint16 salsa_small_scalar_rnd(uint16 X) SALSA_CORE(st); } + #else + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + #endif +#ifndef SHITMAIN + return(X + (uint16)( + st[3].x, st[2].y, st[1].z, st[0].w, + st[0].x, st[3].y, st[2].z, st[1].w, + st[1].x, st[0].y, st[3].z, st[2].w, + st[2].x, st[1].y, st[0].z, st[3].w)); +#else + return(X + ((uint16 *)st)[0]); +#endif +} + +uint16 salsa_small_scalar_rnd(uint16 X) +{ + uint16 st = X; + + #if SALSA_SMALL_UNROLL == 1 + + for(int i = 0; i < 10; ++i) + { + SALSA_CORE_SCALAR(st); + } + + #elif SALSA_SMALL_UNROLL == 2 + + for(int i = 0; i < 5; ++i) + { + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #elif SALSA_SMALL_UNROLL == 3 + + for(int i = 0; i < 4; ++i) + { + SALSA_CORE_SCALAR(st); + if(i == 3) break; + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #elif SALSA_SMALL_UNROLL == 4 + + for(int i = 0; i < 3; ++i) + { + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + if(i == 2) break; + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #else + + for(int i = 0; i < 2; ++i) + { + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #endif + + return(X + st); +} + + #define CHACHA_CORE_PARALLEL(state) do { \ - state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ - state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(12U, 12U, 12U, 12U)); \ - state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ - state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(7U, 7U, 7U, 7U)); \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], 16U); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], 12U); \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], 8U); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], 7U); \ \ - state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ - state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(12U, 12U, 12U, 12U)); \ - state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ - state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(7U, 7U, 7U, 7U)); \ + state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], 16U); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, 12U); \ + state[0] += state[1].yzwx; state[3].wxyz = 
rotate(state[3].wxyz ^ state[0], 8U); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, 7U); \ +} while(0) + +#define CHACHA_CORE(state) do { \ + state.s0 += state.s4; state.sc = as_uint(as_ushort2(state.sc ^ state.s0).s10); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 12U); state.s0 += state.s4; state.sc = rotate(state.sc ^ state.s0, 8U); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 7U); \ + state.s1 += state.s5; state.sd = as_uint(as_ushort2(state.sd ^ state.s1).s10); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 12U); state.s1 += state.s5; state.sd = rotate(state.sd ^ state.s1, 8U); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 7U); \ + state.s2 += state.s6; state.se = as_uint(as_ushort2(state.se ^ state.s2).s10); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 12U); state.s2 += state.s6; state.se = rotate(state.se ^ state.s2, 8U); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 7U); \ + state.s3 += state.s7; state.sf = as_uint(as_ushort2(state.sf ^ state.s3).s10); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 12U); state.s3 += state.s7; state.sf = rotate(state.sf ^ state.s3, 8U); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 7U); \ + state.s0 += state.s5; state.sf = as_uint(as_ushort2(state.sf ^ state.s0).s10); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 12U); state.s0 += state.s5; state.sf = rotate(state.sf ^ state.s0, 8U); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 7U); \ + state.s1 += state.s6; state.sc = as_uint(as_ushort2(state.sc ^ state.s1).s10); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 12U); state.s1 += state.s6; state.sc = rotate(state.sc ^ state.s1, 8U); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 7U); \ + state.s2 += state.s7; state.sd = as_uint(as_ushort2(state.sd ^ state.s2).s10); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 12U); state.s2 += state.s7; state.sd = rotate(state.sd ^ state.s2, 8U); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 7U); \ + state.s3 += state.s4; state.se = as_uint(as_ushort2(state.se ^ state.s3).s10); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 12U); state.s3 += state.s4; state.se = rotate(state.se ^ state.s3, 8U); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 7U); \ } while(0) uint16 chacha_small_parallel_rnd(uint16 X) { - uint4 t, st[4]; + uint4 st[4]; ((uint16 *)st)[0] = X; @@ -405,7 +1078,7 @@ uint16 chacha_small_parallel_rnd(uint16 X) CHACHA_CORE_PARALLEL(st); } - #else + #elif CHACHA_SMALL_UNROLL == 5 for(int i = 0; i < 2; ++i) { @@ -415,15 +1088,95 @@ uint16 chacha_small_parallel_rnd(uint16 X) CHACHA_CORE_PARALLEL(st); CHACHA_CORE_PARALLEL(st); } + #else + + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); #endif return(X + ((uint16 *)st)[0]); } -void neoscrypt_blkmix(uint16 *XV, bool alg) -{ +uint16 chacha_small_scalar_rnd(uint16 X) +{ + uint16 st = X; + + #if CHACHA_SMALL_UNROLL == 1 + + for(int i = 0; i < 10; ++i) + { + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 2 + + for(int i = 0; i < 5; ++i) + { + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 3 + + for(int i = 0; i < 4; ++i) + { + 
CHACHA_CORE(st); + if(i == 3) break; + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 4 + + for(int i = 0; i < 3; ++i) + { + CHACHA_CORE(st); + CHACHA_CORE(st); + if(i == 2) break; + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 5 + + for(int i = 0; i < 2; ++i) + { + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #else + + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + + #endif + + return(X + st); +} +void neoscrypt_blkmix_salsa(uint16 XV[4]) +{ /* NeoScrypt flow: Scrypt flow: Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; @@ -431,48 +1184,135 @@ void neoscrypt_blkmix(uint16 *XV, bool alg) Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; Xa" = Ya; Xb" = Yc; Xc" = Yb; Xd" = Yd; */ - +#if 0 + for(int i = 0; i < 4; ++i) XV[i] = (uint16)( + XV[i].s4, XV[i].s9, XV[i].se, XV[i].s3, XV[i].s8, XV[i].sd, XV[i].s2, XV[i].s7, + XV[i].sc, XV[i].s1, XV[i].s6, XV[i].sb, XV[i].s0, XV[i].s5, XV[i].sa, XV[i].sf); +#endif XV[0] ^= XV[3]; - if(!alg) - { - XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; - XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; - XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; - XV[3] = salsa_small_scalar_rnd(XV[3]); - } - else - { - XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; - XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; - XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; - XV[3] = chacha_small_parallel_rnd(XV[3]); - } + XV[0] = salsa_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = salsa_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = salsa_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = salsa_small_parallel_rnd(XV[3]); + + //XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; + //XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; + //XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; + //XV[3] = salsa_small_scalar_rnd(XV[3]); + + XV[1] ^= XV[2]; + XV[2] ^= XV[1]; + XV[1] ^= XV[2]; +#if 0 + XV[0] = (uint16)(XV[0].sc, XV[0].s9, XV[0].s6, XV[0].s3, XV[0].s0, XV[0].sd, XV[0].sa, XV[0].s7, XV[0].s4, XV[0].s1, XV[0].se, XV[0].sb, XV[0].s8, XV[0].s5, XV[0].s2, XV[0].sf); + XV[1] = (uint16)(XV[1].sc, XV[1].s9, XV[1].s6, XV[1].s3, XV[1].s0, XV[1].sd, XV[1].sa, XV[1].s7, XV[1].s4, XV[1].s1, XV[1].se, XV[1].sb, XV[1].s8, XV[1].s5, XV[1].s2, XV[1].sf); + XV[2] = (uint16)(XV[2].sc, XV[2].s9, XV[2].s6, XV[2].s3, XV[2].s0, XV[2].sd, XV[2].sa, XV[2].s7, XV[2].s4, XV[2].s1, XV[2].se, XV[2].sb, XV[2].s8, XV[2].s5, XV[2].s2, XV[2].sf); + XV[3] = (uint16)(XV[3].sc, XV[3].s9, XV[3].s6, XV[3].s3, XV[3].s0, XV[3].sd, XV[3].sa, XV[3].s7, XV[3].s4, XV[3].s1, XV[3].se, XV[3].sb, XV[3].s8, XV[3].s5, XV[3].s2, XV[3].sf); +#endif +} + +void neoscrypt_blkmix_chacha(uint16 XV[4]) +{ + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + XV[0] ^= XV[3]; + + #if 1 + + XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = chacha_small_parallel_rnd(XV[3]); + + #else + + XV[0] = 
chacha_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = chacha_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = chacha_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = chacha_small_scalar_rnd(XV[3]); + + #endif + XV[1] ^= XV[2]; XV[2] ^= XV[1]; XV[1] ^= XV[2]; } +#ifdef WIDE_STRIPE + +void ScratchpadStore(__global void *V, void *X, uchar idx) +{ + ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))] = ((ulong16 *)X)[0]; + ((__global ulong16 *)V)[mul24((idx << 1), (int)get_global_size(0)) + 1] = ((ulong16 *)X)[1]; + //const uint idx2 = mul24(idx << 2, (int)get_global_size(0)); + //#pragma unroll + //for(int i = 0; i < 4; ++i) ((__global uint16 *)V)[idx2 + i] = ((uint16 *)X)[i]; +} + +void ScratchpadMix(void *X, const __global void *V, uchar idx) +{ + ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))]; + ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[mul24((idx << 1), (int)get_global_size(0)) + 1]; +} + +#else + void ScratchpadStore(__global void *V, void *X, uchar idx) { - ((__global ulong16 *)V)[idx << 1] = ((ulong16 *)X)[0]; - ((__global ulong16 *)V)[(idx << 1) + 1] = ((ulong16 *)X)[1]; + ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))] = ((ulong16 *)X)[0]; + ((__global ulong16 *)V)[mul24((idx << 1) + 1, (int)get_global_size(0))] = ((ulong16 *)X)[1]; } void ScratchpadMix(void *X, const __global void *V, uchar idx) { - ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[idx << 1]; - ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[(idx << 1) + 1]; + ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))]; + ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[mul24((idx << 1) + 1, (int)get_global_size(0))]; +} + +#endif + + + +#define SALSA_PERM (uint16)(4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11, 0, 5, 10, 15) +#define SALSA_INV_PERM (uint16)(12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15) + +void SMix_Salsa(uint16 X[4], __global uint16 *V) +{ + #pragma unroll 1 + for(int i = 0; i < 128; ++i) + { + ScratchpadStore(V, X, i); + neoscrypt_blkmix_salsa(X); + } + + #pragma unroll 1 + for(int i = 0; i < 128; ++i) + { + #ifdef SHITMAIN + const uint idx = convert_uchar(((uint *)X)[60] & 0x7F); + #else + const uint idx = convert_uchar(((uint *)X)[48] & 0x7F); + #endif + ScratchpadMix(X, V, idx); + neoscrypt_blkmix_salsa(X); + } } -void SMix(uint16 *X, __global uint16 *V, bool flag) +void SMix_Chacha(uint16 X[4], __global uint16 *V) { #pragma unroll 1 for(int i = 0; i < 128; ++i) { ScratchpadStore(V, X, i); - neoscrypt_blkmix(X, flag); + neoscrypt_blkmix_chacha(X); } #pragma unroll 1 @@ -480,10 +1320,13 @@ void SMix(uint16 *X, __global uint16 *V, bool flag) { const uint idx = convert_uchar(((uint *)X)[48] & 0x7F); ScratchpadMix(X, V, idx); - neoscrypt_blkmix(X, flag); + neoscrypt_blkmix_chacha(X); } } +#define SALSA_PERM (uint16)(4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11, 0, 5, 10, 15) +#define SALSA_INV_PERM (uint16)(12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15) + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target) { @@ -491,9 +1334,12 @@ __kernel void search(__global const uchar* restrict input, __global uint* restri #define CONSTANT_r 2 // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha uint16 X[4], Z[4]; - /* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */ - __global ulong16 *V = (__global ulong16 *)(padcache + (0x8000 * (get_global_id(0) % 
MAX_GLOBAL_THREADS))); - uchar outbuf[32]; + #ifdef WIDE_STRIPE + __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1); + #else + __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0))); + #endif + //uchar outbuf[32]; uchar data[PASSWORD_LEN]; ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; @@ -502,24 +1348,149 @@ __kernel void search(__global const uchar* restrict input, __global uint* restri ((uint *)data)[19] = get_global_id(0); // X = KDF(password, salt) - fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); - + //fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); + fastkdf1(data, (uchar *)X); + + #ifndef SHITMAIN // Process ChaCha 1st, Salsa 2nd and XOR them - run that through PBKDF2 CopyBytes128(Z, X, 2); - + #else + + #pragma unroll + for(int i = 0; i < 4; ++i) ((uint16 *)Z)[i] = shuffle(((uint16 *)X)[i], SALSA_PERM); + + #endif + // X = SMix(X); X & Z are swapped, repeat. - for(bool flag = false;; ++flag) + for(int i = 0;; ++i) { - SMix(X, V, flag); - if(flag) break; + #ifdef SWAP + if (i) SMix_Salsa(X,V); else SMix_Chacha(X,V); + if(i) break; SwapBytes128(X, Z, 256); + #else + if (i) SMix_Chacha(X,V); else SMix_Salsa(Z,V); + if(i) break; + #endif } + + #if defined(SWAP) && defined(SHITMAIN) + #pragma unroll + for(int i = 0; i < 4; ++i) ((uint16 *)Z)[i] ^= shuffle(((uint16 *)X)[i], SALSA_INV_PERM); + fastkdf2(data, (uchar *)Z, output, target); + #elif defined(SHITMAIN) + #pragma unroll + for(int i = 0; i < 4; ++i) ((uint16 *)X)[i] ^= shuffle(((uint16 *)Z)[i], SALSA_INV_PERM); + fastkdf2(data, (uchar *)X, output, target); + #else + // blkxor(X, Z) + ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0]; + ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1]; + + // output = KDF(password, X) + //fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); + fastkdf2(data, (uchar *)X, output, target); + #endif +} + + +/* +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint16 *XZOutput) +{ +#define CONSTANT_N 128 +#define CONSTANT_r 2 + // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha + uint16 X[4]; + XZOutput += (4 * 2 * get_global_id(0)); + + //uchar outbuf[32]; + uchar data[PASSWORD_LEN]; + + ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; + ((ulong *)data)[8] = ((__global const ulong *)input)[8]; + ((uint *)data)[18] = ((__global const uint *)input)[18]; + ((uint *)data)[19] = get_global_id(0); + // X = KDF(password, salt) + //fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); + fastkdf1(data, (uchar *)X); + + for(int i = 0; i < 4; ++i) XZOutput[i] = X[i]; + for(int i = 0; i < 4; ++i) XZOutput[i + 4] = X[i]; + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global uint16 *XZOutput, __global uchar *padcache) +{ +#define CONSTANT_N 128 +#define CONSTANT_r 2 + // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha + uint16 X[4], Z[4]; + #ifdef WIDE_STRIPE + __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1); + #else + __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0))); + #endif + //uchar outbuf[32]; + + XZOutput += (4 * 2 * get_global_id(0)); + + for(int i = 0; i < 4; ++i) X[i] = XZOutput[i]; + + SMix_Salsa(X,V); + + for(int i = 0; i < 4; ++i) XZOutput[i] = X[i]; + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + 
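+// search2 below runs the ChaCha SMix pass on the second saved copy of the KDF output; search3 then recombines X and Z and finishes with fastkdf2 (this whole multi-kernel variant is commented out)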
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global uint16 *XZOutput, __global uchar *padcache) +{ +#define CONSTANT_N 128 +#define CONSTANT_r 2 + // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha + uint16 X[4], Z[4]; + #ifdef WIDE_STRIPE + __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1); + #else + __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0))); + #endif + //uchar outbuf[32]; + + XZOutput += (4 * 2 * get_global_id(0)); + + for(int i = 0; i < 4; ++i) X[i] = XZOutput[i + 4]; + + SMix_Chacha(X,V); + + for(int i = 0; i < 4; ++i) XZOutput[i + 4] = X[i]; + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global const uchar* restrict input, __global uint16 *XZOutput, __global uint* restrict output, const uint target) +{ + uint16 X[4], Z[4]; + uchar data[PASSWORD_LEN]; + + ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; + ((ulong *)data)[8] = ((__global const ulong *)input)[8]; + ((uint *)data)[18] = ((__global const uint *)input)[18]; + ((uint *)data)[19] = get_global_id(0); + + XZOutput += (4 * 2 * get_global_id(0)); + + for(int i = 0; i < 4; ++i) X[i] = XZOutput[i]; + for(int i = 0; i < 4; ++i) Z[i] = XZOutput[i + 4]; + // blkxor(X, Z) ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0]; ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1]; // output = KDF(password, X) - fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); - if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0); -} \ No newline at end of file + //fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); + fastkdf2(data, (uchar *)X, output, target); +} +*/ \ No newline at end of file diff --git a/kernel/vanilla.cl b/kernel/vanilla.cl new file mode 100644 index 00000000..fca62fe7 --- /dev/null +++ b/kernel/vanilla.cl @@ -0,0 +1,77 @@ +// (c) 2013 originally written by smolen, modified by kr105; 8-round BLAKE-256 (Blakecoin/Vanilla) search kernel + +#define SPH_ROTR32(v,n) rotate((uint)(v),(uint)(32-(n))) + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + volatile __global uint * restrict output, + // precalculated hash (midstate) of the first 64-byte block of the message + const uint h0, + const uint h1, + const uint h2, + const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) +{ + uint M0, M1, M2, M3, M4, M5, M6, M7; + uint M8, M9, MA, MB, MC, MD, ME, MF; + uint V0, V1, V2, V3, V4, V5, V6, V7; + uint V8, V9, VA, VB, VC, VD, VE, VF; + uint pre7; + uint nonce = get_global_id(0); + + V0 = h0; + V1 = h1; + V2 = h2; + V3 = h3; + V4 = h4; + V5 = h5; + V6 = h6; + pre7 = V7 = h7; + M0 = in16; + M1 = in17; + M2 = in18; + M3 = nonce; + + V8 = 0x243F6A88UL; + V9 = 0x85A308D3UL; + VA = 0x13198A2EUL; + VB = 0x03707344UL; + VC = 640 ^ 0xA4093822UL; + VD = 640 ^ 0x299F31D0UL; + VE = 0x082EFA98UL; + VF = 0xEC4E6C89UL; + + M4 = 0x80000000; + M5 = 0; + M6 = 0; + M7 = 0; + M8 = 0; + M9 = 0; + MA = 0; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 640; + + V0 = (V0 + V4 + (M0 ^ 0x85A308D3UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M1 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M2 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M3 ^ 0x13198A2EUL)); 
VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M4 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M5 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M6 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M7 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M8 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M9 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MA ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MB ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MC ^ 0xC97C50DDUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MD ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (ME ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MF ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (ME ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MA ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M9 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MF ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MD ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M6 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M1 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MC ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M0 ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M2 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MB ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M7 ^ 0x34E90C6CUL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M5 ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M3 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MB ^ 0x452821E6UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M8 ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (MC ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = 
(V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M0 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M5 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M2 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MF ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MD ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (MA ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (ME ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M7 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M1 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M9 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M4 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M7 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M9 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M3 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M1 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MD ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MC ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MB ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (ME ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M2 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M6 ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M5 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MA ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M4 ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M0 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (MF ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M8 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M9 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M0 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ 
V8, 7);; V1 = (V1 + V5 + (M5 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M7 ^ 0x299F31D0UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M2 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M4 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MA ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MF ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (ME ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M1 ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MB ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MC ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M3 ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MD ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M2 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MC ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M6 ^ 0xBE5466CFUL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MA ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M0 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MB ^ 0x243F6A88UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M8 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M3 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M4 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MD ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M7 ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M5 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MF ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (ME ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M1 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M9 ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MC ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M5 ^ 
0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M1 ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MF ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (ME ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MD ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M4 ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MA ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M0 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M7 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M9 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M2 ^ 0x38D01377UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M8 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MB ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MD ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MB ^ 0xC97C50DDUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M7 ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (ME ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MC ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M1 ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M3 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M9 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M5 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M0 ^ 0x299F31D0UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MF ^ 0xA4093822UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M4 ^ 0xB5470917UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M2 ^ 0xBE5466CFUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MA ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + if(pre7 ^ V7 ^ VF) + return; + output[output[0xFF]++] 
= nonce; +} \ No newline at end of file diff --git a/ocl.c b/ocl.c index cb00790f..ecc383c8 100644 --- a/ocl.c +++ b/ocl.c @@ -37,7 +37,7 @@ #include "algorithm/neoscrypt.h" #include "algorithm/pluck.h" #include "algorithm/yescrypt.h" -#include "algorithm/lyra2re.h" +#include "algorithm/lyra2rev2.h" /* FIXME: only here for global config vars, replace with configuration.h * or similar as soon as config is in a struct instead of littered all @@ -187,7 +187,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg cl_platform_id platform = NULL; struct cgpu_info *cgpu = &gpus[gpu]; _clState *clState = (_clState *)calloc(1, sizeof(_clState)); - cl_uint preferred_vwidth, slot = 0, cpnd = 0, numDevices = clDevicesNum(); + cl_uint preferred_vwidth, numDevices = clDevicesNum(); cl_device_id *devices = (cl_device_id *)alloca(numDevices * sizeof(cl_device_id)); build_kernel_data *build_data = (build_kernel_data *)alloca(sizeof(struct _build_kernel_data)); char **pbuff = (char **)alloca(sizeof(char *) * numDevices), filename[256]; @@ -586,7 +586,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } // Lyra2re v2 TC - else if (cgpu->algorithm.type == ALGO_LYRA2REv2 && !cgpu->opt_tc) { + else if (cgpu->algorithm.type == ALGO_LYRA2REV2 && !cgpu->opt_tc) { size_t glob_thread_count; long max_int; unsigned char type = 0; @@ -797,7 +797,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_DEBUG, "yescrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); // scrypt/n-scrypt } - else if (algorithm->type == ALGO_LYRA2REv2) { + else if (algorithm->type == ALGO_LYRA2REV2) { /* The scratch/pad-buffer needs 32 kBytes of memory per thread. */ bufsize = LYRA_SCRATCHBUF_SIZE * cgpu->thread_concurrency; buf1size = 4 * 8 * cgpu->thread_concurrency; //matrix @@ -855,7 +855,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg return NULL; } } - else if (algorithm->type == ALGO_LYRA2REv2) { + else if (algorithm->type == ALGO_LYRA2REV2) { // need additional buffers clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer1) { diff --git a/sgminer.c b/sgminer.c index a418e1d8..ca58f701 100644 --- a/sgminer.c +++ b/sgminer.c @@ -2625,7 +2625,7 @@ static void curses_print_devstatus(struct cgpu_info *cgpu, int count) if (devcursor + count > LINES - 2) return; - if (count >= most_devices) + if (count >= (opt_removedisabled ? most_devices : total_devices)) return; if (cgpu->dev_start_tv.tv_sec == 0) @@ -2745,7 +2745,7 @@ static void switch_logsize(bool __maybe_unused newdevs) if (opt_compact) { logstart = devcursor + 1; } else { - logstart = devcursor + most_devices + 1; + logstart = devcursor + (opt_removedisabled ? most_devices : total_devices) + 1; } logcursor = logstart + 1; #ifdef WIN32 @@ -8873,7 +8873,7 @@ int main(int argc, char *argv[]) rd_unlock(&devices_lock); if (!opt_compact) { - logstart += most_devices; + logstart += (opt_removedisabled ? 
most_devices : total_devices); logcursor = logstart + 1; #ifdef HAVE_CURSES check_winsizes(); diff --git a/sph/blake.c b/sph/blake.c index 1c4a479b..c7b7b14a 100644 --- a/sph/blake.c +++ b/sph/blake.c @@ -507,6 +507,55 @@ static const sph_u64 CB[16] = { #if SPH_COMPACT_BLAKE_32 +#define COMPRESS32r8 do { \ + sph_u32 M[16]; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M[0x0] = sph_dec32be_aligned(buf + 0); \ + M[0x1] = sph_dec32be_aligned(buf + 4); \ + M[0x2] = sph_dec32be_aligned(buf + 8); \ + M[0x3] = sph_dec32be_aligned(buf + 12); \ + M[0x4] = sph_dec32be_aligned(buf + 16); \ + M[0x5] = sph_dec32be_aligned(buf + 20); \ + M[0x6] = sph_dec32be_aligned(buf + 24); \ + M[0x7] = sph_dec32be_aligned(buf + 28); \ + M[0x8] = sph_dec32be_aligned(buf + 32); \ + M[0x9] = sph_dec32be_aligned(buf + 36); \ + M[0xA] = sph_dec32be_aligned(buf + 40); \ + M[0xB] = sph_dec32be_aligned(buf + 44); \ + M[0xC] = sph_dec32be_aligned(buf + 48); \ + M[0xD] = sph_dec32be_aligned(buf + 52); \ + M[0xE] = sph_dec32be_aligned(buf + 56); \ + M[0xF] = sph_dec32be_aligned(buf + 60); \ + for (r = 0; r < 8; r ++) \ + ROUND_S(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + #define COMPRESS32 do { \ sph_u32 M[16]; \ sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -558,6 +607,61 @@ static const sph_u64 CB[16] = { #else +#define COMPRESS32r8 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + M8 = sph_dec32be_aligned(buf + 32); \ + M9 = sph_dec32be_aligned(buf + 36); \ + MA = sph_dec32be_aligned(buf + 40); \ + MB = sph_dec32be_aligned(buf + 44); \ + MC = sph_dec32be_aligned(buf + 48); \ + MD = sph_dec32be_aligned(buf + 52); \ + ME = sph_dec32be_aligned(buf + 56); \ + MF = sph_dec32be_aligned(buf + 60); \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + #define COMPRESS32 do { \ sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ @@ -831,6 +935,44 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len) sc->ptr = ptr; } +static void +blake32r8(sph_blake_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t 
ptr; + DECL_STATE32 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE32(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((T0 = SPH_T32(T0 + 512)) < 512) + T1 = SPH_T32(T1 + 1); + COMPRESS32r8; + ptr = 0; + } + } + WRITE_STATE32(sc); + sc->ptr = ptr; +} + static void blake32_close(sph_blake_small_context *sc, unsigned ub, unsigned n, void *dst, size_t out_size_w32) @@ -884,6 +1026,59 @@ blake32_close(sph_blake_small_context *sc, sph_enc32be(out + (k << 2), sc->H[k]); } +static void +blake32r8_close(sph_blake_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + union { + unsigned char buf[64]; + sph_u32 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + unsigned z; + sph_u32 th, tl; + unsigned char *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3) + n; + z = 0x80 >> n; + u.buf[ptr] = ((ub & -z) | z) & 0xFF; + tl = sc->T0 + bit_len; + th = sc->T1; + if (ptr == 0 && n == 0) { + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + } else if (sc->T0 == 0) { + sc->T0 = SPH_C32(0xFFFFFE00) + bit_len; + sc->T1 = SPH_T32(sc->T1 - 1); + } else { + sc->T0 -= 512 - bit_len; + } + if (bit_len <= 446) { + memset(u.buf + ptr + 1, 0, 55 - ptr); + if (out_size_w32 == 8) + u.buf[55] |= 1; + sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32r8(sc, u.buf + ptr, 64 - ptr); + } else { + memset(u.buf + ptr + 1, 0, 63 - ptr); + blake32r8(sc, u.buf + ptr, 64 - ptr); + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + memset(u.buf, 0, 56); + if (out_size_w32 == 8) + u.buf[55] = 1; + sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32r8(sc, u.buf, 64); + } + out = (unsigned char *)dst; + for (k = 0; k < out_size_w32; k ++) + sph_enc32be(out + (k << 2), sc->H[k]); +} + #if SPH_64 static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; @@ -1034,6 +1229,13 @@ sph_blake256(void *cc, const void *data, size_t len) blake32((sph_blake_small_context *)cc, data, len); } +/* see sph_blake.h */ +void +sph_blake256r8(void *cc, const void *data, size_t len) +{ + blake32r8((sph_blake_small_context *)cc, data, len); +} + /* see sph_blake.h */ void sph_blake256_close(void *cc, void *dst) @@ -1041,6 +1243,13 @@ sph_blake256_close(void *cc, void *dst) sph_blake256_addbits_and_close(cc, 0, 0, dst); } +/* see sph_blake.h */ +void +sph_blake256r8_close(void *cc, void *dst) +{ + sph_blake256r8_addbits_and_close(cc, 0, 0, dst); +} + /* see sph_blake.h */ void sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) @@ -1049,6 +1258,14 @@ sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) sph_blake256_init(cc); } +/* see sph_blake.h */ +void +sph_blake256r8_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake32r8_close((sph_blake_small_context *)cc, ub, n, dst, 8); + sph_blake256_init(cc); +} + #if SPH_64 /* see sph_blake.h */ diff --git a/sph/sph_blake.h b/sph/sph_blake.h index c3829cad..8dea65ca 100644 --- a/sph/sph_blake.h +++ b/sph/sph_blake.h @@ -194,6 +194,7 @@ void sph_blake256_init(void *cc); * @param len the input data length (in bytes) */ void sph_blake256(void *cc, const void *data, size_t len); +void 
sph_blake256r8(void *cc, const void *data, size_t len); /** * Terminate the current BLAKE-256 computation and output the result into @@ -205,6 +206,7 @@ void sph_blake256(void *cc, const void *data, size_t len); * @param dst the destination buffer */ void sph_blake256_close(void *cc, void *dst); +void sph_blake256r8_close(void *cc, void *dst); /** * Add a few additional bits (0 to 7) to the current computation, then @@ -221,6 +223,8 @@ void sph_blake256_close(void *cc, void *dst); */ void sph_blake256_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +void sph_blake256r8_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); #if SPH_64 diff --git a/winbuild/sgminer.vcxproj b/winbuild/sgminer.vcxproj index 6ec8a4cd..67679bb1 100644 --- a/winbuild/sgminer.vcxproj +++ b/winbuild/sgminer.vcxproj @@ -263,8 +263,10 @@ + + @@ -328,11 +330,14 @@ + + + @@ -365,6 +370,7 @@ + diff --git a/winbuild/sgminer.vcxproj.filters b/winbuild/sgminer.vcxproj.filters index 02c26210..7866e29d 100644 --- a/winbuild/sgminer.vcxproj.filters +++ b/winbuild/sgminer.vcxproj.filters @@ -218,6 +218,12 @@ Source Files\algorithm + + Source Files\algorithm + + + Source Files\algorithm + @@ -415,6 +421,18 @@ Header Files\algorithm + + Header Files\algorithm + + + Header Files\algorithm + + + Header Files\sph + + + Header Files\algorithm +