From 580676affbc7b5a2b0cc60356e643174bd3a3861 Mon Sep 17 00:00:00 2001 From: elbandi Date: Mon, 9 Nov 2015 17:15:15 +0100 Subject: [PATCH] Get some algos from 'master' of https://github.com/djm34/sgminer --- Makefile.am | 3 + algorithm.c | 206 ++++- algorithm.h | 6 +- algorithm/credits.c | 148 ++++ algorithm/credits.h | 10 + algorithm/lyra2.c | 22 +- algorithm/lyra2.h | 8 - algorithm/lyra2_old.c | 208 ++++++ algorithm/lyra2_old.h | 50 ++ algorithm/lyra2re.c | 25 +- algorithm/lyra2re.h | 2 + algorithm/lyra2re_old.c | 169 +++++ algorithm/lyra2re_old.h | 10 + algorithm/pluck.h | 2 + algorithm/sponge.c | 22 +- algorithm/sponge.h | 8 +- algorithm/sponge_old.c | 405 ++++++++++ algorithm/sponge_old.h | 98 +++ algorithm/sysendian.h | 140 ++++ algorithm/yescrypt-opt.c | 1364 ++++++++++++++++++++++++++++++++++ algorithm/yescrypt.c | 128 ++++ algorithm/yescrypt.h | 10 + algorithm/yescrypt_core.h | 376 ++++++++++ algorithm/yescryptcommon.c | 360 +++++++++ driver-opencl.c | 13 +- findnonce.c | 6 +- kernel/bmw256.cl | 162 ++++ kernel/credits.cl | 232 ++++++ kernel/cubehash256.cl | 132 ++++ kernel/lyra2rev2.cl | 525 +++++++++++++ kernel/lyra2v2.cl | 184 +++++ kernel/yescrypt-multi.cl | 314 ++++++++ kernel/yescrypt.cl | 253 +++++++ kernel/yescrypt_essential.cl | 760 +++++++++++++++++++ miner.h | 27 +- ocl.c | 242 +++++- ocl.h | 5 +- sgminer.c | 26 +- sph/Makefile.am | 2 +- sph/sha256_Y.c | 418 +++++++++++ sph/sha256_Y.h | 63 ++ 41 files changed, 7083 insertions(+), 61 deletions(-) create mode 100644 algorithm/credits.c create mode 100644 algorithm/credits.h create mode 100644 algorithm/lyra2_old.c create mode 100644 algorithm/lyra2_old.h create mode 100644 algorithm/lyra2re_old.c create mode 100644 algorithm/lyra2re_old.h create mode 100644 algorithm/sponge_old.c create mode 100644 algorithm/sponge_old.h create mode 100644 algorithm/sysendian.h create mode 100644 algorithm/yescrypt-opt.c create mode 100644 algorithm/yescrypt.c create mode 100644 algorithm/yescrypt.h create mode 100644 algorithm/yescrypt_core.h create mode 100644 algorithm/yescryptcommon.c create mode 100644 kernel/bmw256.cl create mode 100644 kernel/credits.cl create mode 100644 kernel/cubehash256.cl create mode 100644 kernel/lyra2rev2.cl create mode 100644 kernel/lyra2v2.cl create mode 100644 kernel/yescrypt-multi.cl create mode 100644 kernel/yescrypt.cl create mode 100644 kernel/yescrypt_essential.cl create mode 100644 sph/sha256_Y.c create mode 100644 sph/sha256_Y.h diff --git a/Makefile.am b/Makefile.am index af6915e8..5da38cc5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -73,7 +73,10 @@ sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h sgminer_SOURCES += algorithm/whirlpoolx.c algorithm/whirlpoolx.h sgminer_SOURCES += algorithm/lyra2re.c algorithm/lyra2re.h algorithm/lyra2.c algorithm/lyra2.h algorithm/sponge.c algorithm/sponge.h +sgminer_SOURCES += algorithm/lyra2re_old.c algorithm/lyra2re_old.h algorithm/lyra2_old.c algorithm/lyra2_old.h algorithm/sponge_old.c algorithm/sponge_old.h sgminer_SOURCES += algorithm/pluck.c algorithm/pluck.h +sgminer_SOURCES += algorithm/credits.c algorithm/credits.h +sgminer_SOURCES += algorithm/yescrypt.h algorithm/yescrypt.c algorithm/yescrypt_core.h algorithm/yescrypt-opt.c algorithm/yescryptcommon.c algorithm/sysendian.h bin_SCRIPTS = $(top_srcdir)/kernel/*.cl diff --git a/algorithm.c b/algorithm.c index fc0ee4a9..6acab924 100644 --- a/algorithm.c +++ b/algorithm.c @@ -33,7 +33,10 @@ #include "algorithm/neoscrypt.h" 
#include "algorithm/whirlpoolx.h" #include "algorithm/lyra2re.h" +#include "algorithm/lyra2re_old.h" #include "algorithm/pluck.h" +#include "algorithm/yescrypt.h" +#include "algorithm/credits.h" #include "compat.h" @@ -42,6 +45,7 @@ const char *algorithm_type_str[] = { "Unknown", + "Credits", "Scrypt", "NScrypt", "X11", @@ -58,7 +62,10 @@ const char *algorithm_type_str[] = { "Neoscrypt", "WhirlpoolX", "Lyra2RE", + "Lyra2REv2" "Pluck" + "Yescrypt", + "Yescrypt-multi" }; void sha256(const unsigned char *message, unsigned int len, unsigned char *digest) @@ -184,6 +191,125 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb return status; } +static cl_int queue_credits_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel = &clState->kernel; + unsigned int num = 0; + cl_ulong le_target; + cl_int status = 0; + + + // le_target = (*(cl_uint *)(blk->work->device_target + 24)); + le_target = (cl_ulong)le64toh(((uint64_t *)blk->work->/*device_*/target)[3]); + // le_target = (cl_uint)((uint32_t *)blk->work->target)[6]; + + + memcpy(clState->cldata, blk->work->data, 168); +// flip168(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 168, clState->cldata, 0, NULL, NULL); + + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(le_target); + CL_SET_ARG(blk->work->midstate); + + return status; +} + +static cl_int queue_yescrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel = &clState->kernel; + unsigned int num = 0; + cl_uint le_target; + cl_int status = 0; + + +// le_target = (*(cl_uint *)(blk->work->device_target + 28)); + le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]); +// le_target = (cl_uint)((uint32_t *)blk->work->target)[7]; + + +// memcpy(clState->cldata, blk->work->data, 80); + flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); + + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->padbuffer8); + CL_SET_ARG(clState->buffer1); + CL_SET_ARG(clState->buffer2); + CL_SET_ARG(le_target); + + return status; +} + +static cl_int queue_yescrypt_multikernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ +// cl_kernel *kernel = &clState->kernel; + cl_kernel *kernel; + unsigned int num = 0; + cl_uint le_target; + cl_int status = 0; + + + // le_target = (*(cl_uint *)(blk->work->device_target + 28)); + le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]); + memcpy(clState->cldata, blk->work->data, 80); +// flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); +//pbkdf and initial sha + kernel = &clState->kernel; + + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->padbuffer8); + CL_SET_ARG(clState->buffer1); + CL_SET_ARG(clState->buffer2); + CL_SET_ARG(clState->buffer3); + CL_SET_ARG(le_target); + +//inactive kernel + num = 0; + kernel = clState->extra_kernels; + CL_SET_ARG_N(0,clState->buffer1); + CL_SET_ARG_N(1,clState->buffer2); +// CL_SET_ARG_N(3, clState->buffer3); + +//mix2_2 + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->padbuffer8); + CL_SET_ARG_N(1,clState->buffer1); + CL_SET_ARG_N(2,clState->buffer2); 
+ //mix2_2 +//inactive kernel + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1); + CL_SET_ARG_N(1, clState->buffer2); + //mix2_2 + + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->padbuffer8); + CL_SET_ARG_N(1, clState->buffer1); + CL_SET_ARG_N(2, clState->buffer2); + + //inactive kernel + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1); + CL_SET_ARG_N(1, clState->buffer2); + //mix2_2 + + +//pbkdf and finalization + num=0; + CL_NEXTKERNEL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->buffer2); + CL_SET_ARG(clState->buffer3); + CL_SET_ARG(le_target); + + return status; +} + static cl_int queue_maxcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel = &clState->kernel; @@ -716,6 +842,60 @@ static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ct return status; } +static cl_int queue_lyra2REv2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel; + unsigned int num; + cl_int status = 0; + cl_ulong le_target; + + // le_target = *(cl_uint *)(blk->work->device_target + 28); + le_target = *(cl_ulong *)(blk->work->device_target + 24); + flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); + + // blake - search + kernel = &clState->kernel; + num = 0; + // CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->buffer1); + CL_SET_ARG(blk->work->blk.ctx_a); + CL_SET_ARG(blk->work->blk.ctx_b); + CL_SET_ARG(blk->work->blk.ctx_c); + CL_SET_ARG(blk->work->blk.ctx_d); + CL_SET_ARG(blk->work->blk.ctx_e); + CL_SET_ARG(blk->work->blk.ctx_f); + CL_SET_ARG(blk->work->blk.ctx_g); + CL_SET_ARG(blk->work->blk.ctx_h); + CL_SET_ARG(blk->work->blk.cty_a); + CL_SET_ARG(blk->work->blk.cty_b); + CL_SET_ARG(blk->work->blk.cty_c); + + // keccak - search1 + kernel = clState->extra_kernels; + CL_SET_ARG_0(clState->buffer1); + // cubehash - search2 + num = 0; + CL_NEXTKERNEL_SET_ARG_0(clState->buffer1); + // lyra - search3 + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1); + CL_SET_ARG_N(1, clState->padbuffer8); + // skein -search4 + num = 0; + CL_NEXTKERNEL_SET_ARG_0(clState->buffer1); + // cubehash - search5 + num = 0; + CL_NEXTKERNEL_SET_ARG_0(clState->buffer1); + // bmw - search6 + num = 0; + CL_NEXTKERNEL_SET_ARG(clState->buffer1); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(le_target); + + return status; +} + static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel = &clState->kernel; @@ -757,6 +937,25 @@ static algorithm_settings_t algos[] = { { a, ALGO_PLUCK, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, pluck_regenhash, queue_pluck_kernel, gen_hash, append_neoscrypt_compiler_options } A_PLUCK("pluck"), #undef A_PLUCK + +#define A_CREDITS(a) \ + { a, ALGO_CRE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, credits_regenhash, queue_credits_kernel, gen_hash, NULL} + A_CREDITS("credits"), +#undef A_CREDITS + + + +#define A_YESCRYPT(a) \ + { a, ALGO_YESCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, queue_yescrypt_kernel, gen_hash, append_neoscrypt_compiler_options} + A_YESCRYPT("yescrypt"), +#undef 
A_YESCRYPT + +#define A_YESCRYPT_MULTI(a) \ + { a, ALGO_YESCRYPT_MULTI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 6,-1,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , yescrypt_regenhash, queue_yescrypt_multikernel, gen_hash, append_neoscrypt_compiler_options} + A_YESCRYPT_MULTI("yescrypt-multi"), +#undef A_YESCRYPT_MULTI + + // kernels starting from this will have difficulty calculated by using quarkcoin algorithm #define A_QUARK(a, b) \ { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } @@ -793,7 +992,10 @@ static algorithm_settings_t algos[] = { { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL }, - { "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2re_regenhash, queue_lyra2RE_kernel, gen_hash, NULL }, + { "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2reold_regenhash, queue_lyra2RE_kernel, gen_hash, NULL }, + + { "lyra2rev2", ALGO_LYRA2REv2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2re_regenhash, queue_lyra2REv2_kernel, gen_hash, append_neoscrypt_compiler_options }, + // kernels starting from this will have difficulty calculated by using fuguecoin algorithm #define A_FUGUE(a, b, c) \ @@ -877,8 +1079,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa ALGO_ALIAS("nist5", "talkcoin-mod"); ALGO_ALIAS("keccak", "maxcoin"); ALGO_ALIAS("whirlpool", "whirlcoin"); - ALGO_ALIAS("Lyra2RE", "lyra2re"); ALGO_ALIAS("lyra2", "lyra2re"); + ALGO_ALIAS("lyra2v2", "lyra2rev2"); #undef ALGO_ALIAS #undef ALGO_ALIAS_NF diff --git a/algorithm.h b/algorithm.h index b2527d17..8b7185a4 100644 --- a/algorithm.h +++ b/algorithm.h @@ -13,6 +13,7 @@ typedef enum { ALGO_UNK, + ALGO_CRE, ALGO_SCRYPT, ALGO_NSCRYPT, ALGO_X11, @@ -29,7 +30,10 @@ typedef enum { ALGO_NEOSCRYPT, ALGO_WHIRLPOOLX, ALGO_LYRA2RE, - ALGO_PLUCK + ALGO_LYRA2REv2, + ALGO_PLUCK, + ALGO_YESCRYPT, + ALGO_YESCRYPT_MULTI, } algorithm_type_t; extern const char *algorithm_type_str[]; diff --git a/algorithm/credits.c b/algorithm/credits.c new file mode 100644 index 00000000..b69514bc --- /dev/null +++ b/algorithm/credits.c @@ -0,0 +1,148 @@ +/*- + * Copyright 2015 djm34 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "sph/sph_sha2.h"
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+
+
+inline void credits_hash(void *state, const void *input)
+{
+  sph_sha256_context sha1, sha2;
+  uint32_t hash[8], hash2[8];
+
+  sph_sha256_init(&sha1);
+  sph_sha256(&sha1, input, 168);
+  sph_sha256_close(&sha1, hash);
+
+
+  sph_sha256_init(&sha2);
+  sph_sha256(&sha2, hash, 32);
+  sph_sha256_close(&sha2, hash2);
+
+  memcpy(state, hash2, 32);
+
+}
+static inline void
+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+  uint32_t i;
+
+  for (i = 0; i < len; i++)
+    dst[i] = htobe32(src[i]);
+}
+
+/* Used externally as confirmation of correct OCL code */
+int credits_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+  uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  uint32_t data[42], ohash[8];
+  printf("coming here credits test\n");
+
+  be32enc_vect(data, (const uint32_t *)pdata, 42);
+  data[35] = htobe32(nonce);
+  credits_hash((unsigned char*)ohash, (unsigned char*)data);
+
+  tmp_hash7 = be32toh(ohash[7]);
+
+  applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+    (long unsigned int)Htarg,
+    (long unsigned int)diff1targ,
+    (long unsigned int)tmp_hash7);
+
+  if (tmp_hash7 > diff1targ)
+    return -1;
+
+  if (tmp_hash7 > Htarg)
+    return 0;
+
+  return 1;
+}
+
+void credits_regenhash(struct work *work)
+{
+  uint32_t data[42];
+  uint32_t *nonce = (uint32_t *)(work->data + 140);
+  uint32_t *ohash = (uint32_t *)(work->hash);
+
+  be32enc_vect(data, (const uint32_t *)work->data, 42);
+  data[35] = htobe32(*nonce);
+
+  credits_hash((unsigned char*)ohash, (unsigned char*)data);
+
+}
+
+
+bool scanhash_credits(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+  unsigned char *pdata, unsigned char __maybe_unused *phash1,
+  unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+  uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+  uint32_t *nonce = (uint32_t *)(pdata + 140);
+  uint32_t data[42];
+  uint32_t tmp_hash7;
+  uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  bool ret = false;
+
+  be32enc_vect(data, (const uint32_t *)pdata, 35);
+
+
+  while (1)
+  {
+    uint32_t ostate[8];
+
+    *nonce = ++n;
+    data[35] = (n);
+    credits_hash(ostate, data);
+    tmp_hash7 = (ostate[7]);
+
+    applog(LOG_INFO, "data7 %08lx", (long unsigned int)ostate[7]);
+
+    if (unlikely(tmp_hash7 <= Htarg))
+    {
+      ((uint32_t *)pdata)[35] = htobe32(n);
+      *last_nonce = n;
+      ret = true;
+      break;
+    }
+
+    if (unlikely((n >= max_nonce) || thr->work_restart))
+    {
+      *last_nonce = n;
+      break;
+    }
+  }
+
+  return ret;
+}
\ No newline at end of file
diff --git a/algorithm/credits.h b/algorithm/credits.h
new file mode 100644
index 00000000..9d74ad20
--- /dev/null
+++ b/algorithm/credits.h
@@ -0,0 +1,10 @@
+#ifndef CREDITS_H
+#define CREDITS_H
+
+#include "miner.h"
+
+
+extern int credits_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
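+/* Editorial sketch (not part of the original patch): CREDITS hashes a
+   168-byte block header with double SHA-256, and the nonce lives at byte
+   offset 140 (32-bit word 35), mirroring credits.c above:
+
+     uint32_t data[42], ohash[8];
+     be32enc_vect(data, (const uint32_t *)work->data, 42);
+     data[35] = htobe32(nonce);
+     credits_hash(ohash, data);
+*/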
+extern void credits_regenhash(struct work *work); + +#endif /* CREDITS_H */ diff --git a/algorithm/lyra2.c b/algorithm/lyra2.c index 6944b22f..42640e76 100644 --- a/algorithm/lyra2.c +++ b/algorithm/lyra2.c @@ -58,15 +58,19 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //========== Initializing the Memory Matrix and pointers to it =============// //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); - uint64_t *wholeMatrix = (uint64_t*)malloc(i); + uint64_t *wholeMatrix = malloc(i); if (wholeMatrix == NULL) { return -1; } memset(wholeMatrix, 0, i); //Allocates pointers to each row of the matrix - uint64_t **memMatrix = (uint64_t**)malloc(nRows * sizeof (uint64_t*)); + uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*)); if (memMatrix == NULL) { return -1; } @@ -118,7 +122,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //======================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t *state = (uint64_t*)malloc(16 * sizeof (uint64_t)); + uint64_t *state = malloc(16 * sizeof (uint64_t)); if (state == NULL) { return -1; } @@ -130,16 +134,16 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * ptrWord = wholeMatrix; for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil) } //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here - reducedDuplexRow1(state, memMatrix[0], memMatrix[1]); + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //updates the value of row* (deterministically picked during Setup)) @@ -172,7 +176,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //------------------------------------------------------------------------------------------ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //update prev: it now points to the last row ever computed prev = row; @@ -192,7 +196,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * absorbBlock(state, memMatrix[rowa]); //Squeezes the key - squeeze(state, (unsigned char*)K, kLen); + squeeze(state, K, kLen); //==========================================================================/ //========================= Freeing the memory =============================// diff --git a/algorithm/lyra2.h b/algorithm/lyra2.h index 13c7dbd3..798e6af1 100644 --- 
a/algorithm/lyra2.h
+++ b/algorithm/lyra2.h
@@ -37,14 +37,6 @@ typedef unsigned char byte;
   #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
 #endif
 
-#ifndef N_COLS
-  #define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default
-#endif
-
-#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
-#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
-
-
 int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
 
 #endif /* LYRA2_H_ */
diff --git a/algorithm/lyra2_old.c b/algorithm/lyra2_old.c
new file mode 100644
index 00000000..3b381987
--- /dev/null
+++ b/algorithm/lyra2_old.c
@@ -0,0 +1,208 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "lyra2_old.h"
+#include "sponge_old.h"
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
+ * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
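+ *
+ * (Editorial note, not in the original patch: in this tree the legacy
+ * lyra2re chain invokes this function as
+ *     LYRA2O(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+ * i.e. timeCost = 1, nRows = nCols = 8 -- see algorithm/lyra2re_old.c.)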
+ * + * @param K The derived key to be output by the algorithm + * @param kLen Desired key length + * @param pwd User password + * @param pwdlen Password length + * @param salt Salt + * @param saltlen Salt length + * @param timeCost Parameter to determine the processing time (T) + * @param nRows Number or rows of the memory matrix (R) + * @param nCols Number of columns of the memory matrix (C) + * + * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) + */ +int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) { + + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); + uint64_t *wholeMatrix = malloc(i); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, i); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*)); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + byte *ptrByte = (byte*) wholeMatrix; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &saltlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &timeCost, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nRows, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nCols, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the 
password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t *state = malloc(16 * sizeof (uint64_t)); + if (state == NULL) { + return -1; + } + initStateO(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2SafeO(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) + } + + //Initializes M[0] and M[1] + reducedSqueezeRow0O(state, memMatrix[0]); //The locally copied password is most likely overwritten here + reducedDuplexRow1O(state, memMatrix[0], memMatrix[1]); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + reducedDuplexRowSetupO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + + + //updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window where visited. + if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; + do { + //Selects a pseudorandom index row* + //------------------------------------------------------------------------------------------ + //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRowO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + //==========================================================================/ + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlockO(state, memMatrix[rowa]); + + //Squeezes the key + squeezeO(state, K, kLen); + //==========================================================================/ + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + //Wiping out the sponge's internal state before freeing it + memset(state, 0, 16 * sizeof (uint64_t)); + free(state); + //==========================================================================/ + + return 0; +} diff --git a/algorithm/lyra2_old.h b/algorithm/lyra2_old.h new file mode 100644 index 00000000..9dbe5668 --- /dev/null +++ b/algorithm/lyra2_old.h @@ -0,0 +1,50 @@ +/** + * Header file for the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#ifndef LYRA2OLD_H_
+#define LYRA2OLD_H_
+
+#include <stdint.h>
+
+typedef unsigned char byte;
+
+//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
+#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
+#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes
+
+
+#ifdef BLOCK_LEN_BITS
+  #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length: 768 bits (=96 bytes, =12 uint64_t)
+  #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes
+#else //default block length: 768 bits
+  #define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
+  #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
+#endif
+
+#ifndef N_COLS
+  #define N_COLS 8 //Number of columns in the memory matrix: fixed to 8 by default
+#endif
+
+#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
+#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
+
+
+int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
+
+#endif /* LYRA2OLD_H_ */
diff --git a/algorithm/lyra2re.c b/algorithm/lyra2re.c
index ba37094a..9d89853f 100644
--- a/algorithm/lyra2re.c
+++ b/algorithm/lyra2re.c
@@ -36,6 +36,8 @@
 #include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
 #include "sph/sph_keccak.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_cubehash.h"
 #include "lyra2.h"
 
 /*
@@ -55,9 +57,10 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
 inline void lyra2rehash(void *state, const void *input)
 {
   sph_blake256_context ctx_blake;
-  sph_groestl256_context ctx_groestl;
+  sph_bmw256_context ctx_bmw;
   sph_keccak256_context ctx_keccak;
   sph_skein256_context ctx_skein;
+  sph_cubehash256_context ctx_cube;
 
   uint32_t hashA[8], hashB[8];
 
@@ -72,17 +75,23 @@ inline void lyra2rehash(void *state, const void *input)
   sph_keccak256 (&ctx_keccak,hashA, 32);
   sph_keccak256_close(&ctx_keccak, hashB);
 
-  LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+  sph_cubehash256_init(&ctx_cube);
+  sph_cubehash256(&ctx_cube, hashB, 32);
+  sph_cubehash256_close(&ctx_cube, hashA);
 
+  LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
 
-  sph_skein256_init(&ctx_skein);
-  sph_skein256 (&ctx_skein, hashA, 32);
-  sph_skein256_close(&ctx_skein, hashB);
+  sph_skein256_init(&ctx_skein);
+  sph_skein256 (&ctx_skein, hashB, 32);
+  sph_skein256_close(&ctx_skein, hashA);
 
+  sph_cubehash256_init(&ctx_cube);
+  sph_cubehash256(&ctx_cube, hashA, 32);
+  sph_cubehash256_close(&ctx_cube, hashB);
 
-  sph_groestl256_init(&ctx_groestl);
-  sph_groestl256 (&ctx_groestl, hashB, 32);
-  sph_groestl256_close(&ctx_groestl, hashA);
+  sph_bmw256_init(&ctx_bmw);
+  sph_bmw256 (&ctx_bmw, hashB, 32);
+  sph_bmw256_close(&ctx_bmw, hashA);
 
 //printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]);
 
diff --git a/algorithm/lyra2re.h b/algorithm/lyra2re.h
index 8a58e747..8bc52ac4 100644
--- a/algorithm/lyra2re.h
+++ b/algorithm/lyra2re.h
@@ -2,6 +2,8 @@
 #define LYRA2RE_H
 
 #include "miner.h"
+#define LYRA_SCRATCHBUF_SIZE (1536) // matrix size [12][4][4] uint64_t or equivalent
+#define LYRA_SECBUF_SIZE (4) // (not used)
 
 extern int lyra2re_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
diff --git a/algorithm/lyra2re_old.c b/algorithm/lyra2re_old.c
new file mode 100644
index 00000000..3aa4be9a
--- /dev/null
+++ b/algorithm/lyra2re_old.c
@@ -0,0 +1,169 @@
+/*-
+ * Copyright 2014 James Lovejoy
+ * Copyright 2014 phm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "sph/sph_blake.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_keccak.h"
+#include "lyra2_old.h"
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form. Assumes len is a multiple of 4.
+ */
+static inline void
+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+  uint32_t i;
+
+  for (i = 0; i < len; i++)
+    dst[i] = htobe32(src[i]);
+}
+
+
+inline void lyra2rehash_old(void *state, const void *input)
+{
+  sph_blake256_context ctx_blake;
+  sph_groestl256_context ctx_groestl;
+  sph_keccak256_context ctx_keccak;
+  sph_skein256_context ctx_skein;
+
+  uint32_t hashA[8], hashB[8];
+
+  sph_blake256_init(&ctx_blake);
+  sph_blake256 (&ctx_blake, input, 80);
+  sph_blake256_close (&ctx_blake, hashA);
+
+
+
+
+  sph_keccak256_init(&ctx_keccak);
+  sph_keccak256 (&ctx_keccak,hashA, 32);
+  sph_keccak256_close(&ctx_keccak, hashB);
+
+  LYRA2O(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+
+
+  sph_skein256_init(&ctx_skein);
+  sph_skein256 (&ctx_skein, hashA, 32);
+  sph_skein256_close(&ctx_skein, hashB);
+
+
+  sph_groestl256_init(&ctx_groestl);
+  sph_groestl256 (&ctx_groestl, hashB, 32);
+  sph_groestl256_close(&ctx_groestl, hashA);
+
+//printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]);
+
+  memcpy(state, hashA, 32);
+}
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+
+/* Used externally as confirmation of correct OCL code */
+int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+  uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  uint32_t data[20], ohash[8];
+
+  be32enc_vect(data, (const uint32_t *)pdata, 19);
+  data[19] = htobe32(nonce);
+  lyra2rehash_old(ohash, data);
+  tmp_hash7 = be32toh(ohash[7]);
+
+  applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+    (long unsigned int)Htarg,
+    (long unsigned int)diff1targ,
+    (long unsigned int)tmp_hash7);
+  if (tmp_hash7 > diff1targ)
+    return -1;
+  if (tmp_hash7 > Htarg)
+    return 0;
+  return 1;
+}
+
+void 
lyra2reold_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + lyra2rehash_old(ohash, data); +} + +bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while(1) { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + lyra2rehash_old(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", + (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) { + *last_nonce = n; + break; + } + } + + return ret; +} + + + diff --git a/algorithm/lyra2re_old.h b/algorithm/lyra2re_old.h new file mode 100644 index 00000000..0788dfb3 --- /dev/null +++ b/algorithm/lyra2re_old.h @@ -0,0 +1,10 @@ +#ifndef LYRA2REOLD_H +#define LYRA2REOLD_H + +#include "miner.h" + +extern int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, + uint32_t nonce); +extern void lyra2reold_regenhash(struct work *work); + +#endif /* LYRA2RE_H */ diff --git a/algorithm/pluck.h b/algorithm/pluck.h index 7582554e..619eb013 100644 --- a/algorithm/pluck.h +++ b/algorithm/pluck.h @@ -3,6 +3,8 @@ #include "miner.h" #define PLUCK_SCRATCHBUF_SIZE (128 * 1024) +#define PLUCK_SECBUF_SIZE (64 * 1024) + extern int pluck_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); extern void pluck_regenhash(struct work *work); diff --git a/algorithm/sponge.c b/algorithm/sponge.c index e717a508..c788952a 100644 --- a/algorithm/sponge.c +++ b/algorithm/sponge.c @@ -158,11 +158,11 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { - uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] +void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, uint64_t nCols) { + uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] int i; //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { ptrWord[0] = state[0]; ptrWord[1] = state[1]; ptrWord[2] = state[2]; @@ -193,12 +193,12 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev][col]" 
state[0] ^= (ptrWordIn[0]); @@ -253,13 +253,13 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { * @param rowOut Row receiving the output * */ -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); @@ -327,13 +327,13 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, * @param rowOut Row receiving the output * */ -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); diff --git a/algorithm/sponge.h b/algorithm/sponge.h index 3fcff0d7..19822979 100644 --- a/algorithm/sponge.h +++ b/algorithm/sponge.h @@ -78,16 +78,16 @@ void initState(uint64_t state[/*16*/]); //---- Squeezes void squeeze(uint64_t *state, unsigned char *out, unsigned int len); -void reducedSqueezeRow0(uint64_t* state, uint64_t* row); +void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols); //---- Absorbs void absorbBlock(uint64_t *state, const uint64_t *in); void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); //---- Duplexes -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); //---- Misc void printArray(unsigned char *array, unsigned int size, char *name); diff --git a/algorithm/sponge_old.c b/algorithm/sponge_old.c new file mode 100644 index 00000000..7152687f --- /dev/null +++ b/algorithm/sponge_old.c @@ -0,0 +1,405 @@ +/** + * A simple implementation of Blake2b's internal permutation + * in the form of a sponge. + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "sponge_old.h"
+#include "lyra2_old.h"
+
+
+
+/**
+ * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
+ * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges
+ * typically have their internal state initialized with zeros, Blake2b's G function
+ * has a fixed point: if the internal state and message are both filled with zeros, the
+ * resulting permutation will always be a block filled with zeros; this happens because
+ * Blake2b does not use the constants originally employed in Blake2 inside its G function,
+ * relying on the IV for avoiding possible fixed points.
+ *
+ * @param state The 1024-bit array to be initialized
+ */
+void initStateO(uint64_t state[/*16*/]) {
+    //First 512 bits are zeros
+    memset(state, 0, 64);
+    //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
+    state[8] = blake2b_IV[0];
+    state[9] = blake2b_IV[1];
+    state[10] = blake2b_IV[2];
+    state[11] = blake2b_IV[3];
+    state[12] = blake2b_IV[4];
+    state[13] = blake2b_IV[5];
+    state[14] = blake2b_IV[6];
+    state[15] = blake2b_IV[7];
+}
+
+/**
+ * Execute Blake2b's G function, with all 12 rounds.
+ *
+ * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
+ */
+static void blake2bLyra(uint64_t *v) {
+    ROUND_LYRA(0);
+    ROUND_LYRA(1);
+    ROUND_LYRA(2);
+    ROUND_LYRA(3);
+    ROUND_LYRA(4);
+    ROUND_LYRA(5);
+    ROUND_LYRA(6);
+    ROUND_LYRA(7);
+    ROUND_LYRA(8);
+    ROUND_LYRA(9);
+    ROUND_LYRA(10);
+    ROUND_LYRA(11);
+}
+
+/**
+ * Executes a reduced version of Blake2b's G function with only one round
+ * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
+ */
+static void reducedBlake2bLyra(uint64_t *v) {
+    ROUND_LYRA(0);
+}
+
+/**
+ * Performs a squeeze operation, using Blake2b's G function as the
+ * internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param out Array that will receive the data squeezed
+ * @param len The number of bytes to be squeezed into the "out" array
+ */
+void squeezeO(uint64_t *state, byte *out, unsigned int len) {
+    int fullBlocks = len / BLOCK_LEN_BYTES;
+    byte *ptr = out;
+    int i;
+    //Squeezes full blocks
+    for (i = 0; i < fullBlocks; i++) {
+        memcpy(ptr, state, BLOCK_LEN_BYTES);
+        blake2bLyra(state);
+        ptr += BLOCK_LEN_BYTES;
+    }
+
+    //Squeezes remaining bytes
+    memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
+}
+
+/**
+ * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
+ * of type uint64_t), using Blake2b's G function as the internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param in The block to be absorbed (BLOCK_LEN_INT64 words)
+ */
+void absorbBlockO(uint64_t *state, const uint64_t *in) {
+    //XORs the first BLOCK_LEN_INT64 words of "in" with the current state
+    state[0] ^= in[0];
+    state[1] ^= in[1];
+    state[2] ^= in[2];
+    state[3] ^= in[3];
+    state[4] ^= in[4];
+    state[5] ^= in[5];
+    state[6] ^= in[6];
+    state[7] ^= in[7];
+    state[8] 
^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 + * words of type uint64_t), using Blake2b's G function as the internal permutation + * + * @param state The current state of the sponge + * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) + */ +void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in) { + //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs a reduced squeeze operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowOut Row to receive the data squeezed + */ +void reducedSqueezeRow0O(uint64_t* state, uint64_t* rowOut) { + uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + int i; + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < N_COLS; i++) { + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] = state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + } +} + +/** + * Performs a reduced duplex operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowIn Row to feed the sponge + * @param rowOut Row to receive the sponge's output + */ +void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev][col]" + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= (ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = 
ptrWordIn[11] ^ state[11]; + + + //Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left and N_COLS is a system parameter. + * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][col] = M[rowOut][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
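+ *
+ * (Editorial note, not in the original patch: in the body below, rotW is
+ * realized as a one-word shift of the 12-word block -- state[11] is XORed
+ * into ptrWordInOut[0], state[0] into ptrWordInOut[1], and so on.)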
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } +} + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + Prints an array of unsigned chars + */ +void printArrayO(unsigned char *array, unsigned int size, char *name) { + int i; + printf("%s: ", name); + for (i = 0; i < size; i++) { + printf("%2x|", array[i]); + } + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/algorithm/sponge_old.h b/algorithm/sponge_old.h new file mode 100644 index 00000000..c23781d3 --- /dev/null +++ b/algorithm/sponge_old.h @@ -0,0 +1,98 @@ +/** + * Header file for Blake2b's internal permutation in the form of a sponge. + * This code is based on the original Blake2b's implementation provided by + * Samuel Neves (https://blake2.net/) + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPONGEOLD_H_ +#define SPONGEOLD_H_ + +#include + +#if defined(__GNUC__) +#define ALIGN __attribute__ ((aligned(32))) +#elif defined(_MSC_VER) +#define ALIGN __declspec(align(32)) +#else +#define ALIGN +#endif + + +/*Blake2b IV Array*/ +static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/*Blake2b's rotation*/ +static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/*Blake2b's G function*/ +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) + + +/*One Round of the Blake2b's compression function*/ +#define ROUND_LYRA(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); + + +//---- Housekeeping +void initStateO(uint64_t state[/*16*/]); + +//---- Squeezes +void squeezeO(uint64_t *state, unsigned char *out, unsigned int len); +void reducedSqueezeRow0O(uint64_t* state, uint64_t* row); + +//---- Absorbs +void absorbBlockO(uint64_t *state, const uint64_t *in); +void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in); + +//---- Duplexes +void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); +void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); + +//---- Misc +void printArrayO(unsigned char *array, unsigned int size, char *name); + +//////////////////////////////////////////////////////////////////////////////////////////////// + + +#endif /* SPONGE_H_ */ diff --git a/algorithm/sysendian.h b/algorithm/sysendian.h new file mode 100644 index 00000000..31ac985f --- /dev/null +++ b/algorithm/sysendian.h @@ -0,0 +1,140 @@ +/*- + * Copyright 2007-2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
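(Editorial aside on the sponge_old.h hunk above, placed here only because of
how this excerpt is split: G is Blake2b's quarter-round and ROUND_LYRA one
full round over the 16-word state v. In the Lyra2 reference code the two
permutations called from sponge_old.c are conventionally built as

    void blake2bLyra(uint64_t *v)        { int r; for (r = 0; r < 12; r++) ROUND_LYRA(r); }
    void reducedBlake2bLyra(uint64_t *v) { ROUND_LYRA(0); }

i.e. the full transformation f applies twelve rounds and the "reduced-round"
f a single one; this sketch is an assumption about code not shown in this
excerpt.)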
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _SYSENDIAN_H_ +#define _SYSENDIAN_H_ + +/* If we don't have be64enc, the we have isn't usable. */ +#if !HAVE_DECL_BE64ENC +#undef HAVE_SYS_ENDIAN_H +#endif + +#ifdef HAVE_SYS_ENDIAN_H + +#include + +#else + +#include + +#if !HAVE_DECL_LE32DEC +static uint32_t le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} +#endif + +#if !HAVE_DECL_BE32ENC +static void be32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} +#endif + +#if !HAVE_DECL_BE32DEC +static uint32_t be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} +#endif + +#if !HAVE_DECL_LE32ENC +static void le32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} +#endif + +static uint64_t +be64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) + + ((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) + + ((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) + + ((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56)); +} + +static void +be64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[7] = x & 0xff; + p[6] = (x >> 8) & 0xff; + p[5] = (x >> 16) & 0xff; + p[4] = (x >> 24) & 0xff; + p[3] = (x >> 32) & 0xff; + p[2] = (x >> 40) & 0xff; + p[1] = (x >> 48) & 0xff; + p[0] = (x >> 56) & 0xff; +} + + + +static uint64_t +le64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) + + ((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) + + ((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) + + ((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56)); +} + +static void +le64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; + p[4] = (x >> 32) & 0xff; + p[5] = (x >> 40) & 0xff; + p[6] = (x >> 48) & 0xff; + p[7] = (x >> 56) & 0xff; +} +#endif /* !HAVE_SYS_ENDIAN_H */ + +#endif /* !_SYSENDIAN_H_ */ diff --git a/algorithm/yescrypt-opt.c b/algorithm/yescrypt-opt.c new file mode 100644 index 00000000..b54be469 --- /dev/null +++ b/algorithm/yescrypt-opt.c @@ -0,0 +1,1364 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander 
Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#ifdef __i386__ +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Enable at least SSE2 in the C compiler and use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (portability to older CPUs or testing)." +#elif defined(__x86_64__) +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (for testing only)." +#endif + +#include +#include +#include +#include "algorithm/yescrypt_core.h" +#include "sph/sha256_Y.h" +#include "algorithm/sysendian.h" + +// #include "sph/yescrypt-platform.c" +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + + +static void * +alloc_region(yescrypt_region_t * region, size_t size) +{ + size_t base_size = size; + uint8_t * base, *aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; + /* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. 
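+ * (Editorial example: with HUGEPAGE_SIZE = 2 MiB, hugepage_mask is 0x1FFFFF,
+ * so a request of 12 MiB + 1 byte becomes
+ *     new_size = ((12 MiB + 1) + 0x1FFFFF) & ~0x1FFFFF = 14 MiB,
+ * the next multiple of the huge page size.)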
+ */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } + else + if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } + else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? size : 0; + return aligned; +} + +static void init_region(yescrypt_region_t * region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int +free_region(yescrypt_region_t * region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} + +int +yescrypt_init_shared(yescrypt_shared_t * shared, +const uint8_t * param, size_t paramlen, +uint64_t N, uint32_t r, uint32_t p, +yescrypt_init_shared_flags_t flags, uint32_t mask, +uint8_t * buf, size_t buflen) +{ + yescrypt_shared1_t * shared1 = &shared->shared1; + yescrypt_shared_t dummy, half1, half2; + // yescrypt_shared_t * half2; + uint8_t salt[32]; + + if (flags & YESCRYPT_SHARED_PREALLOCATED) { + if (!shared1->aligned || !shared1->aligned_size) + return -1; + } + else { + init_region(shared1); + } + shared->mask1 = 1; + if (!param && !paramlen && !N && !r && !p && !buf && !buflen) + return 0; + + init_region(&dummy.shared1); + dummy.mask1 = 1; + if (yescrypt_kdf(&dummy, shared1, + param, paramlen, NULL, 0, N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + half1 = half2 = *shared; + half1.shared1.aligned_size /= 2; + half2.shared1.aligned_size = half1.shared1.aligned_size; + half2.shared1.aligned = (char*)half2.shared1.aligned + half1.shared1.aligned_size; + + N /= 2; + + if (p > 1 && yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half2, &half1.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + buf, buflen)) + goto out; + + shared->mask1 = mask; + + return 0; + +out: + if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) + free_region(shared1); + return -1; +} + +int +yescrypt_free_shared(yescrypt_shared_t * shared) +{ + return free_region(&shared->shared1); +} + +int +yescrypt_init_local(yescrypt_local_t * local) +{ + init_region(local); + return 0; +} + +int +yescrypt_free_local(yescrypt_local_t * local) +{ + return free_region(local); +} + + +static void +blkcpy(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ = *src++; *dest++ = 
*src++; + *dest++ = *src++; *dest++ = *src++; + } while (count -= 4); +}; + +static void +blkxor(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ ^= *src++; *dest++ ^= *src++; + *dest++ ^= *src++; *dest++ ^= *src++; + } while (count -= 4); +}; + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +} salsa20_blk_t; + +static void +salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static void +salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + COMBINE(0, 0, 6) + COMBINE(1, 5, 3) + COMBINE(2, 2, 0) + COMBINE(3, 7, 5) + COMBINE(4, 4, 2) + COMBINE(5, 1, 7) + COMBINE(6, 6, 4) + COMBINE(7, 3, 1) +#undef COMBINE +} + +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. + */ + +static void +salsa20_8(uint64_t B[8]) +{ + size_t i; + salsa20_blk_t X; + +#define x X.w + + salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X); + + for (i = 0; i < 8; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } +#undef x + + { + salsa20_blk_t Y; + salsa20_simd_shuffle(&X, &Y); + for (i = 0; i < 16; i += 4) { + ((salsa20_blk_t *)B)->w[i] += Y.w[i]; + ((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1]; + ((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2]; + ((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3]; + } + } +} + +/** + * blockmix_salsa8(Bin, Bout, X, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. The + * temporary space X must be 64 bytes. + */ +static void +blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r) +{ + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &Bin[(2 * r - 1) * 8], 8); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2 * r; i += 2) { + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4], X, 8); + + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8 + 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ + blkcpy(&Bout[i * 4 + r * 8], X, 8); + } + +} + +/* These are tunable */ +#define S_BITS 8 +#define S_SIMD 2 +#define S_P 4 +#define S_ROUNDS 6 + +/* Number of S-boxes. Not tunable, hard-coded in a few places. */ +#define S_N 2 + +/* Derived values. Not tunable on their own. */ +#define S_SIZE1 (1 << S_BITS) +#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) +#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) +#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD) +#define S_P_SIZE (S_P * S_SIMD) +#define S_MIN_R ((S_P * S_SIMD + 15) / 16) + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. + */ + +static void +block_pwxform(uint64_t * B, const uint64_t * S) +{ + uint64_t(*X)[S_SIMD] = (uint64_t(*)[S_SIMD])B; + const uint8_t *S0 = (const uint8_t *)S; + const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD); + size_t i, j; + + for (j = 0; j < S_P; j++) { + + uint64_t *Xj = X[j]; + uint64_t x0 = Xj[0]; + uint64_t x1 = Xj[1]; + + for (i = 0; i < S_ROUNDS; i++) { + uint64_t x = x0 & S_MASK2; + const uint64_t *p0, *p1; + + p0 = (const uint64_t *)(S0 + (uint32_t)x); + p1 = (const uint64_t *)(S1 + (x >> 32)); + + x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; + x0 += p0[0]; + x0 ^= p1[0]; + + x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1; + x1 += p0[1]; + x1 ^= p1[1]; + } + Xj[0] = x0; + Xj[1] = x1; + } + + + +} + + +/** + * blockmix_pwxform(Bin, Bout, S, r): + * Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + * + * S lacks const qualifier to match blockmix_salsa8()'s prototype, which we + * need to refer to both functions via the same function pointers. + */ +static void +blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r) +{ + size_t r1, r2, i; + // S_P_SIZE = 8; + /* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */ + + r1 = r * 128 / (S_P_SIZE * 8); + /* X <-- B_{r1 - 1} */ + blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE); + + /* X <-- X \xor B_i */ + blkxor(Bout, Bin, S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(Bout, S); + + /* for i = 0 to r1 - 1 do */ + for (i = 1; i < r1; i++) { + /* X <-- X \xor B_i */ + blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],S_P_SIZE); + blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(&Bout[i * S_P_SIZE], S); + } + + /* Handle partial blocks */ + if (i * S_P_SIZE < r * 16) { + blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],r * 16 - i * S_P_SIZE); +} + + i = (r1 - 1) * S_P_SIZE / 8; + /* Convert 128-byte blocks to 64-byte blocks */ + r2 = r * 2; + + /* B'_i <-- H(B'_i) */ + salsa20_8(&Bout[i * 8]); + + + i++; +/// not used yescrypt + + for (; i < r2; i++) { + /* B'_i <-- H(B'_i \xor B'_{i-1}) */ + blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8); + salsa20_8(&Bout[i * 8]); + } +} + + + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint64_t +integerify(const uint64_t * B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit + * word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also + * in host byte order, as it should be. + */ + const uint64_t * X = &B[(2 * r - 1) * 8]; + uint32_t lo = X[0]; + uint32_t hi = X[6] >> 32; + return ((uint64_t)hi << 32) + lo; +} + +/** + * smix1(B, r, N, flags, V, NROM, shared, XY, S): + * Compute first loop of B = SMix_r(B, N). 
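+ * (Editorial note on the S-box tunables defined further up: with S_BITS = 8,
+ * S_SIMD = 2, S_P = 4 the derived values come out as S_SIZE1 = 256,
+ * S_MASK = (256 - 1) * 2 * 8 = 0xFF0, S_P_SIZE = 8 and S_MIN_R = 1, so each
+ * pwxform block is eight 64-bit words and the two S-boxes together occupy
+ * S_SIZE_ALL = 2 * 256 * 2 = 1024 words, i.e. 8 KiB.)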
The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be even and + * no smaller than 2. + */ +static void +smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = (S ? blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 16 * r; + uint64_t * X = V; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? S : &XY[2 * s]; + uint64_t n, i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + + salsa20_simd_shuffle(tmp, dst); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + + blockmix(X, Y, Z, r); + blkcpy(&V[s], Y, s); + X = XY; + + if (NROM && (VROM_mask & 1)) { + if ((1 & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } + + blockmix(Y, X, Z, r); + + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i + 1 - n; + + /* X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + } + + blockmix(Y, X, Z, r); + } + } else { + yescrypt_flags_t rw = flags & YESCRYPT_RW; + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if (rw) { + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + if (rw) { + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(Y, r) & (n - 1); + j += (i + 1) - n; + + + /* X <-- X \xor V_j */ + blkxor(Y, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + } + } + + /* B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be a + * power of 2 greater than 1. 
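+ * (Editorial sketch of the wrapped indexing used by smix1 above: with
+ * n = p2floor(i), the largest power of two not exceeding i,
+ *
+ *     j = (integerify(X, r) & (n - 1)) + (i - n);
+ *
+ * maps the pseudorandom value onto [i - n, i), i.e. onto rows of V that
+ * have already been written.)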
The value Nloop must be even. + */ +static void +smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = + (S ? blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1 | 1; + size_t s = 16 * r; + yescrypt_flags_t rw = flags & YESCRYPT_RW; + uint64_t * X = XY; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? S : &XY[2 * s]; + uint64_t i, j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + if (NROM) { + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* 7: j <-- Integerify(X) mod N */ + j &= N - 1; + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + } + + blockmix(Y, X, Z, r); + } + } else { + + /* 6: for i = 0 to N - 1 do */ + i = Nloop / 2; + do { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + blockmix(Y, X, Z, r); + } while (--i); + } + + /* 10: B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is + * required with OpenMP-enabled builds). The value N must be a power of 2 + * greater than 1. 
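+ * (Editorial note: p2floor(), defined just above, repeatedly clears the
+ * lowest set bit until a single bit remains, returning the largest power of
+ * two not exceeding its argument -- e.g. p2floor(1) = 1, p2floor(12) = 8,
+ * p2floor(1023) = 512.)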
+ */ +static void +smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + + if (Sp) + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM,Sp, NROM, shared, XYp, NULL); + + + + if (!(flags & __YESCRYPT_INIT_SHARED_2)) + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + + + + } + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw,flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + + } + } + + + + +} + +static void +smix_old(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, +yescrypt_flags_t flags, +uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, +uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } + else { + Nloop_all *= t - 1; + } + } + else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + + if (Sp) { + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM, Sp, NROM, shared, XYp, NULL); + + + } + if (!(flags & __YESCRYPT_INIT_SHARED_2)) { + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + } + + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + } + + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? 
&S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw, flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, * V, * XY, * S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. + */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * 
sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,(uint8_t *)B, B_size); + + if (t || flags) + { + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + } else { + uint32_t i; + /* 2: for i = 0 to p - 1 do */ + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ + smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V, NROM, shared, XY, S); + } + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + +int +yescrypt_kdf_old(const yescrypt_shared_t * shared, yescrypt_local_t * local, +const uint8_t * passwd, size_t passwdlen, +const uint8_t * salt, size_t saltlen, +uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, +uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, *V, *XY, *S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. 
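+ * (Editorial summary of the yescrypt_kdf() flow implemented above, with
+ * error handling elided:
+ *
+ *     if (t || flags) passwd = SHA256(passwd);            // pre-hash
+ *     PBKDF2-SHA256(passwd, salt, 1 iter)  -> B           // fill blocks
+ *     smix(B, r, N, p, t, flags, ...)                     // memory-hard core
+ *     PBKDF2-SHA256(passwd, B, 1 iter)     -> buf         // derive output
+ *     if ((t || flags) && buflen == 32)
+ *         buf = SHA256(HMAC-SHA256(key=buf, salt))        // SCRAM-style finish
+ *
+ * Note also the recurring guard in the size computations,
+ *     need += X_size; if (need < X_size) { errno = ENOMEM; return -1; }
+ * which catches size_t overflow: on unsigned wraparound the sum ends up
+ * smaller than the addend just added.)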
+ */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r)* (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } + else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, (uint8_t *)B, B_size); + + + if (t || flags) + { + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. 
The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + diff --git a/algorithm/yescrypt.c b/algorithm/yescrypt.c new file mode 100644 index 00000000..de00d0f3 --- /dev/null +++ b/algorithm/yescrypt.c @@ -0,0 +1,128 @@ +/*- + * Copyright 2015 djm34 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "miner.h" + +#include +#include +#include + +#include "algorithm/yescrypt_core.h" + +static const uint32_t diff1targ = 0x0000ffff; + +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + +/* Used externally as confirmation of correct OCL code */ +int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + yescrypt_hash((unsigned char*)data,(unsigned char*)ohash); + + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + + if (tmp_hash7 > diff1targ) + return -1; + + if (tmp_hash7 > Htarg) + return 0; + + return 1; +} + +void yescrypt_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + + yescrypt_hash((unsigned char*)data, (unsigned char*)ohash); + +} + + +bool scanhash_yescrypt(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while (1) + { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + + yescrypt_hash((unsigned char*)data, (unsigned char*)ostate); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) + { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) + { + *last_nonce = n; + break; + } + } + + return ret; +} \ No newline at end of file diff --git a/algorithm/yescrypt.h b/algorithm/yescrypt.h new file mode 100644 index 00000000..b51cb495 --- /dev/null +++ b/algorithm/yescrypt.h @@ -0,0 +1,10 @@ +#ifndef YESCRYPT_H +#define YESCRYPT_H + +#include "miner.h" +#define YESCRYPT_SCRATCHBUF_SIZE (128 * 2048 * 8 ) //uchar +#define YESCRYP_SECBUF_SIZE (128*64*8) +extern int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +extern void yescrypt_regenhash(struct work *work); + +#endif /* YESCRYPT_H */ diff --git a/algorithm/yescrypt_core.h b/algorithm/yescrypt_core.h new file mode 100644 index 00000000..64b9a11f --- /dev/null +++ b/algorithm/yescrypt_core.h @@ -0,0 +1,376 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _YESCRYPT_H_ +#define _YESCRYPT_H_ + +#include +#include /* for size_t */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +//extern void yescrypt_hash_sp(const unsigned char *input, unsigned char *output); +extern void yescrypt_hash(const unsigned char *input, unsigned char *output); + + + +/** + * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen) and write the result into buf. The parameters r, p, and buflen + * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N + * must be a power of 2 greater than 1. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, + uint8_t * __buf, size_t __buflen); + +/** + * Internal type used by the memory allocator. Please do not use it directly. + * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since + * they might differ from each other in a future version. + */ +typedef struct { + void * base, * aligned; + size_t base_size, aligned_size; +} yescrypt_region_t; + +/** + * Types for shared (ROM) and thread-local (RAM) data structures. + */ +typedef yescrypt_region_t yescrypt_shared1_t; +typedef struct { + yescrypt_shared1_t shared1; + uint32_t mask1; +} yescrypt_shared_t; +typedef yescrypt_region_t yescrypt_local_t; + +/** + * Possible values for yescrypt_init_shared()'s flags argument. + */ +typedef enum { + YESCRYPT_SHARED_DEFAULTS = 0, + YESCRYPT_SHARED_PREALLOCATED = 0x100 +} yescrypt_init_shared_flags_t; + +/** + * Possible values for the flags argument of yescrypt_kdf(), + * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, + * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. + * Please refer to the description of yescrypt_kdf() below for the meaning of + * these flags. 
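+ * (Editorial example: a typical combination is
+ *     yescrypt_flags_t f = YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM;
+ * YESCRYPT_WORM is defined as 0 below, i.e. it is the absence of the
+ * YESCRYPT_RW bit rather than a flag of its own, which is why the two are
+ * mutually exclusive.)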
+ */ +typedef enum { +/* public */ + YESCRYPT_WORM = 0, + YESCRYPT_RW = 1, + YESCRYPT_PARALLEL_SMIX = 2, + YESCRYPT_PWXFORM = 4, +/* private */ + __YESCRYPT_INIT_SHARED_1 = 0x10000, + __YESCRYPT_INIT_SHARED_2 = 0x20000, + __YESCRYPT_INIT_SHARED = 0x30000 +} yescrypt_flags_t; + +#define YESCRYPT_KNOWN_FLAGS \ + (YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \ + __YESCRYPT_INIT_SHARED) + +/** + * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask, + * buf, buflen): + * Optionally allocate memory for and initialize the shared (ROM) data + * structure. The parameters N, r, and p must satisfy the same conditions as + * with crypto_scrypt(). param and paramlen specify a local parameter with + * which the ROM is seeded. If buf is not NULL, then it is used to return + * buflen bytes of message digest for the initialized ROM (the caller may use + * this to verify that the ROM has been computed in the same way that it was on + * a previous run). + * + * Return 0 on success; or -1 on error. + * + * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the + * ROM is assumed to have been preallocated by the caller, with + * shared->shared1.aligned being the start address of the ROM and + * shared->shared1.aligned_size being its size (which must be consistent with + * N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV + * shared memory segment allocated by the caller. + * + * mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it + * should be set to 1, to interleave RAM and ROM accesses, which works well + * when both regions reside in the machine's RAM anyway. Other values may be + * used e.g. when the ROM is memory-mapped from a disk file. Recommended mask + * values are powers of 2 minus 1 or minus 2. Here's the effect of some mask + * values: + * mask value ROM accesses in SMix 1st loop ROM accesses in SMix 2nd loop + * 0 0 1/2 + * 1 1/2 1/2 + * 2 0 1/4 + * 3 1/4 1/4 + * 6 0 1/8 + * 7 1/8 1/8 + * 14 0 1/16 + * 15 1/16 1/16 + * 1022 0 1/1024 + * 1023 1/1024 1/1024 + * + * Actual computation of the ROM contents may be avoided, if you don't intend + * to use a ROM but need a dummy shared structure, by calling this function + * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the + * arguments starting with param and on. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_init_shared(yescrypt_shared_t * __shared, + const uint8_t * __param, size_t __paramlen, + uint64_t __N, uint32_t __r, uint32_t __p, + yescrypt_init_shared_flags_t __flags, uint32_t __mask, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_free_shared(shared): + * Free memory that had been allocated with yescrypt_init_shared(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_free_shared(yescrypt_shared_t * __shared); + +/** + * yescrypt_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_init_local(yescrypt_local_t * __local); + +/** + * yescrypt_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. 
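+ * (Editorial sketch: per the yescrypt_init_shared() description above, the
+ * "dummy" shared structure needed for plain scrypt-style use is obtained by
+ *
+ *     yescrypt_shared_t shared;
+ *     yescrypt_init_shared(&shared, NULL, 0, 0, 0, 0,
+ *                          YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0);
+ *
+ * which skips ROM computation entirely.)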
+ */ +extern int yescrypt_free_local(yescrypt_local_t * __local); + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters N, r, p, and buflen must satisfy + * the same conditions as with crypto_scrypt(). t controls computation time + * while not affecting peak memory usage. shared and flags may request + * special modes as described below. local is the thread-local data + * structure, allowing to preserve and reuse a memory allocation across calls, + * thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + * + * t controls computation time. t = 0 is optimal in terms of achieving the + * highest area-time for ASIC attackers. Thus, higher computation time, if + * affordable, is best achieved by increasing N rather than by increasing t. + * However, if the higher memory usage (which goes along with higher N) is not + * affordable, or if fine-tuning of the time is needed (recall that N must be a + * power of 2), then t = 1 or above may be used to increase time while staying + * at the same peak memory usage. t = 1 increases the time by 25% and + * decreases the normalized area-time to 96% of optimal. (Of course, in + * absolute terms the area-time increases with higher t. It's just that it + * would increase slightly more with higher N*r rather than with higher t.) + * t = 2 increases the time by another 20% and decreases the normalized + * area-time to 89% of optimal. Thus, these two values are reasonable to use + * for fine-tuning. Values of t higher than 2 result in further increase in + * time while reducing the efficiency much further (e.g., down to around 50% of + * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact + * numbers varying by the flags settings). + * + * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and + * passing a dummy shared structure (see the description of + * yescrypt_init_shared() above for how to produce one). In this mode, the + * thread-local memory region (RAM) is first sequentially written to and then + * randomly read from. This algorithm is friendly towards time-memory + * tradeoffs (TMTO), available both to defenders (albeit not in this + * implementation) and to attackers. + * + * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local + * memory region (RAM), which makes TMTO a lot less efficient. This may be + * used to slow down the kinds of attackers who would otherwise benefit from + * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not + * only for the tradeoff, but also for a decrease of attacker's area-time (by + * up to a constant factor), setting YESCRYPT_RW substantially increases the + * cost of attacks in area-time terms as well. Yet another benefit of it is + * that optimal area-time is reached at an earlier time than with classic + * scrypt, and t = 0 actually corresponds to this earlier completion time, + * resulting in quicker hash computations (and thus in higher request rate + * capacity). Due to these properties, YESCRYPT_RW should almost always be + * set, except when compatibility with classic scrypt or TMTO-friendliness are + * desired. + * + * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a + * lower level as compared to where it is in classic scrypt. 
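+ * (Editorial sketch of the classic-scrypt usage described earlier in this
+ * comment, assuming illustrative parameters N = 16384, r = 8, p = 1 and the
+ * dummy shared structure from yescrypt_init_shared():
+ *
+ *     yescrypt_local_t local;
+ *     uint8_t dk[32];
+ *     yescrypt_init_local(&local);
+ *     yescrypt_kdf(&shared, &local, passwd, passwdlen, salt, saltlen,
+ *                  16384, 8, 1, 0, YESCRYPT_WORM, dk, sizeof(dk));
+ *     yescrypt_free_local(&local);
+ * )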
This reduces + * flexibility for efficient computation (for both attackers and defenders) by + * requiring that, short of resorting to TMTO, the full amount of memory be + * allocated as needed for the specified p, regardless of whether that + * parallelism is actually being fully made use of or not. (For comparison, a + * single instance of classic scrypt may be computed in less memory without any + * CPU time overhead, but in more real time, by not making full use of the + * parallelism.) This may be desirable when the defender has enough memory + * with sufficiently low latency and high bandwidth for efficient full parallel + * execution, yet the required memory size is high enough that some likely + * attackers might end up being forced to choose between using higher latency + * memory than they could use otherwise (waiting for data longer) or using TMTO + * (waiting for data more times per one hash computation). The area-time cost + * for other kinds of attackers (who would use the same memory type and TMTO + * factor or no TMTO either way) remains roughly the same, given the same + * running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as + * long as the defender has enough memory that is just as fast as the smaller + * per-thread regions would be, doesn't expect to ever need greater + * flexibility (except possibly via TMTO), and doesn't need backwards + * compatibility with classic scrypt, there are no other serious drawbacks to + * this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO, + * this new approach to parallelization makes TMTO less inefficient. (This is + * an unfortunate side-effect of avoiding some random writes, as we have to in + * order to allow for parallel threads to access a common memory region without + * synchronization overhead.) Thus, in this mode this setting poses an extra + * tradeoff of its own (higher area-time cost for a subset of attackers vs. + * better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the + * way the running time is to be controlled from N*r*p (for classic scrypt) to + * N*r (in this modification). All of this applies only when p > 1. For + * p = 1, this setting is a no-op. + * + * Passing a real shared structure, with ROM contents previously computed by + * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for + * the thread-local RAM region. In order to allow for initialization of the + * ROM to be split into a separate program, the shared->shared1.aligned and + * shared->shared1.aligned_size fields may be set by the caller of + * yescrypt_kdf() manually rather than with yescrypt_init_shared(). + * + * local must be initialized with yescrypt_init_local(). + * + * MT-safe as long as local and buf are local to the thread. + */ +extern int yescrypt_kdf(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, + yescrypt_flags_t __flags, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. If the shared structure is + * not dummy, a ROM is used and YESCRYPT_RW is required. 
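+ *
+ * Editorial illustration (not part of the original patch; buffer sizes and
+ * cost values are examples only, and salt_bytes/dummy_shared/local stand in
+ * for caller-provided data): a typical call pairs this function with
+ * yescrypt_gensalt_r() declared below, using a dummy shared structure and a
+ * local structure initialized as described above:
+ *
+ *	uint8_t setting[64], out[128];
+ *	yescrypt_gensalt_r(11, 8, 1, YESCRYPT_RW,
+ *	    salt_bytes, salt_bytes_len, setting, sizeof(setting));
+ *	yescrypt_r(&dummy_shared, &local, passwd, passwdlen,
+ *	    setting, out, sizeof(out));
+ *
+ * On success, out holds the full "$7X$..." hash string; both calls return
+ * NULL on error and should be checked.
+ *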
Otherwise, whether to + * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. shared and + * local must be initialized as described above for yescrypt_kdf(). buf must + * be large enough (as indicated by buflen) to hold the encoded hash string. + * + * Return the encoded hash string on success; or NULL on error. + * + * MT-safe as long as local and buf are local to the thread. + */ +extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __setting, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt(passwd, setting): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. Whether to use the + * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. + * + * Return the encoded hash string on success; or NULL on error. + * + * This is a crypt(3)-like interface, which is simpler to use than + * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, + * and it is slower than yescrypt_r() for repeated calls because it allocates + * and frees memory on each call. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting); + +/** + * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): + * Generate a setting string for use with yescrypt_r() and yescrypt() by + * encoding into it the parameters N_log2 (which is to be set to base 2 + * logarithm of the desired value for N), r, p, flags, and a salt given by src + * (of srclen bytes). buf must be large enough (as indicated by buflen) to + * hold the setting string. + * + * Return the setting string on success; or NULL on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern uint8_t * yescrypt_gensalt_r( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): + * Generate a setting string for use with yescrypt_r() and yescrypt(). This + * function is the same as yescrypt_gensalt_r() except that it uses a static + * buffer and thus is not MT-safe. + * + * Return the setting string on success; or NULL on error. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt_gensalt( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen); + +#ifdef __cplusplus +} +#endif + +#endif /* !_YESCRYPT_H_ */ diff --git a/algorithm/yescryptcommon.c b/algorithm/yescryptcommon.c new file mode 100644 index 00000000..cf7067d0 --- /dev/null +++ b/algorithm/yescryptcommon.c @@ -0,0 +1,360 @@ +/*- + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include "algorithm/yescrypt_core.h"
+
+/* Number of base-64 characters needed to encode the given byte count */
+#define BYTES2CHARS(bytes) \
+	((((bytes) * 8) + 5) / 6)
+
+#define HASH_SIZE 32 /* bytes */
+#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */
+#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM)
+
+static const char * const itoa64 =
+	"./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+/* Emit src as base-64, 6 bits at a time, least-significant bits first */
+static uint8_t * encode64_uint32(uint8_t * dst, size_t dstlen,
+    uint32_t src, uint32_t srcbits)
+{
+	uint32_t bit;
+
+	for (bit = 0; bit < srcbits; bit += 6) {
+		if (dstlen < 1)
+			return NULL;
+		*dst++ = itoa64[src & 0x3f];
+		dstlen--;
+		src >>= 6;
+	}
+
+	return dst;
+}
+
+static uint8_t * encode64(uint8_t * dst, size_t dstlen,
+    const uint8_t * src, size_t srclen)
+{
+	size_t i;
+
+	for (i = 0; i < srclen; ) {
+		uint8_t * dnext;
+		uint32_t value = 0, bits = 0;
+		do {
+			value |= (uint32_t)src[i++] << bits;
+			bits += 8;
+		} while (bits < 24 && i < srclen);
+		dnext = encode64_uint32(dst, dstlen, value, bits);
+		if (!dnext)
+			return NULL;
+		dstlen -= dnext - dst;
+		dst = dnext;
+	}
+
+	return dst;
+}
+
+/* Map a base-64 character back to its 6-bit value; -1 if invalid */
+static int decode64_one(uint32_t * dst, uint8_t src)
+{
+	const char * ptr = strchr(itoa64, src);
+	if (ptr) {
+		*dst = ptr - itoa64;
+		return 0;
+	}
+	*dst = 0;
+	return -1;
+}
+
+/* Inverse of encode64_uint32() */
+static const uint8_t * decode64_uint32(uint32_t * dst, uint32_t dstbits,
+    const uint8_t * src)
+{
+	uint32_t bit;
+	uint32_t value;
+
+	value = 0;
+	for (bit = 0; bit < dstbits; bit += 6) {
+		uint32_t one;
+		if (decode64_one(&one, *src)) {
+			*dst = 0;
+			return NULL;
+		}
+		src++;
+		value |= one << bit;
+	}
+
+	*dst = value;
+	return src;
+}
+
+uint8_t *
+yescrypt_r(const yescrypt_shared_t * shared, yescrypt_local_t * local,
+    const uint8_t * passwd, size_t passwdlen,
+    const uint8_t * setting,
+    uint8_t * buf, size_t buflen)
+{
+	uint8_t hash[HASH_SIZE];
+	const uint8_t * src, * salt;
+	uint8_t * dst;
+	size_t prefixlen, saltlen, need;
+	uint8_t version;
+	uint64_t N;
+	uint32_t r, p;
+	yescrypt_flags_t flags = YESCRYPT_WORM;
+
+	if (setting[0] != '$' || setting[1] != '7')
+		return NULL;
+
+	src = setting + 2;
+
+	switch ((version = *src)) {
+	case '$':
+		break;
+	case 'X':
+		src++;
+		flags = YESCRYPT_RW;
+		break;
+	default:
+		return NULL;
+	}
+
+	if (*src != '$') {
+		uint32_t decoded_flags;
+		if (decode64_one(&decoded_flags, *src))
+			return NULL;
+		flags = decoded_flags;
+		if (*++src != '$')
+			return NULL;
+	}
+	src++;
+
+	{
+		uint32_t N_log2;
+		if (decode64_one(&N_log2, *src))
+			return NULL;
+		src++;
+		N = (uint64_t)1 << N_log2;
+	}
+
+	src = decode64_uint32(&r, 30, src);
+	if (!src)
+		return NULL;
+
+	src = decode64_uint32(&p, 30, src);
+	if (!src)
+		return NULL;
+
+	prefixlen = src - setting;
+
+	salt = src;
+	src = (uint8_t *)strrchr((char *)salt, '$');
+	if (src)
saltlen = src - salt;
+	else
+		saltlen = strlen((char *)salt);
+
+	need = prefixlen + saltlen + 1 + HASH_LEN + 1;
+	if (need > buflen || need < saltlen)
+		return NULL;
+
+	if (yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
+	    N, r, p, 0, flags, hash, sizeof(hash)))
+		return NULL;
+
+	dst = buf;
+	memcpy(dst, setting, prefixlen + saltlen);
+	dst += prefixlen + saltlen;
+	*dst++ = '$';
+
+	dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash));
+	/* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its
+	 * memory allocations yet anyway. */
+	if (!dst || dst >= buf + buflen) /* Can't happen */
+		return NULL;
+
+	*dst = 0; /* NUL termination */
+	return buf;
+}
+
+uint8_t *
+yescrypt(const uint8_t * passwd, const uint8_t * setting)
+{
+	static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1];
+	yescrypt_shared_t shared;
+	yescrypt_local_t local;
+	uint8_t * retval;
+	if (yescrypt_init_shared(&shared, NULL, 0,
+	    0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
+		return NULL;
+	if (yescrypt_init_local(&local)) {
+		yescrypt_free_shared(&shared);
+		return NULL;
+	}
+	/* passwdlen is fixed at 80 bytes here: this interface is only used by
+	 * the miner on 80-byte block headers (see yescrypt_hash() below). */
+	retval = yescrypt_r(&shared, &local,
+	    passwd, 80, setting, buf, sizeof(buf));
+	// printf("hash='%s'\n", (char *)retval);
+	if (yescrypt_free_local(&local)) {
+		yescrypt_free_shared(&shared);
+		return NULL;
+	}
+	if (yescrypt_free_shared(&shared))
+		return NULL;
+	return retval;
+}
+
+uint8_t *
+yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p,
+    yescrypt_flags_t flags,
+    const uint8_t * src, size_t srclen,
+    uint8_t * buf, size_t buflen)
+{
+	uint8_t * dst;
+	size_t prefixlen = 3 + 1 + 5 + 5;
+	size_t saltlen = BYTES2CHARS(srclen);
+	size_t need;
+
+	if (p == 1)
+		flags &= ~YESCRYPT_PARALLEL_SMIX;
+
+	if (flags) {
+		if (flags & ~0x3f)
+			return NULL;
+
+		prefixlen++;
+		if (flags != YESCRYPT_RW)
+			prefixlen++;
+	}
+
+	need = prefixlen + saltlen + 1;
+	if (need > buflen || need < saltlen || saltlen < srclen)
+		return NULL;
+
+	if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30)))
+		return NULL;
+
+	dst = buf;
+	*dst++ = '$';
+	*dst++ = '7';
+	if (flags) {
+		*dst++ = 'X'; /* eXperimental, subject to change */
+		if (flags != YESCRYPT_RW)
+			*dst++ = itoa64[flags];
+	}
+	*dst++ = '$';
+
+	*dst++ = itoa64[N_log2];
+
+	dst = encode64_uint32(dst, buflen - (dst - buf), r, 30);
+	if (!dst) /* Can't happen */
+		return NULL;
+
+	dst = encode64_uint32(dst, buflen - (dst - buf), p, 30);
+	if (!dst) /* Can't happen */
+		return NULL;
+
+	dst = encode64(dst, buflen - (dst - buf), src, srclen);
+	if (!dst || dst >= buf + buflen) /* Can't happen */
+		return NULL;
+
+	*dst = 0; /* NUL termination */
+
+	return buf;
+}
+
+uint8_t *
+yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p,
+    yescrypt_flags_t flags,
+    const uint8_t * src, size_t srclen)
+{
+	static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1];
+	return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen,
+	    buf, sizeof(buf));
+}
+
+static int
+yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
+    const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
+    uint8_t * buf, size_t buflen)
+{
+	static __thread int initialized = 0;
+	static __thread yescrypt_shared_t shared;
+	static __thread yescrypt_local_t local;
+
+// static __declspec(thread) int initialized = 0;	/* MSVC equivalents */
+// static __declspec(thread) yescrypt_shared_t shared;
+// static __declspec(thread) yescrypt_local_t local;
+
+	int retval;
+	if
(!initialized) { +/* "shared" could in fact be shared, but it's simpler to keep it private + * along with "local". It's dummy and tiny anyway. */ + if (yescrypt_init_shared(&shared, NULL, 0, + 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) + return -1; + if (yescrypt_init_local(&local)) { + yescrypt_free_shared(&shared); + return -1; + } + initialized = 1; + } + retval = yescrypt_kdf(&shared, &local, + passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS, + buf, buflen); + + return retval; +} + +void yescrypt_hash(const unsigned char *input, unsigned char *output) +{ + + yescrypt_bsty((const uint8_t *)input, 80, (const uint8_t *) input, 80, 2048, 8, 1, (uint8_t *)output, 32); +} diff --git a/driver-opencl.c b/driver-opencl.c index 72ee9559..f0a88a6f 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -257,14 +257,14 @@ char *set_gpu_threads(const char *_arg) if (nextptr == NULL) return "Invalid parameters for set_gpu_threads"; val = atoi(nextptr); - if (val < 1 || val > 10) + if (val < 1 || val > 20) // gpu_threads increase max value to 20 return "Invalid value passed to set_gpu_threads"; gpus[device++].threads = val; while ((nextptr = strtok(NULL, ",")) != NULL) { val = atoi(nextptr); - if (val < 1 || val > 10) + if (val < 1 || val > 20) // gpu_threads increase max value to 20 return "Invalid value passed to set_gpu_threads"; gpus[device++].threads = val; @@ -1472,6 +1472,9 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, } applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id); postcalc_hash_async(thr, work, thrdata->res); +// postcalc_hash(thr); +// submit_tested_work(thr, work); +// submit_work_async(work); memset(thrdata->res, 0, buffersize); /* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */ clFinish(clState->commandQueue); @@ -1493,6 +1496,12 @@ static void opencl_thread_shutdown(struct thr_info *thr) clFinish(clState->commandQueue); clReleaseMemObject(clState->outputBuffer); clReleaseMemObject(clState->CLbuffer0); + if (clState->buffer1) + clReleaseMemObject(clState->buffer1); + if (clState->buffer2) + clReleaseMemObject(clState->buffer2); + if (clState->buffer3) + clReleaseMemObject(clState->buffer3); if (clState->padbuffer8) clReleaseMemObject(clState->padbuffer8); clReleaseKernel(clState->kernel); diff --git a/findnonce.c b/findnonce.c index c2402997..8858cfa6 100644 --- a/findnonce.c +++ b/findnonce.c @@ -214,6 +214,7 @@ static void *postcalc_hash(void *userdata) void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res) { + struct pc_data *pcd = (struct pc_data *)malloc(sizeof(struct pc_data)); int buffersize; @@ -225,8 +226,7 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res) pcd->thr = thr; pcd->work = copy_work(work); buffersize = BUFFERSIZE; - - memcpy(&pcd->res, res, buffersize); + memcpy(&pcd->res, res, buffersize); if (pthread_create(&pcd->pth, NULL, postcalc_hash, (void *)pcd)) { applog(LOG_ERR, "Failed to create postcalc_hash thread"); @@ -366,4 +366,4 @@ void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) blk->cty_a = data[16]; blk->cty_b = data[17]; blk->cty_c = data[18]; -} \ No newline at end of file +} diff --git a/kernel/bmw256.cl b/kernel/bmw256.cl new file mode 100644 index 00000000..19c85cbc --- /dev/null +++ b/kernel/bmw256.cl @@ -0,0 +1,162 @@ +/* +* bmw256 kernel implementation. 
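+*
+* (Editorial note: Blue Midnight Wish reduced to a 256-bit digest; the
+* Compression256() function defined below is consumed by the final search6
+* stage of kernel/lyra2rev2.cl, which #includes this file.)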
+* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2015 djm34 +* +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + + + +#define shl(x, n) ((x) << (n)) +#define shr(x, n) ((x) >> (n)) +//#define SHR(x, n) SHR2(x, n) +//#define SHL(x, n) SHL2(x, n) + + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) +#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) +#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23)) +#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) +#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) +#define ss4(x) (shr((x), 1) ^ (x)) +#define ss5(x) (shr((x), 2) ^ (x)) +#define rs1(x) SPH_ROTL32((x), 3) +#define rs2(x) SPH_ROTL32((x), 7) +#define rs3(x) SPH_ROTL32((x), 13) +#define rs4(x) SPH_ROTL32((x), 16) +#define rs5(x) SPH_ROTL32((x), 19) +#define rs6(x) SPH_ROTL32((x), 23) +#define rs7(x) SPH_ROTL32((x), 27) + +/* Message expansion function 1 */ +uint expand32_1(int i, uint *M32, uint *H, uint *Q) +{ + + return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) + + ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) + + ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) + + ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + +} + +/* Message expansion function 2 */ +uint expand32_2(int i, uint *M32, uint *H, uint *Q) +{ + + return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13]) + + Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9]) + + Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5]) + + Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + +} + +void Compression256(uint *M32, uint *H) +{ + + int i; + uint XL32, XH32, Q[32]; + + + Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); + Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - 
(M32[15] ^ H[15]);
+	Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
+	Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
+	Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
+	Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
+	Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
+	Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
+	Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
+	Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
+	Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
+	Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
+	Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
+	Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
+	Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
+	Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);
+
+	/* Diffuse the differences in every word in a bijective manner with ss_i, and then add the values of the previous double pipe. */
+	Q[0] = ss0(Q[0]) + H[1];
+	Q[1] = ss1(Q[1]) + H[2];
+	Q[2] = ss2(Q[2]) + H[3];
+	Q[3] = ss3(Q[3]) + H[4];
+	Q[4] = ss4(Q[4]) + H[5];
+	Q[5] = ss0(Q[5]) + H[6];
+	Q[6] = ss1(Q[6]) + H[7];
+	Q[7] = ss2(Q[7]) + H[8];
+	Q[8] = ss3(Q[8]) + H[9];
+	Q[9] = ss4(Q[9]) + H[10];
+	Q[10] = ss0(Q[10]) + H[11];
+	Q[11] = ss1(Q[11]) + H[12];
+	Q[12] = ss2(Q[12]) + H[13];
+	Q[13] = ss3(Q[13]) + H[14];
+	Q[14] = ss4(Q[14]) + H[15];
+	Q[15] = ss0(Q[15]) + H[0];
+
+	/* This is the Message expansion, f_1 in the documentation. */
+	/* It has 16 rounds. */
+	/* Blue Midnight Wish has two tunable security parameters, */
+	/* named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */
+	/* The following relation for these parameters should be satisfied: */
+	/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
+#pragma unroll
+	for (i = 0; i < 2; i++)
+		Q[i + 16] = expand32_1(i + 16, M32, H, Q);
+
+#pragma unroll
+	for (i = 2; i < 16; i++)
+		Q[i + 16] = expand32_2(i + 16, M32, H, Q);
+
+	/* Blue Midnight Wish has two temporary cumulative variables that */
+	/* accumulate, via XOR, the 16 new variables produced in the Message */
+	/* Expansion part. */
+	XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
+	XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
+
+	/* This part is the function f_2 in the documentation. */
+
+	/* Compute the double chaining pipe for the next message block.
*/
+	H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
+	H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
+	H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
+	H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
+	H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
+	H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
+	H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
+	H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
+
+	H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
+	H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
+	H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
+	H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
+	H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
+	H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
+	H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
+	H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
+
+}
 diff --git a/kernel/credits.cl b/kernel/credits.cl
 new file mode 100644
 index 00000000..19cbea67
 --- /dev/null
 +++ b/kernel/credits.cl
 @@ -0,0 +1,232 @@
+/*
+* "credits" kernel implementation.
+*
+* ==========================(LICENSE BEGIN)============================
+*
+* Copyright (c) 2015 djm34
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files (the
+* "Software"), to deal in the Software without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Software, and to
+* permit persons to whom the Software is furnished to do so, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*
+* ===========================(LICENSE END)=============================
+*
+* @author djm34
+*/
+#if !defined(cl_khr_byte_addressable_store)
+#error "Device does not support unaligned stores"
+#endif
+
+
+#define ROL32(x, n) rotate(x, (uint) n)
+#define SWAP32(a) (as_uint(as_uchar4(a).wzyx))
+#define SWAP64(x) as_ulong(as_uchar8(x).s32107654) /* byte-swap each 32-bit half; the halves keep their positions */
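+/*
+ * Editorial illustration (not part of the original patch): the two swap
+ * macros above are easy to misread.  In plain C, swap32() models SWAP32
+ * (full byte reversal of a 32-bit word) and swap64_halves() models SWAP64
+ * with the .s32107654 shuffle (each 32-bit half is byte-swapped in place;
+ * the halves are not exchanged):
+ *
+ *	static uint32_t swap32(uint32_t x)
+ *	{
+ *		return (x >> 24) | ((x >> 8) & 0x0000ff00) |
+ *		       ((x << 8) & 0x00ff0000) | (x << 24);
+ *	}
+ *
+ *	static uint64_t swap64_halves(uint64_t x)
+ *	{
+ *		return ((uint64_t)swap32((uint32_t)(x >> 32)) << 32) |
+ *		       swap32((uint32_t)x);
+ *	}
+ */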
+ +#define SHR(x, n) ((x) >> n) + +#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3)) +#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10)) + +#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10)) +#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7)) + +#define P(a,b,c,d,e,f,g,h,x,K) \ +{ \ + temp1 = h + S3(e) + F1(e,f,g) + (K + x); \ + d += temp1; h = temp1 + S2(a) + F0(a,b,c); \ +} + +#define F0(y, x, z) bitselect(z, y, z ^ x) +#define F1(x, y, z) bitselect(z, y, x) + +#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) +#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) +#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) +#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) +#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) +#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) +#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6) +#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) +#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) +#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) +#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) +#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) +#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) +#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) +#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) +#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) + +#define RD14 (S1(W12) + W7 + S0(W15) + W14) +#define RD15 (S1(W13) + W8 + S0(W0) + W15) + +/// generic sha transform +inline uint8 sha256_Transform(uint16 data, uint8 state) +{ + uint temp1; + uint8 res = state; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + +#define v0 res.s0 +#define v1 res.s1 +#define v2 res.s2 +#define v3 res.s3 +#define v4 res.s4 +#define v5 res.s5 +#define v6 res.s6 +#define v7 res.s7 + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 
0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef v4 +#undef v5 +#undef v6 +#undef v7 + return (res + state); +} + + + +static __constant uint8 H256 = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, + 0xA54FF53A, 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + + +static __constant uint8 pad_data = +{ + 0x00000000, 0x00000000, 0x80000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000540 +}; + +static __constant uint8 pad_state = +{ + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output,const ulong target, uint8 midstate ) +{ + + + uint nonce = get_global_id(0); + uint16 in; + uint8 state1; + + in.lo = ((__global const uint8 *)input)[4]; + in.hi = pad_data; + in.hi.s0 = ((__global const uint *)input)[40]; + in.hi.s1 = ((__global const uint *)input)[41]; + in.s3 = nonce; + state1 = sha256_Transform(in, midstate); + in.lo = state1; + in.hi = pad_state; + state1 = sha256_Transform(in,H256); + +if (SWAP64(state1.s67) <= target) + output[atomic_inc(output + 0xFF)] = nonce; + +} + diff --git a/kernel/cubehash256.cl b/kernel/cubehash256.cl new file mode 100644 index 00000000..9bc4c654 --- /dev/null +++ b/kernel/cubehash256.cl @@ -0,0 +1,132 @@ +// cubehash256 +// djm34 2015 based on ccminer cubehash512 + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + + +#define LROT(x, bits) rotate( x,(uint) bits) + + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) + +#define SWAP(a,b) { uint u = a; a 
= b; b = u; } + +inline void rrounds(uint x[2][2][2][2][2]) +{ + int r; + int j; + int k; + int l; + int m; + +//#pragma unroll 2 + for (r = 0; r < CUBEHASH_ROUNDS; ++r) { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + } +} + + diff --git a/kernel/lyra2rev2.cl b/kernel/lyra2rev2.cl new file mode 100644 index 00000000..0fe0440d --- /dev/null +++ b/kernel/lyra2rev2.cl @@ -0,0 +1,525 @@ +/* + * Lyra2RE kernel implementation. 
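+ * (Editorial note: despite the header's name, this file implements the
+ * Lyra2REv2 chain; the kernels search..search6 below run blake256,
+ * keccak256, cubehash256, Lyra2v2, skein256, cubehash256 and bmw256 in
+ * sequence.)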
+ * + * ==========================(LICENSE BEGIN)============================ + * Copyright (c) 2014 djm34 + * Copyright (c) 2014 James Lovejoy + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author djm34 + */ +// typedef unsigned int uint; +#pragma OPENCL EXTENSION cl_amd_printf : enable + +#ifndef LYRA2RE_CL +#define LYRA2RE_CL + +#if __ENDIAN_LITTLE__ +#define SPH_LITTLE_ENDIAN 1 +#else +#define SPH_BIG_ENDIAN 1 +#endif + +#define SPH_UPTR sph_u64 + +typedef unsigned int sph_u32; +typedef int sph_s32; +#ifndef __OPENCL_VERSION__ +typedef unsigned long sph_u64; +typedef long sph_s64; +#else +typedef unsigned long sph_u64; +typedef long sph_s64; +#endif + + +#define SPH_64 1 +#define SPH_64_TRUE 1 + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) + +//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6 +#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n)) +#define SPH_ROTL64(x,n) rotate(x,(ulong)n) +#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n)) +static inline sph_u64 ror64(sph_u64 vw, unsigned a) { + uint2 result; + uint2 v = as_uint2(vw); + unsigned n = (unsigned)(64 - a); + if (n == 32) { return as_ulong((uint2)(v.y, v.x)); } + if (n < 32) { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return as_ulong(result); +} + +//#define SPH_ROTR64(l,n) ror64(l,n) +#define memshift 3 +#include "blake256.cl" +#include "lyra2v2.cl" +#include "keccak1600.cl" +#include "skein256.cl" +#include "cubehash.cl" +#include "bmw256.cl" + +#define SWAP4(x) as_uint(as_uchar4(x).wzyx) +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) +//#define SWAP8(x) as_ulong(as_uchar8(x).s32107654) +#if SPH_BIG_ENDIAN + #define DEC64E(x) (x) + #define DEC64BE(x) (*(const __global sph_u64 *) (x)); + #define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC32LE(x) (*(const __global sph_u32 *) (x)); +#else + #define 
DEC64E(x) SWAP8(x)
 	#define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x));
 	#define DEC64LE(x) (*(const __global sph_u64 *) (x));
 	#define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x));
+#endif
+
+typedef union {
+	unsigned char h1[32];
+	uint h4[8];
+	ulong h8[4];
+} hash_t;
+
+// blake256: second compression of the 80-byte block header; h0..h7 is the
+// midstate precalculated on the host from the first 64 bytes
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(
+	__global uchar* hashes,
+	// precalc hash from first part of message
+	const uint h0,
+	const uint h1,
+	const uint h2,
+	const uint h3,
+	const uint h4,
+	const uint h5,
+	const uint h6,
+	const uint h7,
+	// last 12 bytes of original message
+	const uint in16,
+	const uint in17,
+	const uint in18
+)
+{
+	uint gid = get_global_id(0);
+	__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong) * (get_global_id(0) % MAX_GLOBAL_THREADS)));
+
+//	__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+
+	unsigned int h[8];
+	unsigned int m[16];
+	unsigned int v[16];
+
+	h[0] = h0;
+	h[1] = h1;
+	h[2] = h2;
+	h[3] = h3;
+	h[4] = h4;
+	h[5] = h5;
+	h[6] = h6;
+	h[7] = h7;
+
+	// compress 2nd round
+	m[0] = in16;
+	m[1] = in17;
+	m[2] = in18;
+	m[3] = SWAP4(gid);
+
+	for (int i = 4; i < 16; i++) { m[i] = c_Padding[i]; }
+
+	for (int i = 0; i < 8; i++) { v[i] = h[i]; }
+
+	v[8] = c_u256[0];
+	v[9] = c_u256[1];
+	v[10] = c_u256[2];
+	v[11] = c_u256[3];
+	v[12] = c_u256[4] ^ 640;
+	v[13] = c_u256[5] ^ 640;
+	v[14] = c_u256[6];
+	v[15] = c_u256[7];
+
+	for (int r = 0; r < 14; r++) {
+		GS(0, 4, 0x8, 0xC, 0x0);
+		GS(1, 5, 0x9, 0xD, 0x2);
+		GS(2, 6, 0xA, 0xE, 0x4);
+		GS(3, 7, 0xB, 0xF, 0x6);
+		GS(0, 5, 0xA, 0xF, 0x8);
+		GS(1, 6, 0xB, 0xC, 0xA);
+		GS(2, 7, 0x8, 0xD, 0xC);
+		GS(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	for (int i = 0; i < 16; i++) {
+		int j = i & 7;
+		h[j] ^= v[i];
+	}
+
+	for (int i = 0; i < 8; i++) { hash->h4[i] = SWAP4(h[i]); }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// keccak256
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search1(__global uchar* hashes)
+{
+	uint gid = get_global_id(0);
+//	__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+	__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong) * (get_global_id(0) % MAX_GLOBAL_THREADS)));
+
+	sph_u64 keccak_gpu_state[25];
+
+	for (int i = 0; i < 25; i++) {
+		if (i < 4) { keccak_gpu_state[i] = hash->h8[i]; }
+		else { keccak_gpu_state[i] = 0; }
+	}
+	keccak_gpu_state[4] = 0x0000000000000001;
+	keccak_gpu_state[16] = 0x8000000000000000;
+
+	keccak_block(keccak_gpu_state);
+	for (int i = 0; i < 4; i++) { hash->h8[i] = keccak_gpu_state[i]; }
+	barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// cubehash256
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search2(__global uchar* hashes)
+{
+	uint gid = get_global_id(0);
+	__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong) * (get_global_id(0) % MAX_GLOBAL_THREADS)));
+
+	sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71;
+	sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63;
+	sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696;
+	sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C;
+	sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00;
+	sph_u32 xf = 0x40E5FBAB;
+
+	sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5;
+	sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549;
+	sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285;
+	sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD;
+	sph_u32 xr =
0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6; + sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A; + + x0 ^= (hash->h4[0]); + x1 ^= (hash->h4[1]); + x2 ^= (hash->h4[2]); + x3 ^= (hash->h4[3]); + x4 ^= (hash->h4[4]); + x5 ^= (hash->h4[5]); + x6 ^= (hash->h4[6]); + x7 ^= (hash->h4[7]); + + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + SIXTEEN_ROUNDS; + xv ^= 0x01; + for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS; + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + + + barrier(CLK_GLOBAL_MEM_FENCE); + +} + + +/// lyra2 algo + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar* hashes,__global uchar* matrix ) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong4 *DMatrix = (__global ulong4 *)(matrix + (4 * memshift * 4 * 4 * 8 * (get_global_id(0) % MAX_GLOBAL_THREADS))); + +// uint offset = (4 * memshift * 4 * 4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))/32; + ulong4 state[4]; + + state[0].x = hash->h8[0]; //password + state[0].y = hash->h8[1]; //password + state[0].z = hash->h8[2]; //password + state[0].w = hash->h8[3]; //password + state[1] = state[0]; + state[2] = (ulong4)(0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL, 0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL); + state[3] = (ulong4)(0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL, 0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL); + for (int i = 0; i<12; i++) { round_lyra(state); } + + state[0] ^= (ulong4)(0x20,0x20,0x20,0x01); + state[1] ^= (ulong4)(0x04,0x04,0x80,0x0100000000000000); + + for (int i = 0; i<12; i++) { round_lyra(state); } + + + uint ps1 = (memshift * 3); +//#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 - memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix)[j+s1] = state[j]; + + round_lyra(state); + } + + reduceDuplexf(state,DMatrix); + + reduceDuplexRowSetupf(1, 0, 2,state, DMatrix); + reduceDuplexRowSetupf(2, 1, 3, state,DMatrix); + + + uint rowa; + uint prev = 3; + for (uint i = 0; i<4; i++) { + rowa = state[0].x & 3; + reduceDuplexRowf(prev, rowa, i, state, DMatrix); + prev = i; + } + + + + uint shift = (memshift * 4 * rowa); + + for (int j = 0; j < 3; j++) + state[j] ^= (DMatrix)[j+shift]; + + for (int i = 0; i < 12; i++) + round_lyra(state); +////////////////////////////////////// + + + for (int i = 0; i<4; i++) {hash->h8[i] = ((ulong*)state)[i];} +barrier(CLK_LOCAL_MEM_FENCE); + + + +} + +//skein256 + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global uchar* hashes) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + sph_u64 h[9]; + sph_u64 t[3]; + sph_u64 dt0,dt1,dt2,dt3; + sph_u64 p0, p1, p2, p3, p4, p5, p6, p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { + h[i] = SKEIN_IV512_256[i]; + h[8] ^= h[i];} + + t[0]=t12[0]; + t[1]=t12[1]; + t[2]=t12[2]; + + dt0=hash->h8[0]; + dt1=hash->h8[1]; + dt2=hash->h8[2]; + dt3=hash->h8[3]; + + p0 = h[0] + dt0; + p1 = h[1] + dt1; + p2 = h[2] + dt2; + p3 = h[3] + dt3; + p4 = h[4]; + p5 = h[5] + t[0]; + p6 = h[6] + t[1]; + p7 = h[7]; + + #pragma unroll + for (int i = 1; i<19; i+=2) 
{Round_8_512(p0,p1,p2,p3,p4,p5,p6,p7,i);} + p0 ^= dt0; + p1 ^= dt1; + p2 ^= dt2; + p3 ^= dt3; + + h[0] = p0; + h[1] = p1; + h[2] = p2; + h[3] = p3; + h[4] = p4; + h[5] = p5; + h[6] = p6; + h[7] = p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { h[8] ^= h[i]; } + + t[0] = t12[3]; + t[1] = t12[4]; + t[2] = t12[5]; + p5 += t[0]; //p5 already equal h[5] + p6 += t[1]; + + #pragma unroll + for (int i = 1; i<19; i+=2) { Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i); } + + hash->h8[0] = p0; + hash->h8[1] = p1; + hash->h8[2] = p2; + hash->h8[3] = p3; + barrier(CLK_LOCAL_MEM_FENCE); + +} + +//cubehash + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search5(__global uchar* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71; + sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63; + sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696; + sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C; + sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00; + sph_u32 xf = 0x40E5FBAB; + + sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5; + sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549; + sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285; + sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD; + sph_u32 xr = 0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6; + sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A; + + x0 ^= (hash->h4[0]); + x1 ^= (hash->h4[1]); + x2 ^= (hash->h4[2]); + x3 ^= (hash->h4[3]); + x4 ^= (hash->h4[4]); + x5 ^= (hash->h4[5]); + x6 ^= (hash->h4[6]); + x7 ^= (hash->h4[7]); + + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + SIXTEEN_ROUNDS; + xv ^= 0x01; + for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS; + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + + + barrier(CLK_GLOBAL_MEM_FENCE); + +} + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search6(__global uchar* hashes, __global uint* output, const ulong target) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + uint dh[16] = { + 0x40414243, 0x44454647, + 0x48494A4B, 0x4C4D4E4F, + 0x50515253, 0x54555657, + 0x58595A5B, 0x5C5D5E5F, + 0x60616263, 0x64656667, + 0x68696A6B, 0x6C6D6E6F, + 0x70717273, 0x74757677, + 0x78797A7B, 0x7C7D7E7F + }; + uint final_s[16] = { + 0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, + 0xaaaaaaa3, 0xaaaaaaa4, 0xaaaaaaa5, + 0xaaaaaaa6, 0xaaaaaaa7, 0xaaaaaaa8, + 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab, + 0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, + 0xaaaaaaaf + }; + + uint message[16]; + for (int i = 0; i<8; i++) message[i] = hash->h4[i]; + for (int i = 9; i<14; i++) message[i] = 0; + message[8]= 0x80; + message[14]=0x100; + message[15]=0; + + Compression256(message, dh); + Compression256(dh, final_s); + barrier(CLK_LOCAL_MEM_FENCE); + + + bool result = ( ((ulong*)final_s)[7] <= target); + if (result) { + output[atomic_inc(output + 0xFF)] = SWAP4(gid); + } + +} + + +#endif // LYRA2RE_CL \ No newline at end of file diff --git a/kernel/lyra2v2.cl b/kernel/lyra2v2.cl new file mode 100644 index 00000000..be0f1f28 --- /dev/null +++ 
b/kernel/lyra2v2.cl @@ -0,0 +1,184 @@ +/* +* Lyra2 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + + + +#define ROTL64(x,n) rotate(x,(ulong)n) +#define ROTR64(x,n) rotate(x,(ulong)(64-n)) +#define SWAP32(x) as_ulong(as_uint2(x).s10) +#define SWAP24(x) as_ulong(as_uchar8(x).s34567012) +#define SWAP16(x) as_ulong(as_uchar8(x).s23456701) + +#define G(a,b,c,d) \ + do { \ + a += b; d ^= a; d = SWAP32(d); \ + c += d; b ^= c; b = ROTR64(b,24); \ + a += b; d ^= a; d = ROTR64(d,16); \ + c += d; b ^= c; b = ROTR64(b, 63); \ +\ + } while (0) + +#define G_old(a,b,c,d) \ + do { \ + a += b; d ^= a; d = ROTR64(d, 32); \ + c += d; b ^= c; b = ROTR64(b, 24); \ + a += b; d ^= a; d = ROTR64(d, 16); \ + c += d; b ^= c; b = ROTR64(b, 63); \ +\ + } while (0) + + +/*One Round of the Blake2b's compression function*/ + +#define round_lyra(s) \ + do { \ + G(s[0].x, s[1].x, s[2].x, s[3].x); \ + G(s[0].y, s[1].y, s[2].y, s[3].y); \ + G(s[0].z, s[1].z, s[2].z, s[3].z); \ + G(s[0].w, s[1].w, s[2].w, s[3].w); \ + G(s[0].x, s[1].y, s[2].z, s[3].w); \ + G(s[0].y, s[1].z, s[2].w, s[3].x); \ + G(s[0].z, s[1].w, s[2].x, s[3].y); \ + G(s[0].w, s[1].x, s[2].y, s[3].z); \ + } while(0) + + + +void reduceDuplexf(ulong4* state ,__global ulong4* DMatrix) +{ + + ulong4 state1[3]; + uint ps1 = 0; + uint ps2 = (memshift * 3 + memshift * 4); +//#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 - i*memshift; + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state[j] ^= state1[j]; + round_lyra(state); + for (int j = 0; j < 3; j++) state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) (DMatrix)[j + s2] = state1[j]; + } + +} + + + +void reduceDuplexRowf(uint rowIn,uint rowInOut,uint rowOut,ulong4 * state, __global ulong4 * DMatrix) +{ + +ulong4 state1[3], state2[3]; +uint ps1 = (memshift * 4 * rowIn); +uint ps2 = (memshift * 4 * rowInOut); +uint ps3 = (memshift * 4 * rowOut); + + + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 + i*memshift; + uint s3 = ps3 + i*memshift; + + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2]; + + for (int j = 0; j < 3; j++) state1[j] += state2[j]; + + for (int j = 0; j < 3; 
j++) state[j] ^= state1[j]; + + + round_lyra(state); + + ((ulong*)state2)[0] ^= ((ulong*)state)[11]; + for (int j = 0; j < 11; j++) + ((ulong*)state2)[j + 1] ^= ((ulong*)state)[j]; + + if (rowInOut != rowOut) { + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s3] ^= state[j]; + } + else { + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + } + + } + } + + + + +void reduceDuplexRowSetupf(uint rowIn, uint rowInOut, uint rowOut, ulong4 *state, __global ulong4* DMatrix) { + + ulong4 state2[3], state1[3]; + uint ps1 = (memshift * 4 * rowIn); + uint ps2 = (memshift * 4 * rowInOut); + uint ps3 = (memshift * 3 + memshift * 4 * rowOut); + + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 + i*memshift; + uint s3 = ps3 - i*memshift; + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2]; + for (int j = 0; j < 3; j++) { + ulong4 tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + round_lyra(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix)[j + s3] = state1[j]; + } + + ((ulong*)state2)[0] ^= ((ulong*)state)[11]; + for (int j = 0; j < 11; j++) + ((ulong*)state2)[j + 1] ^= ((ulong*)state)[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + } + } + diff --git a/kernel/yescrypt-multi.cl b/kernel/yescrypt-multi.cl new file mode 100644 index 00000000..3af7b28a --- /dev/null +++ b/kernel/yescrypt-multi.cl @@ -0,0 +1,314 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + +#include "yescrypt_essential.cl" + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, __global uchar* buff1, __global uchar* buff2, __global uchar* buff3, const uint target) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + __global uint8 *sha256tokeep = (__global uint8 *)(buff3 + (8 * sizeof(uint)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + + uint nonce = (get_global_id(0)); + uint data[20]; + uint16 in; + uint8 state1, state2; +// uint8 sha256tokeep; + +// ulong16 Bdev[8]; // will require an additional buffer + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; +// for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); } + // if (nonce == 10) { printf("data %08x %08x\n", data[0], data[1]); } + uint8 passwd = sha256_80(data, nonce); + //pbkdf + in.lo = pad1.lo ^ passwd; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ passwd; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); +#pragma unroll 1 + for (int i = 0; i<8; i++) + { + uint16 result; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 1; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + if (i == 0) sha256tokeep[0] = result.lo; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 2; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + Bdev[i].lo = as_ulong8(shuffle(result)); +// Bdev[i].lo = as_ulong8(result); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 3; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 4; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + + + Bdev[i].hi = as_ulong8(shuffle(result)); +// Bdev[i].hi = as_ulong8(result); + } + + //mixing1 + + prevstate[0] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + prevstate[1] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + + uint n = 1; +#pragma unroll 1 + for (uint i = 2; i < 64; i++) + { + + prevstate[i] = Bdev[0]; + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[0].hi.s0).x & (n - 1); + + j += i - n; + Bdev[0] ^= prevstate[j]; + + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + } + + +} + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global uchar *buffer1, __global uchar *buffer2) +{ +} + + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) 
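+/* + * Rough map of this split yescrypt pipeline, as read from the kernels in this file (a sketch, not an authoritative spec): search() expands the 80-byte header through SHA-256/PBKDF2 and runs the 64-iteration warm-up of prevstate; search1() is left empty; search2() below runs the 2048-iteration sequential-write loop over hashbuffer; search3() runs the 684 read/write mixing iterations; search4() finishes the PBKDF2/HMAC and tests the target. + * The power-of-two index arithmetic in the mixing loops matches yescrypt's Wrap(x, i) = (x mod p2floor(i)) + (i - p2floor(i)); an illustrative scalar form (hypothetical helper, not used by these kernels): + * + * uint wrap(uint x, uint i, uint n) + * { // caller keeps n = p2floor(i) via: if ((i & (i - 1)) == 0) n <<= 1; + * return (x & (n - 1)) + (i - n); + * } + */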
+__kernel void search2(__global uchar *padcache, __global uchar *buff1, __global uchar *buff2) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16* prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + for (int i = 0; i<8; i++) + hashbuffer[i] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + + + for (int i = 0; i<8; i++) + hashbuffer[i + 8] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + int n = 1; +#pragma unroll 1 + for (int i = 2; i < 2048; i ++) + { + + for (int k = 0; k<8; k++) + (hashbuffer + 8 * i)[k] = Bdev[k]; + + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[7].hi.s0).x & (n - 1); + j += i - n; + + for (int k = 0; k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + } +} + +/* +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar *buffer1, __global uchar *buffer2) +{ +} +*/ + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar *padcache, __global uchar *buff1, __global uchar *buff2) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16* prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + +#pragma unroll 1 + for (int z = 0; z < 684; z++) + { + + uint j = as_uint2(Bdev[7].hi.s0).x & 2047; + + + for (int k = 0; k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + if (z<682) + for (int k = 0; k<8; k++) + (hashbuffer + 8 * j)[k] = Bdev[k]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); +//// + } + +} + +/* +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search5(__global uchar *buffer1, __global uchar *buffer2) +{ +} +*/ + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global const uchar* restrict input, __global uint* restrict output, __global uchar *buff2,__global uchar* buff3, const uint target) +{ + + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global uint8 *sha256tokeep = (__global uint8 *)(buff3 + (8 * sizeof(uint)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + + uint nonce = (get_global_id(0)); + + + uint data[20]; + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; +// for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); } + uint8 swpass = swapvec(sha256tokeep[0]); + uint16 in; + uint8 state1,state2; + in.lo = pad1.lo ^ swpass; + in.hi = pad1.hi; + + + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ swpass; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + +#pragma unroll 1 + for (int i = 0; i<8; i++) { + in = unshuffle(Bdev[i].lo); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + in = unshuffle(Bdev[i].hi); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + } + in = pad5; + state1 = sha256_Transform(in, state1); + in.lo = state1; + 
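+ /* pad4 holds the second half of the final SHA-256 block here: a 0x80 terminator word plus a 768-bit (0x300) length field, i.e. padding for the 64-byte opad block followed by the 32-byte inner digest of the HMAC outer hash */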
in.hi = pad4; + uint8 res = sha256_Transform(in, state2); + + //hmac and final sha + + in.lo = pad1.lo ^ res; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + in.lo = pad2.lo ^ res; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); + in = padsha80; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = get_global_id(0); + in.sf = 0x480; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + state1 = sha256_Transform(in, state2); + // state2 = H256; + in.lo = state1; + in.hi = pad4; + in.sf = 0x100; + res = sha256_Transform(in, H256); + + + if (SWAP32(res.s7) <= (target)) + output[atomic_inc(output + 0xFF)] = (nonce); + +} diff --git a/kernel/yescrypt.cl b/kernel/yescrypt.cl new file mode 100644 index 00000000..0a94ebca --- /dev/null +++ b/kernel/yescrypt.cl @@ -0,0 +1,253 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + +#include "yescrypt_essential.cl" + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, __global uchar* buff1, __global uchar* buff2, const uint target) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + + uint nonce = (get_global_id(0)); + uint data[20]; + uint16 in; + uint8 state1, state2; + uint8 sha256tokeep; + + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; + for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); } + // if (nonce == 10) { printf("data %08x %08x\n", data[0], data[1]); } + uint8 passwd = sha256_80(data, nonce); + //pbkdf + in.lo = pad1.lo ^ passwd; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ passwd; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); +#pragma unroll 1 + for (int i = 0; i<8; i++) + { + uint16 result; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 1; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + if (i == 0) sha256tokeep = result.lo; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 2; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + Bdev[i].lo = as_ulong8(shuffle(result)); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 3; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 4; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + + + Bdev[i].hi = as_ulong8(shuffle(result)); + } + + //mixing1 + + prevstate[0] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + prevstate[1] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + + uint n = 1; +#pragma unroll 1 + for (uint i = 2; i < 64; i++) + { + + prevstate[i] = Bdev[0]; + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[0].hi.s0).x & (n - 1); + + j += i - n; + Bdev[0] ^= prevstate[j]; + + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + } + + + for (int i = 0; i<8; i++) + hashbuffer[i] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + + + for (int i = 0; i<8; i++) + hashbuffer[i + 8] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + n = 1; +#pragma unroll 1 + for (int i = 2; i < 2048; i++) + { + + for (int k = 0; k<8; k++) + (hashbuffer + 8 * i)[k] = Bdev[k]; + + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[7].hi.s0).x & (n - 1); + j += i - n; + + for (int k = 0; 
k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + } + + +#pragma unroll 1 + for (int z = 0; z < 684; z++) + { + + uint j = as_uint2(Bdev[7].hi.s0).x & 2047; + + + for (int k = 0; k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + if (z<682) + for (int k = 0; k<8; k++) + (hashbuffer + 8 * j)[k] = Bdev[k]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + //// + } + + + + uint8 swpass = swapvec(sha256tokeep); +// uint16 in; +// uint8 state1, state2; + in.lo = pad1.lo ^ swpass; + in.hi = pad1.hi; + + + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ swpass; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + +#pragma unroll 1 + for (int i = 0; i<8; i++) { + in = unshuffle(Bdev[i].lo); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + in = unshuffle(Bdev[i].hi); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + } + in = pad5; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + uint8 res = sha256_Transform(in, state2); + + //hmac and final sha + + in.lo = pad1.lo ^ res; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + in.lo = pad2.lo ^ res; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); + in = padsha80; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = get_global_id(0); + in.sf = 0x480; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + state1 = sha256_Transform(in, state2); + // state2 = H256; + in.lo = state1; + in.hi = pad4; + in.sf = 0x100; + res = sha256_Transform(in, H256); + + + if (SWAP32(res.s7) <= (target)) + output[atomic_inc(output + 0xFF)] = (nonce); + +} + diff --git a/kernel/yescrypt_essential.cl b/kernel/yescrypt_essential.cl new file mode 100644 index 00000000..ba1816a8 --- /dev/null +++ b/kernel/yescrypt_essential.cl @@ -0,0 +1,760 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + +#define ROL32(x, n) rotate(x, (uint) n) +#define SWAP32(a) (as_uint(as_uchar4(a).wzyx)) +//#define ROL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#define HASH_MEMORY 4096 + + +#define SALSA(a,b,c,d) do { \ + t =a+d; b^=ROL32(t, 7U); \ + t =b+a; c^=ROL32(t, 9U); \ + t =c+b; d^=ROL32(t, 13U); \ + t =d+c; a^=ROL32(t, 18U); \ +} while(0) + + +#define SALSA_CORE(state) do { \ +\ +SALSA(state.s0,state.s4,state.s8,state.sc); \ +SALSA(state.s5,state.s9,state.sd,state.s1); \ +SALSA(state.sa,state.se,state.s2,state.s6); \ +SALSA(state.sf,state.s3,state.s7,state.sb); \ +SALSA(state.s0,state.s1,state.s2,state.s3); \ +SALSA(state.s5,state.s6,state.s7,state.s4); \ +SALSA(state.sa,state.sb,state.s8,state.s9); \ +SALSA(state.sf,state.sc,state.sd,state.se); \ + } while(0) + +#define uSALSA_CORE(state) do { \ +\ +SALSA(state.s0,state.s4,state.s8,state.sc); \ +SALSA(state.s1,state.s5,state.s9,state.sd); \ +SALSA(state.s2,state.s6,state.sa,state.se); \ +SALSA(state.s3,state.s7,state.sb,state.sf); \ +SALSA(state.s0,state.sd,state.sa,state.s7); \ +SALSA(state.s1,state.se,state.sb,state.s4); \ +SALSA(state.s2,state.sf,state.s8,state.s5); \ +SALSA(state.s3,state.sc,state.s9,state.s6); \ +} while(0) + + +#define unshuffle(state) (as_uint16(state).s0da741eb852fc963) + +#define shuffle(state) (as_uint16(state).s05af49e38d27c16b) + +static __constant uint16 pad1 = +{ + 0x36363636, 0x36363636, 0x36363636, 0x36363636, + 0x36363636, 0x36363636, 0x36363636, 0x36363636, + 0x36363636, 0x36363636, 0x36363636, 0x36363636, + 0x36363636, 0x36363636, 0x36363636, 0x36363636 +}; + +static __constant uint16 pad2 = +{ + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c +}; + +static __constant uint16 pad5 = +{ + 0x00000001, 0x80000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00002220 +}; + +static __constant uint16 pad3 = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x80000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x000004a0 +}; + +static __constant uint16 padsha80 = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000280 +}; + +static __constant uint8 pad4 = +{ + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000300 +}; + + + +static __constant uint8 H256 = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, + 0xA54FF53A, 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + +inline uint8 swapvec(uint8 buf) +{ + uint8 vec; + vec.s0 = SWAP32(buf.s0); + vec.s1 = SWAP32(buf.s1); + vec.s2 = SWAP32(buf.s2); + vec.s3 = SWAP32(buf.s3); + vec.s4 = SWAP32(buf.s4); + vec.s5 = SWAP32(buf.s5); + vec.s6 = SWAP32(buf.s6); + vec.s7 = SWAP32(buf.s7); + return vec; +} + + + +inline uint16 swapvec16(uint16 buf) +{ + uint16 vec; + vec.s0 = SWAP32(buf.s0); + vec.s1 = SWAP32(buf.s1); + vec.s2 = SWAP32(buf.s2); + vec.s3 = SWAP32(buf.s3); + vec.s4 = SWAP32(buf.s4); + vec.s5 = SWAP32(buf.s5); + vec.s6 = SWAP32(buf.s6); + vec.s7 = SWAP32(buf.s7); + vec.s8 = SWAP32(buf.s8); + vec.s9 = SWAP32(buf.s9); + vec.sa = 
SWAP32(buf.sa); + vec.sb = SWAP32(buf.sb); + vec.sc = SWAP32(buf.sc); + vec.sd = SWAP32(buf.sd); + vec.se = SWAP32(buf.se); + vec.sf = SWAP32(buf.sf); + return vec; +} + + ulong8 salsa20_8(uint16 Bx) +{ +uint t; + uint16 st = Bx; + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + return(as_ulong8(st + Bx)); +} + + ulong8 salsa20_8n(uint16 Bx) + { + uint t; + uint16 st = Bx; + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + return(as_ulong8(st + Bx)); + } + + + ulong16 blockmix_salsa8_small2(ulong16 Bin) +{ + ulong8 X = Bin.hi; + X ^= Bin.lo; + X = salsa20_8(as_uint16(X)); + Bin.lo = X; + X ^= Bin.hi; + X = salsa20_8(as_uint16(X)); + Bin.hi = X; + return(Bin); +} +/* + uint16 salsa20_8_2(uint16 Bx) + { + uint t; + uint16 st = Bx; + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + return(st + Bx); + } + + ulong16 blockmix_salsa8_small2(ulong16 Bin) + { + uint16 X = as_uint16(Bin.hi); + X ^= as_uint16(Bin.lo); + X = salsa20_8_2(as_uint16(X)); + Bin.lo = as_ulong8(X); + X ^= as_uint16(Bin.hi); + X = salsa20_8_2(as_uint16(X)); + Bin.hi = as_ulong8(X); + return(Bin); + } +*/ + + +inline ulong2 madd4long2(uint4 a, uint4 b) +{ + uint4 result; + result.x = a.x*a.y + b.x; + result.y = b.y + mad_hi(a.x, a.y, b.x); + result.z = a.z*a.w + b.z; + result.w = b.w + mad_hi(a.z, a.w, b.z); + return as_ulong2(result); +} + +inline ulong2 madd4long3(uint4 a, ulong2 b) +{ + ulong2 result; + result.x = (ulong)a.x*(ulong)a.y + b.x; + result.y = (ulong)a.z*(ulong)a.w + b.y; + return result; +} + + +inline ulong8 block_pwxform_long_old(ulong8 Bout, __global ulong16 *prevstate) +{ + + ulong2 vec = Bout.lo.lo; + + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate ))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + + vec ^= p1; + } + Bout.lo.lo = vec; + vec = Bout.lo.hi; + for (int i = 0; i < 6; i++) + { + + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + + vec ^= p1; + } + Bout.lo.hi = vec; + + vec = Bout.hi.lo; + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + vec ^= p1; + } + Bout.hi.lo = vec; + vec = Bout.hi.hi; + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + + vec ^= p1; + } + Bout.hi.hi = vec; + + return(Bout); +} + +inline ulong8 block_pwxform_long(ulong8 Bout, __global ulong2 *prevstate) +{ + + ulong2 vec = Bout.lo.lo; + + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32*8)[x.y]; + + vec ^= p1; + } + Bout.lo.lo = vec; + vec = Bout.lo.hi; + for (int i = 0; i < 6; i++) + { + + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32 * 8)[x.y]; + + vec ^= p1; + } + Bout.lo.hi = vec; + + vec = Bout.hi.lo; + for (int i = 
0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32 * 8)[x.y]; + vec ^= p1; + } + Bout.hi.lo = vec; + vec = Bout.hi.hi; + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32 * 8)[x.y]; + + vec ^= p1; + } + Bout.hi.hi = vec; + + return(Bout); +} + + + + +inline void blockmix_pwxform(__global ulong8 *Bin, __global ulong16 *prevstate) +{ + Bin[0] ^= Bin[15]; + Bin[0] = block_pwxform_long_old(Bin[0], prevstate); +#pragma unroll 1 + for (int i = 1; i < 16; i++) + { + Bin[i] ^= Bin[i - 1]; + Bin[i] = block_pwxform_long_old(Bin[i], prevstate); + } + Bin[15] = salsa20_8(as_uint16(Bin[15])); +} + +#define SHR(x, n) ((x) >> n) + + +#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3)) +#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10)) + +#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10)) +#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7)) + +#define P(a,b,c,d,e,f,g,h,x,K) \ +{ \ + temp1 = h + S3(e) + F1(e,f,g) + (K + x); \ + d += temp1; h = temp1 + S2(a) + F0(a,b,c); \ +} + +#define PLAST(a,b,c,d,e,f,g,h,x,K) \ +{ \ + d += h + S3(e) + F1(e,f,g) + (x + K); \ +} + +#define F0(y, x, z) bitselect(z, y, z ^ x) +#define F1(x, y, z) bitselect(z, y, x) + +#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) +#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) +#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) +#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) +#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) +#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) +#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6) +#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) +#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) +#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) +#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) +#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) +#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) +#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) +#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) +#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) + +#define RD14 (S1(W12) + W7 + S0(W15) + W14) +#define RD15 (S1(W13) + W8 + S0(W0) + W15) + +/// generic sha transform +inline uint8 sha256_Transform(uint16 data, uint8 state) +{ +uint temp1; + uint8 res = state; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + +#define v0 res.s0 +#define v1 res.s1 +#define v2 res.s2 +#define v3 res.s3 +#define v4 res.s4 +#define v5 res.s5 +#define v6 res.s6 +#define v7 res.s7 + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + 
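+ /* the remaining rounds repeat this scheme: each P() invocation shifts the v0..v7 argument list by one position, unrolling the SHA-256 state rotation without moving data between variables */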
P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef v4 +#undef v5 +#undef v6 +#undef v7 + return (res+state); +} + + +static inline uint8 sha256_round1(uint16 data) +{ + uint temp1; + uint8 res; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + + uint v0 = 0x6A09E667; + uint v1 = 0xBB67AE85; + uint v2 = 0x3C6EF372; + uint v3 = 0xA54FF53A; + uint v4 = 0x510E527F; + uint v5 = 0x9B05688C; + uint v6 = 0x1F83D9AB; + uint v7 = 0x5BE0CD19; + + P(v0, v1, v2, v3, v4, v5, v6, v7, 
W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); + + res.s0 = v0 + 0x6A09E667; + res.s1 = v1 + 0xBB67AE85; + res.s2 = v2 + 0x3C6EF372; + res.s3 = v3 + 0xA54FF53A; + res.s4 = v4 + 0x510E527F; + res.s5 = v5 + 
0x9B05688C; + res.s6 = v6 + 0x1F83D9AB; + res.s7 = v7 + 0x5BE0CD19; + return (res); +} + + +static inline uint8 sha256_round2(uint16 data,uint8 buf) +{ + uint temp1; + uint8 res; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + + uint v0 = buf.s0; + uint v1 = buf.s1; + uint v2 = buf.s2; + uint v3 = buf.s3; + uint v4 = buf.s4; + uint v5 = buf.s5; + uint v6 = buf.s6; + uint v7 = buf.s7; + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, 
R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); + + res.s0 = (v0 + buf.s0); + res.s1 = (v1 + buf.s1); + res.s2 = (v2 + buf.s2); + res.s3 = (v3 + buf.s3); + res.s4 = (v4 + buf.s4); + res.s5 = (v5 + buf.s5); + res.s6 = (v6 + buf.s6); + res.s7 = (v7 + buf.s7); + return (res); +} + +static inline uint8 sha256_80(uint* data,uint nonce) +{ + +uint8 buf = sha256_round1( ((uint16*)data)[0]); +uint16 in = padsha80; +in.s0 = data[16]; +in.s1 = data[17]; +in.s2 = data[18]; +in.s3 = nonce; + +return(sha256_round2(in,buf)); +} + diff --git a/miner.h b/miner.h index 45985779..ecaa6a13 100644 --- a/miner.h +++ b/miner.h @@ -733,6 +733,17 @@ static inline void flip128(void *dest_p, const void *src_p) dest[i] = swab32(src[i]); } +static inline void flip168(void *dest_p, const void *src_p) +{ + uint32_t *dest = (uint32_t *)dest_p; + const uint32_t *src = (uint32_t *)src_p; + int i; + + for (i = 0; i < 42; i++) + dest[i] = swab32(src[i]); +} + + /* For flipping to the correct endianness if necessary */ #if defined(__BIG_ENDIAN__) || defined(MIPSEB) static inline void endian_flip32(void *dest_p, const void *src_p) @@ -744,6 +755,11 @@ static inline void endian_flip128(void *dest_p, const void *src_p) { flip128(dest_p, src_p); } +static inline void endian_flip168(void *dest_p, const void *src_p) +{ + flip168(dest_p, src_p); +} + #else static inline void endian_flip32(void __maybe_unused *dest_p, const void __maybe_unused *src_p) @@ -754,8 +770,13 @@ static inline void endian_flip128(void __maybe_unused *dest_p, const void __maybe_unused *src_p) { } +static inline void +endian_flip168(void __maybe_unused *dest_p, const void __maybe_unused *src_p) +{ +} #endif + extern double cgpu_runtime(struct cgpu_info *cgpu); extern void _quit(int status); @@ -1146,8 +1167,8 @@ extern bool add_pool_details(struct pool *pool, bool live, char *url, char *user #define MAX_GPUDEVICES 16 #define MAX_DEVICES 4096 -#define MIN_INTENSITY 8 -#define MIN_INTENSITY_STR "8" +#define MIN_INTENSITY 4 +#define MIN_INTENSITY_STR "4" #define MAX_INTENSITY 31 #define MAX_INTENSITY_STR "31" #define MIN_XINTENSITY 1 @@ -1416,7 +1437,7 @@ struct pool { #define GETWORK_MODE_GBT 'G' struct work { - unsigned char data[128]; + unsigned char data[168]; unsigned char midstate[32]; unsigned char target[32]; unsigned char hash[32]; diff --git a/ocl.c b/ocl.c index 65e34e75..bd8a1527 100644 --- a/ocl.c +++ b/ocl.c @@ -36,6 +36,8 @@ #include "ocl/binary_kernel.h" #include "algorithm/neoscrypt.h" #include "algorithm/pluck.h" +#include "algorithm/yescrypt.h" +#include "algorithm/lyra2re.h" /* FIXME: only here for global config vars, replace with configuration.h * or similar as soon as config is in a struct instead of littered all @@ -414,8 +416,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } - /////////////////////////////////// pluck - // neoscrypt TC + // pluck TC else if (!safe_cmp(cgpu->algorithm.name, "pluck") && !cgpu->opt_tc) { size_t glob_thread_count; long max_int; @@ -497,7 +498,175 @@ _clState *initCl(unsigned 
int gpu, char *name, size_t nameSize, algorithm_t *alg cgpu->thread_concurrency = glob_thread_count; applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); + } + + // Yescrypt TC + else if ((!safe_cmp(cgpu->algorithm.name, "yescrypt") || + !safe_cmp(algorithm->name, "yescrypt-multi")) && !cgpu->opt_tc) { + size_t glob_thread_count; + long max_int; + unsigned char type = 0; + + // determine which intensity type to use + if (cgpu->rawintensity > 0) { + glob_thread_count = cgpu->rawintensity; + max_int = glob_thread_count; + type = 2; + } + else if (cgpu->xintensity > 0) { + glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? (1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity); + max_int = cgpu->xintensity; + type = 1; + } + else { + glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity); + max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity); + } + glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count); + + // if TC * scratchbuf size is too big for memory... reduce to max + if ((glob_thread_count * YESCRYPT_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) { + + /* Selected intensity will not run on this GPU. Not enough memory. + * Adapt the memory setting. */ + // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc + switch (type) { + //raw intensity + case 2: + while ((glob_thread_count * YESCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) { + --glob_thread_count; + } + + max_int = glob_thread_count; + cgpu->rawintensity = glob_thread_count; + break; + + //x intensity + case 1: + glob_thread_count = cgpu->max_alloc / YESCRYPT_SCRATCHBUF_SIZE; + max_int = glob_thread_count / clState->compute_shaders; + + while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_XINTENSITY) { + applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu); + max_int = MIN_XINTENSITY; + } + + cgpu->xintensity = max_int; + glob_thread_count = clState->compute_shaders * (1UL << max_int); + break; + + default: + glob_thread_count = cgpu->max_alloc / YESCRYPT_SCRATCHBUF_SIZE; + while (max_int && ((1UL << max_int) & glob_thread_count) == 0) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_INTENSITY) { + applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu); + max_int = MIN_INTENSITY; + } + + cgpu->intensity = max_int; + glob_thread_count = 1UL << max_int; + break; + } + } + + // TC is glob thread count + cgpu->thread_concurrency = glob_thread_count; + + applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); + } + + // Lyra2re v2 TC + else if ( !safe_cmp(cgpu->algorithm.name, "lyra2REv2") ) { + size_t glob_thread_count; + long max_int; + unsigned char type = 0; + + // determine which intensity type to use + if (cgpu->rawintensity > 0) { + glob_thread_count = cgpu->rawintensity; + max_int = glob_thread_count; + type = 2; + } + else if (cgpu->xintensity > 0) { + glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? 
(1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity); + max_int = cgpu->xintensity; + type = 1; + } + else { + glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity); + max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity); + } + + glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count); + + // if TC * scratchbuf size is too big for memory... reduce to max + if ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) { + + /* Selected intensity will not run on this GPU. Not enough memory. + * Adapt the memory setting. */ + // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc + switch (type) { + //raw intensity + case 2: + while ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) { + --glob_thread_count; + } + + max_int = glob_thread_count; + cgpu->rawintensity = glob_thread_count; + break; + + //x intensity + case 1: + glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE; + max_int = glob_thread_count / clState->compute_shaders; + + while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_XINTENSITY) { + applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu); + max_int = MIN_XINTENSITY; + } + + cgpu->xintensity = max_int; + glob_thread_count = clState->compute_shaders * (1UL << max_int); + break; + + default: + glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE; + while (max_int && ((1UL << max_int) & glob_thread_count) == 0) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_INTENSITY) { + applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu); + max_int = MIN_INTENSITY; + } + + cgpu->intensity = max_int; + glob_thread_count = 1UL << max_int; + break; + } + } + + // TC is glob thread count + cgpu->thread_concurrency = glob_thread_count; + + applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); } else if (!cgpu->opt_tc) { unsigned int sixtyfours; @@ -586,7 +755,10 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } size_t bufsize; - size_t readbufsize = 128; + size_t buf1size; + size_t buf3size; + size_t buf2size; + size_t readbufsize = (!safe_cmp(algorithm->name, "credits")) ? 168 : 128; if (algorithm->rw_buffer_size < 0) { // calc buffer size for neoscrypt @@ -612,6 +784,31 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_DEBUG, "pluck buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); // scrypt/n-scrypt } + else if (!safe_cmp(algorithm->name, "yescrypt") || !safe_cmp(algorithm->name, "yescrypt-multi")) { + /* The scratch/pad-buffer needs 32kBytes memory per thread. */ + bufsize = YESCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency; + buf1size = PLUCK_SECBUF_SIZE * cgpu->thread_concurrency; + buf2size = 128 * 8 * 8 * cgpu->thread_concurrency; + buf3size= 8 * 8 * 4 * cgpu->thread_concurrency; + /* This is the input buffer. For yescrypt this is guaranteed to be + * 80 bytes only. */ + readbufsize = 80; + + applog(LOG_DEBUG, "yescrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); + // scrypt/n-scrypt + } + else if (!safe_cmp(algorithm->name, "lyra2REv2") ) { + /* The scratch/pad-buffer needs 32kBytes memory per thread. 
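+ * (that figure is carried over from the earlier scratchpad branches; the allocation below actually reserves LYRA_SCRATCHBUF_SIZE bytes per thread)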
*/ + bufsize = LYRA_SCRATCHBUF_SIZE * cgpu->thread_concurrency; + buf1size = 4 * 8 * cgpu->thread_concurrency; // matrix + + /* This is the input buffer. For lyra2REv2 this is guaranteed to be + * 80 bytes only. */ + readbufsize = 80; + + applog(LOG_DEBUG, "lyra2REv2 buffer sizes: %lu RW, %lu RW", (unsigned long)bufsize, (unsigned long)buf1size); + // scrypt/n-scrypt + } else { size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0)); bufsize = 128 * ipt * cgpu->thread_concurrency; } @@ -624,6 +821,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } clState->padbuffer8 = NULL; + clState->buffer1 = NULL; + clState->buffer2 = NULL; + clState->buffer3 = NULL; if (bufsize > 0) { applog(LOG_DEBUG, "Creating read/write buffer sized %lu", (unsigned long)bufsize); @@ -635,6 +835,42 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize); } + if (!safe_cmp(algorithm->name, "yescrypt") || !safe_cmp(algorithm->name, "yescrypt-multi")) { + // need additional buffers + clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); + if (status != CL_SUCCESS && !clState->buffer1) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; + } + + clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf2size, NULL, &status); + if (status != CL_SUCCESS && !clState->buffer2) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status); + return NULL; + } + + clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf3size, NULL, &status); + if (status != CL_SUCCESS && !clState->buffer3) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer3), decrease TC or increase LG", status); + return NULL; + } + } + else if (!safe_cmp(algorithm->name, "lyra2REv2") ) { + // need additional buffers + clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); + if (status != CL_SUCCESS && !clState->buffer1) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; + } + } + else { + clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); // we don't need that much; placeholder sizing + if (status != CL_SUCCESS && !clState->buffer1) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; + } + } + /* This buffer is weird and might work to some degree even if * the create buffer call has apparently failed, so check if we * get anything back before we call it a failure.
*/ diff --git a/ocl.h b/ocl.h index 0950d2c0..8d6467f1 100644 --- a/ocl.h +++ b/ocl.h @@ -23,7 +23,10 @@ typedef struct __clState { cl_mem CLbuffer0; cl_mem MidstateBuf; cl_mem padbuffer8; - unsigned char cldata[80]; + cl_mem buffer1; + cl_mem buffer2; + cl_mem buffer3; + unsigned char cldata[168]; bool goffset; cl_uint vwidth; size_t max_work_size; diff --git a/sgminer.c b/sgminer.c index f9e3f096..0c3fed0f 100644 --- a/sgminer.c +++ b/sgminer.c @@ -1919,6 +1919,7 @@ static void calc_midstate(struct work *work) endian_flip32(work->midstate, work->midstate); } + static struct work *make_work(void) { struct work *w = (struct work *)calloc(1, sizeof(struct work)); @@ -2260,7 +2261,9 @@ static bool gbt_decode(struct pool *pool, json_t *res_val) static bool getwork_decode(json_t *res_val, struct work *work) { - if (unlikely(!jobj_binary(res_val, "data", work->data, sizeof(work->data), true))) { + size_t worklen = 128; + worklen = ((!safe_cmp(work->pool->algorithm.name, "credits")) ? sizeof(work->data) : worklen); + if (unlikely(!jobj_binary(res_val, "data", work->data, worklen, true))) { if (opt_morenotices) applog(LOG_ERR, "%s: JSON inval data", isnull(get_pool_name(work->pool), "")); return false; @@ -3018,10 +3021,17 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s cgpu = get_thr_cgpu(thr_id); - endian_flip128(work->data, work->data); + if (safe_cmp(work->pool->algorithm.name, "credits")) { + endian_flip128(work->data, work->data); + } else { + endian_flip168(work->data, work->data); + } /* build hex string - Make sure to restrict to 80 bytes for Neoscrypt */ - hexstr = bin2hex(work->data, ((!safe_cmp(work->pool->algorithm.name, "neoscrypt")) ? 80 : sizeof(work->data))); + int datasize = 128; + if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) datasize = 80; + else if (!safe_cmp(work->pool->algorithm.name, "credits")) datasize = 168; + hexstr = bin2hex(work->data, datasize); /* build JSON-RPC request */ if (work->gbt) { @@ -7060,7 +7070,10 @@ void inc_hw_errors(struct thr_info *thr) /* Fills in the work nonce and builds the output data in work->hash */ static void rebuild_nonce(struct work *work, uint32_t nonce) { - uint32_t *work_nonce = (uint32_t *)(work->data + 76); + uint32_t nonce_pos = 76; + if (!safe_cmp(work->pool->algorithm.name, "credits")) nonce_pos = 140; + + uint32_t *work_nonce = (uint32_t *)(work->data + nonce_pos); *work_nonce = htole32(nonce); @@ -7076,7 +7089,10 @@ bool test_nonce(struct work *work, uint32_t nonce) rebuild_nonce(work, nonce); // for Neoscrypt, the diff1targ value is in work->target - if (!safe_cmp(work->pool->algorithm.name, "neoscrypt") || !safe_cmp(work->pool->algorithm.name, "pluck")) { + if (!safe_cmp(work->pool->algorithm.name, "neoscrypt") || !safe_cmp(work->pool->algorithm.name, "pluck") + || !safe_cmp(work->pool->algorithm.name, "yescrypt") + || !safe_cmp(work->pool->algorithm.name, "yescrypt-multi") + ) { diff1targ = ((uint32_t *)work->target)[7]; } else { diff --git a/sph/Makefile.am b/sph/Makefile.am index d80e438a..bc2f4b23 100644 --- a/sph/Makefile.am +++ b/sph/Makefile.am @@ -1,3 +1,3 @@ noinst_LIBRARIES = libsph.a -libsph_a_SOURCES = bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c shabal.c whirlpool.c +libsph_a_SOURCES = bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c shabal.c whirlpool.c sha256_Y.c diff --git a/sph/sha256_Y.c 
b/sph/sha256_Y.c new file mode 100644 index 00000000..a5d786d3 --- /dev/null +++ b/sph/sha256_Y.c @@ -0,0 +1,418 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/types.h> + +#include <stdint.h> +#include <string.h> + +#include "algorithm/sysendian.h" + +#include "sph/sha256_Y.h" + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. + */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t * state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + /* 1. Prepare message schedule W. */ + be32dec_vect(W, block, 64); + + for (i = 16; i < 64; i++) + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + + /* 2.
+
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t * state, const unsigned char block[64])
+{
+    uint32_t W[64];
+    uint32_t S[8];
+    uint32_t t0, t1;
+    int i;
+
+    /* 1. Prepare message schedule W. */
+    be32dec_vect(W, block, 64);
+    for (i = 16; i < 64; i++)
+        W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+
+    /* 2. Initialize working variables. */
+    memcpy(S, state, 32);
+
+    /* 3. Mix. */
+    RNDr(S, W, 0, 0x428a2f98);
+    RNDr(S, W, 1, 0x71374491);
+    RNDr(S, W, 2, 0xb5c0fbcf);
+    RNDr(S, W, 3, 0xe9b5dba5);
+    RNDr(S, W, 4, 0x3956c25b);
+    RNDr(S, W, 5, 0x59f111f1);
+    RNDr(S, W, 6, 0x923f82a4);
+    RNDr(S, W, 7, 0xab1c5ed5);
+    RNDr(S, W, 8, 0xd807aa98);
+    RNDr(S, W, 9, 0x12835b01);
+    RNDr(S, W, 10, 0x243185be);
+    RNDr(S, W, 11, 0x550c7dc3);
+    RNDr(S, W, 12, 0x72be5d74);
+    RNDr(S, W, 13, 0x80deb1fe);
+    RNDr(S, W, 14, 0x9bdc06a7);
+    RNDr(S, W, 15, 0xc19bf174);
+    RNDr(S, W, 16, 0xe49b69c1);
+    RNDr(S, W, 17, 0xefbe4786);
+    RNDr(S, W, 18, 0x0fc19dc6);
+    RNDr(S, W, 19, 0x240ca1cc);
+    RNDr(S, W, 20, 0x2de92c6f);
+    RNDr(S, W, 21, 0x4a7484aa);
+    RNDr(S, W, 22, 0x5cb0a9dc);
+    RNDr(S, W, 23, 0x76f988da);
+    RNDr(S, W, 24, 0x983e5152);
+    RNDr(S, W, 25, 0xa831c66d);
+    RNDr(S, W, 26, 0xb00327c8);
+    RNDr(S, W, 27, 0xbf597fc7);
+    RNDr(S, W, 28, 0xc6e00bf3);
+    RNDr(S, W, 29, 0xd5a79147);
+    RNDr(S, W, 30, 0x06ca6351);
+    RNDr(S, W, 31, 0x14292967);
+    RNDr(S, W, 32, 0x27b70a85);
+    RNDr(S, W, 33, 0x2e1b2138);
+    RNDr(S, W, 34, 0x4d2c6dfc);
+    RNDr(S, W, 35, 0x53380d13);
+    RNDr(S, W, 36, 0x650a7354);
+    RNDr(S, W, 37, 0x766a0abb);
+    RNDr(S, W, 38, 0x81c2c92e);
+    RNDr(S, W, 39, 0x92722c85);
+    RNDr(S, W, 40, 0xa2bfe8a1);
+    RNDr(S, W, 41, 0xa81a664b);
+    RNDr(S, W, 42, 0xc24b8b70);
+    RNDr(S, W, 43, 0xc76c51a3);
+    RNDr(S, W, 44, 0xd192e819);
+    RNDr(S, W, 45, 0xd6990624);
+    RNDr(S, W, 46, 0xf40e3585);
+    RNDr(S, W, 47, 0x106aa070);
+    RNDr(S, W, 48, 0x19a4c116);
+    RNDr(S, W, 49, 0x1e376c08);
+    RNDr(S, W, 50, 0x2748774c);
+    RNDr(S, W, 51, 0x34b0bcb5);
+    RNDr(S, W, 52, 0x391c0cb3);
+    RNDr(S, W, 53, 0x4ed8aa4a);
+    RNDr(S, W, 54, 0x5b9cca4f);
+    RNDr(S, W, 55, 0x682e6ff3);
+    RNDr(S, W, 56, 0x748f82ee);
+    RNDr(S, W, 57, 0x78a5636f);
+    RNDr(S, W, 58, 0x84c87814);
+    RNDr(S, W, 59, 0x8cc70208);
+    RNDr(S, W, 60, 0x90befffa);
+    RNDr(S, W, 61, 0xa4506ceb);
+    RNDr(S, W, 62, 0xbef9a3f7);
+    RNDr(S, W, 63, 0xc67178f2);
+
+    /* 4. Mix local working variables into global state */
+    for (i = 0; i < 8; i++) {
+        state[i] += S[i];
+    }
+
+    /* Clean the stack. */
+    memset(W, 0, 256);
+    memset(S, 0, 32);
+    t0 = t1 = 0;
+}
+
+static unsigned char PAD[64] = {
+    0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA256_Pad(SHA256_CTX_Y * ctx)
+{
+    unsigned char len[8];
+    uint32_t r, plen;
+
+    /*
+     * Convert length to a vector of bytes -- we do this now rather
+     * than later because the length will change after we pad.
+     */
+    be32enc_vect(len, ctx->count, 8);
+
+    /* Add 1--64 bytes so that the resulting length is 56 mod 64 */
+    r = (ctx->count[1] >> 3) & 0x3f;
+    plen = (r < 56) ? (56 - r) : (120 - r);
+    SHA256_Update_Y(ctx, PAD, (size_t)plen);
+
+    /* Add the terminating bit-count */
+    SHA256_Update_Y(ctx, len, 8);
+}
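/*
 * Illustrative worked example, not part of the original patch:
 * SHA256_Pad() pads the buffered data to 56 mod 64 bytes so that the
 * 8-byte bit count lands exactly on a block boundary. With r bytes
 * buffered, plen = (r < 56) ? 56 - r : 120 - r:
 *
 *   r = 3  ("abc") -> plen = 53:  3 + 53 + 8 = 64   (one block)
 *   r = 60         -> plen = 60: 60 + 60 + 8 = 128  (two blocks)
 */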
+
+/* SHA-256 initialization. Begins a SHA-256 operation. */
+void
+SHA256_Init_Y(SHA256_CTX_Y * ctx)
+{
+    /* Zero bits processed so far */
+    ctx->count[0] = ctx->count[1] = 0;
+
+    /* Magic initialization constants */
+    ctx->state[0] = 0x6A09E667;
+    ctx->state[1] = 0xBB67AE85;
+    ctx->state[2] = 0x3C6EF372;
+    ctx->state[3] = 0xA54FF53A;
+    ctx->state[4] = 0x510E527F;
+    ctx->state[5] = 0x9B05688C;
+    ctx->state[6] = 0x1F83D9AB;
+    ctx->state[7] = 0x5BE0CD19;
+}
+
+/* Add bytes into the hash */
+void
+SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len)
+{
+    uint32_t bitlen[2];
+    uint32_t r;
+    const unsigned char *src = in;
+
+    /* Number of bytes left in the buffer from previous updates */
+    r = (ctx->count[1] >> 3) & 0x3f;
+
+    /* Convert the length into a number of bits */
+    bitlen[1] = ((uint32_t)len) << 3;
+    bitlen[0] = (uint32_t)(len >> 29);
+
+    /* Update number of bits */
+    if ((ctx->count[1] += bitlen[1]) < bitlen[1])
+        ctx->count[0]++;
+    ctx->count[0] += bitlen[0];
+
+    /* Handle the case where we don't need to perform any transforms */
+    if (len < 64 - r) {
+        memcpy(&ctx->buf[r], src, len);
+        return;
+    }
+
+    /* Finish the current block */
+    memcpy(&ctx->buf[r], src, 64 - r);
+    SHA256_Transform(ctx->state, ctx->buf);
+    src += 64 - r;
+    len -= 64 - r;
+
+    /* Perform complete blocks */
+    while (len >= 64) {
+        SHA256_Transform(ctx->state, src);
+        src += 64;
+        len -= 64;
+    }
+
+    /* Copy left over data into buffer */
+    memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-256 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx)
+{
+    /* Add padding */
+    SHA256_Pad(ctx);
+
+    /* Write the hash */
+    be32enc_vect(digest, ctx->state, 32);
+
+    /* Clear the context state */
+    memset((void *)ctx, 0, sizeof(*ctx));
+}
+
+/* Initialize an HMAC-SHA256 operation with the given key. */
+void
+HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen)
+{
+    unsigned char pad[64];
+    unsigned char khash[32];
+    const unsigned char * K = _K;
+    size_t i;
+
+    /* If Klen > 64, the key is really SHA256(K). */
+    if (Klen > 64) {
+        SHA256_Init_Y(&ctx->ictx);
+        SHA256_Update_Y(&ctx->ictx, K, Klen);
+        SHA256_Final_Y(khash, &ctx->ictx);
+        K = khash;
+        Klen = 32;
+    }
+
+    /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
+    SHA256_Init_Y(&ctx->ictx);
+    memset(pad, 0x36, 64);
+    for (i = 0; i < Klen; i++) {
+        pad[i] ^= K[i];
+    }
+    SHA256_Update_Y(&ctx->ictx, pad, 64);
+
+    /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
+    SHA256_Init_Y(&ctx->octx);
+    memset(pad, 0x5c, 64);
+    for (i = 0; i < Klen; i++) {
+        pad[i] ^= K[i];
+    }
+    SHA256_Update_Y(&ctx->octx, pad, 64);
+
+    /* Clean the stack. */
+    memset(khash, 0, 32);
+}
+
+/* Add bytes to the HMAC-SHA256 operation. */
+void
+HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len)
+{
+    /* Feed data to the inner SHA256 operation. */
+    SHA256_Update_Y(&ctx->ictx, in, len);
+}
+
+/* Finish an HMAC-SHA256 operation. */
+void
+HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx)
+{
+    unsigned char ihash[32];
+
+    /* Finish the inner SHA256 operation. */
+    SHA256_Final_Y(ihash, &ctx->ictx);
+
+    /* Feed the inner hash to the outer SHA256 operation. */
+    SHA256_Update_Y(&ctx->octx, ihash, 32);
+
+    /* Finish the outer SHA256 operation. */
+    SHA256_Final_Y(digest, &ctx->octx);
+
+    /* Clean the stack. */
+    memset(ihash, 0, 32);
+}
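/*
 * Illustrative usage sketch, not part of the original patch: the
 * intended Init/Update/Final call sequence for one HMAC-SHA256 value.
 * Key and message here are arbitrary example data.
 */
static void example_hmac_sha256(void)
{
    const unsigned char key[] = "example-key";     /* hypothetical */
    const unsigned char msg[] = "example-message"; /* hypothetical */
    unsigned char mac[32];
    HMAC_SHA256_CTX_Y ctx;

    HMAC_SHA256_Init_Y(&ctx, key, sizeof(key) - 1);
    HMAC_SHA256_Update_Y(&ctx, msg, sizeof(msg) - 1);
    HMAC_SHA256_Final_Y(mac, &ctx);
    /* mac now holds HMAC-SHA256(key, msg) */
}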
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
+ * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
+ */
+void
+PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
+    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
+{
+    HMAC_SHA256_CTX_Y PShctx, hctx;
+    size_t i;
+    uint8_t ivec[4];
+    uint8_t U[32];
+    uint8_t T[32];
+    uint64_t j;
+    int k;
+    size_t clen;
+
+    /* Compute HMAC state after processing P and S. */
+    HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen);
+    HMAC_SHA256_Update_Y(&PShctx, salt, saltlen);
+
+    /* Iterate through the blocks. */
+    for (i = 0; i * 32 < dkLen; i++) {
+        /* Generate INT(i + 1). */
+        be32enc(ivec, (uint32_t)(i + 1));
+
+        /* Compute U_1 = PRF(P, S || INT(i)). */
+        memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y));
+        HMAC_SHA256_Update_Y(&hctx, ivec, 4);
+        HMAC_SHA256_Final_Y(U, &hctx);
+
+        /* T_i = U_1 ... */
+        memcpy(T, U, 32);
+
+        for (j = 2; j <= c; j++) {
+            /* Compute U_j. */
+            HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen);
+            HMAC_SHA256_Update_Y(&hctx, U, 32);
+            HMAC_SHA256_Final_Y(U, &hctx);
+
+            /* ... xor U_j ... */
+            for (k = 0; k < 32; k++)
+                T[k] ^= U[k];
+        }
+
+        /* Copy as many bytes as necessary into buf. */
+        clen = dkLen - i * 32;
+        if (clen > 32)
+            clen = 32;
+        memcpy(&buf[i * 32], T, clen);
+    }
+
+    /* Clean PShctx, since we never called _Final on it. */
+    memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
+}
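To make the loop structure above concrete: each 32-byte output block is T_i = U_1 xor U_2 xor ... xor U_c, where U_1 = HMAC(P, S || INT(i + 1)) and U_j = HMAC(P, U_{j-1}), so a dkLen-byte key needs ceil(dkLen / 32) blocks. A hedged usage sketch; the inputs and iteration count are arbitrary examples, not values used by yescrypt:

/* Illustrative only: derive a 64-byte key (two blocks, i = 0 and 1). */
#include <stdint.h>
#include "sph/sha256_Y.h"

static void example_pbkdf2(void)
{
    const uint8_t passwd[] = "passphrase"; /* hypothetical */
    const uint8_t salt[]   = "NaCl";       /* hypothetical */
    uint8_t dk[64];

    PBKDF2_SHA256(passwd, sizeof(passwd) - 1,
                  salt, sizeof(salt) - 1,
                  1000,             /* c: example iteration count */
                  dk, sizeof(dk));  /* dkLen = 64 -> 2 blocks */
}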
diff --git a/sph/sha256_Y.h b/sph/sha256_Y.h
new file mode 100644
index 00000000..e97b81ba
--- /dev/null
+++ b/sph/sha256_Y.h
@@ -0,0 +1,63 @@
+/*-
+ * Copyright 2005,2007,2009 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $
+ */
+
+#ifndef _SHA256_H_
+#define _SHA256_H_
+
+#include <sys/types.h>
+
+#include <stdint.h>
+
+typedef struct SHA256Context {
+    uint32_t state[8];
+    uint32_t count[2];
+    unsigned char buf[64];
+} SHA256_CTX_Y;
+
+typedef struct HMAC_SHA256Context {
+    SHA256_CTX_Y ictx;
+    SHA256_CTX_Y octx;
+} HMAC_SHA256_CTX_Y;
+
+void SHA256_Init_Y(SHA256_CTX_Y *);
+void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t);
+void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *);
+void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
+void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
+void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *);
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
+ * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
+ */
+void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
+    uint64_t, uint8_t *, size_t);
+
+#endif /* !_SHA256_H_ */
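A quick way to sanity-check the new files against a known answer: a hedged test sketch (not part of this patch) using the FIPS 180-2 vector SHA-256("abc") = ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad:

/* Illustrative self-test only; compile against sph/sha256_Y.c. */
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "sph/sha256_Y.h"

int main(void)
{
    /* FIPS 180-2 test vector: SHA-256("abc") */
    static const unsigned char want[32] = {
        0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
        0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
        0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
        0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad
    };
    unsigned char got[32];
    SHA256_CTX_Y ctx;

    SHA256_Init_Y(&ctx);
    SHA256_Update_Y(&ctx, "abc", 3);
    SHA256_Final_Y(got, &ctx);
    assert(memcmp(got, want, 32) == 0);
    puts("sha256_Y: SHA-256(\"abc\") OK");
    return 0;
}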