From 29c3f1c7144c148d3ea9ee40a510fe3c77e01d76 Mon Sep 17 00:00:00 2001
From: elbandi <bandispam@gmail.com>
Date: Mon, 21 Mar 2016 17:34:35 +0100
Subject: [PATCH] Add decred algo support

---
 Makefile.am        |   1 +
 algorithm.c        |  99 +++++++++++++++++++-------
 algorithm.h        |   1 +
 algorithm/decred.c | 170 +++++++++++++++++++++++++++++++++++++++++++++
 algorithm/decred.h |  11 +++
 kernel/decred.cl   | 161 ++++++++++++++++++++++++++++++++++++++++++
 miner.h            |  24 ++++++-
 ocl.c              |   4 +-
 ocl.h              |   2 +-
 sgminer.c          | 169 +++++++++++++++++++++++++++++++++-----------
 10 files changed, 575 insertions(+), 67 deletions(-)
 create mode 100644 algorithm/decred.c
 create mode 100644 algorithm/decred.h
 create mode 100644 kernel/decred.cl

diff --git a/Makefile.am b/Makefile.am
index c36ea1db..e8a3f3d4 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -79,6 +79,7 @@ sgminer_SOURCES += algorithm/credits.c algorithm/credits.h
 sgminer_SOURCES += algorithm/yescrypt.h algorithm/yescrypt.c algorithm/yescrypt_core.h algorithm/yescrypt-opt.c algorithm/yescryptcommon.c algorithm/sysendian.h 
 sgminer_SOURCES += algorithm/blake256.c algorithm/blake256.h
 sgminer_SOURCES += algorithm/blakecoin.c algorithm/blakecoin.h
+sgminer_SOURCES += algorithm/decred.c algorithm/decred.h
 
 bin_SCRIPTS	= $(top_srcdir)/kernel/*.cl
 
diff --git a/algorithm.c b/algorithm.c
index 24165546..ad41a494 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -39,6 +39,7 @@
 #include "algorithm/credits.h"
 #include "algorithm/blake256.h"
 #include "algorithm/blakecoin.h"
+#include "algorithm/decred.h"
 
 #include "compat.h"
 
@@ -70,6 +71,7 @@ const char *algorithm_type_str[] = {
   "Yescrypt-multi",
   "Blakecoin",
   "Blake",
+  "Decred",
   "Vanilla"
 };
 
@@ -138,6 +140,17 @@ static void append_neoscrypt_compiler_options(struct _build_kernel_data *data, s
   strcat(data->binary_filename, buf);
 }
 
+static void append_blake256_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
+{
+  char buf[255];
+  sprintf(buf, " -D LOOKUP_GAP=%d -D MAX_GLOBAL_THREADS=%lu ",
+    cgpu->lookup_gap, (unsigned long)cgpu->thread_concurrency);
+  strcat(data->compiler_options, buf);
+
+  sprintf(buf, "tc%lu", (unsigned long)cgpu->thread_concurrency);
+  strcat(data->binary_filename, buf);
+}
+
 static void append_x11_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
 {
   char buf[255];
@@ -935,30 +948,63 @@ static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_un
 
 static cl_int queue_blake_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
 {
-	cl_kernel *kernel = &clState->kernel;
-	unsigned int num = 0;
-	cl_int status = 0;
-	cl_ulong le_target;
-
-	le_target = *(cl_ulong *)(blk->work->device_target + 24);
-	flip80(clState->cldata, blk->work->data);
-	status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
-
-	CL_SET_ARG(clState->outputBuffer);
-	CL_SET_ARG(blk->work->blk.ctx_a);
-	CL_SET_ARG(blk->work->blk.ctx_b);
-	CL_SET_ARG(blk->work->blk.ctx_c);
-	CL_SET_ARG(blk->work->blk.ctx_d);
-	CL_SET_ARG(blk->work->blk.ctx_e);
-	CL_SET_ARG(blk->work->blk.ctx_f);
-	CL_SET_ARG(blk->work->blk.ctx_g);
-	CL_SET_ARG(blk->work->blk.ctx_h);
-
-	CL_SET_ARG(blk->work->blk.cty_a);
-	CL_SET_ARG(blk->work->blk.cty_b);
-	CL_SET_ARG(blk->work->blk.cty_c);
-
-	return status;
+  cl_kernel *kernel = &clState->kernel;
+  unsigned int num = 0;
+  cl_int status = 0;
+  cl_ulong le_target;
+
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(blk->work->blk.ctx_a);
+  CL_SET_ARG(blk->work->blk.ctx_b);
+  CL_SET_ARG(blk->work->blk.ctx_c);
+  CL_SET_ARG(blk->work->blk.ctx_d);
+  CL_SET_ARG(blk->work->blk.ctx_e);
+  CL_SET_ARG(blk->work->blk.ctx_f);
+  CL_SET_ARG(blk->work->blk.ctx_g);
+  CL_SET_ARG(blk->work->blk.ctx_h);
+
+  CL_SET_ARG(blk->work->blk.cty_a);
+  CL_SET_ARG(blk->work->blk.cty_b);
+  CL_SET_ARG(blk->work->blk.cty_c);
+
+  return status;
+}
+
+static cl_int queue_decred_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel = &clState->kernel;
+  unsigned int num = 0;
+  cl_int status = 0;
+
+  CL_SET_ARG(clState->outputBuffer);
+  /* Midstate */
+  CL_SET_BLKARG(ctx_a);
+  CL_SET_BLKARG(ctx_b);
+  CL_SET_BLKARG(ctx_c);
+  CL_SET_BLKARG(ctx_d);
+  CL_SET_BLKARG(ctx_e);
+  CL_SET_BLKARG(ctx_f);
+  CL_SET_BLKARG(ctx_g);
+  CL_SET_BLKARG(ctx_h);
+  /* Last 52 bytes of data (without nonce) */
+  CL_SET_BLKARG(cty_a);
+  CL_SET_BLKARG(cty_b);
+  CL_SET_BLKARG(cty_c);
+  CL_SET_BLKARG(cty_d);
+  CL_SET_BLKARG(cty_e);
+  CL_SET_BLKARG(cty_f);
+  CL_SET_BLKARG(cty_g);
+  CL_SET_BLKARG(cty_h);
+  CL_SET_BLKARG(cty_i);
+  CL_SET_BLKARG(cty_j);
+  CL_SET_BLKARG(cty_k);
+  CL_SET_BLKARG(cty_l);
+
+  return status;
 }
 
 static algorithm_settings_t algos[] = {
@@ -989,6 +1035,11 @@ static algorithm_settings_t algos[] = {
   A_CREDITS("credits"),
 #undef A_CREDITS
 
+#define A_DECRED(a) \
+  { a, ALGO_DECRED, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, decred_regenhash, decred_midstate, decred_prepare_work, queue_decred_kernel, gen_hash, append_blake256_compiler_options }
+  A_DECRED("decred"),
+#undef A_DECRED
+
 #define A_YESCRYPT(a) \
   { a, ALGO_YESCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, NULL, NULL, queue_yescrypt_kernel, gen_hash, append_neoscrypt_compiler_options}
   A_YESCRYPT("yescrypt"),
diff --git a/algorithm.h b/algorithm.h
index fc116a7d..5628d006 100644
--- a/algorithm.h
+++ b/algorithm.h
@@ -36,6 +36,7 @@ typedef enum {
   ALGO_YESCRYPT_MULTI,
   ALGO_BLAKECOIN,
   ALGO_BLAKE,
+  ALGO_DECRED,
   ALGO_VANILLA
 } algorithm_type_t;
 
diff --git a/algorithm/decred.c b/algorithm/decred.c
new file mode 100644
index 00000000..dd1654a7
--- /dev/null
+++ b/algorithm/decred.c
@@ -0,0 +1,170 @@
+/*
+ * BLAKE implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ *
+ * Modified for more speed by BlueDragon747 for the Blakecoin project
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+#include <stdint.h>
+
+#include "sph/sph_blake.h"
+#include "algorithm/decred.h"
+
+static const uint32_t diff1targ_decred = 0x000000ff;
+
+void decredhash(void *state, const void *input)
+{
+  sph_blake256_context ctx_blake;
+  sph_blake256_init(&ctx_blake);
+  sph_blake256(&ctx_blake, input, 180);
+  sph_blake256_close(&ctx_blake, state);
+}
+
+void decred_midstate(struct work *work)
+{
+  sph_blake256_context     ctx_blake;
+  sph_blake256_init(&ctx_blake);
+  sph_blake256 (&ctx_blake, (unsigned char *)work->data, 128);
+
+  memcpy(work->midstate, ctx_blake.H, 32);
+  endian_flip32(work->midstate, work->midstate);
+
+  char *strdata, *strmidstate;
+  strdata = bin2hex(work->data, 128);
+  strmidstate = bin2hex(work->midstate, 32);
+  applog(LOG_DEBUG, "data %s midstate %s", strdata, strmidstate);
+}
+
+void decred_prepare_work(dev_blk_ctx *blk, uint32_t *state, uint32_t *pdata)
+{
+	blk->ctx_a = state[0];
+	blk->ctx_b = state[1];
+	blk->ctx_c = state[2];
+	blk->ctx_d = state[3];
+	blk->ctx_e = state[4];
+	blk->ctx_f = state[5];
+	blk->ctx_g = state[6];
+	blk->ctx_h = state[7];
+
+	blk->cty_a = htobe32(pdata[32]);
+	blk->cty_b = htobe32(pdata[33]);
+	blk->cty_c = htobe32(pdata[34]);
+	/* blk->cty_d = htobe32(pdata[35] = nonce) */
+
+	blk->cty_d = htobe32(pdata[36]);
+	blk->cty_e = htobe32(pdata[37]);
+	blk->cty_f = htobe32(pdata[38]);
+	blk->cty_g = htobe32(pdata[39]);
+
+	blk->cty_h = htobe32(pdata[40]);
+	blk->cty_i = htobe32(pdata[41]);
+	blk->cty_j = htobe32(pdata[42]);
+	blk->cty_k = htobe32(pdata[43]);
+
+	blk->cty_l = htobe32(pdata[44]);
+}
+
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+/* Used externally as confirmation of correct OCL code */
+int decred_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+	uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+	uint32_t data[45], ohash[8];
+
+	memcpy(data, pdata, 180);
+	data[35] = htobe32(nonce);
+	decredhash(ohash, data);
+	tmp_hash7 = be32toh(ohash[7]);
+
+	applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+				(long unsigned int)Htarg,
+				(long unsigned int)diff1targ,
+				(long unsigned int)tmp_hash7);
+	if (tmp_hash7 > diff1targ)
+		return -1;
+	if (tmp_hash7 > Htarg)
+		return 0;
+	return 1;
+}
+
+void decred_regenhash(struct work *work)
+{
+        uint32_t data[45];
+        uint32_t *nonce = (uint32_t *)(work->data + 140);
+        uint32_t *ohash = (uint32_t *)(work->hash);
+
+        memcpy(data, work->data, 180);
+        data[35] = htobe32(*nonce);
+        decredhash(ohash, data);
+}
+
+bool scanhash_decred(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+		     unsigned char *pdata, unsigned char __maybe_unused *phash1,
+		     unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+		     uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+	uint32_t *nonce = (uint32_t *)(pdata + 140);
+	uint32_t data[45];
+	uint32_t tmp_hash7;
+	uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+	bool ret = false;
+
+	memcpy(data, pdata, 180);
+
+	while(1) {
+		uint32_t ostate[8];
+
+		*nonce = ++n;
+		data[35] = (n);
+		decredhash(ostate, data);
+		tmp_hash7 = (ostate[7]);
+
+		applog(LOG_INFO, "data7 %08lx",
+					(long unsigned int)data[7]);
+
+		if (unlikely(tmp_hash7 <= Htarg)) {
+			((uint32_t *)pdata)[35] = htobe32(n);
+			*last_nonce = n;
+			ret = true;
+			break;
+		}
+
+		if (unlikely((n >= max_nonce) || thr->work_restart)) {
+			*last_nonce = n;
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/algorithm/decred.h b/algorithm/decred.h
new file mode 100644
index 00000000..43045fa4
--- /dev/null
+++ b/algorithm/decred.h
@@ -0,0 +1,11 @@
+#ifndef DECRED_H
+#define DECRED_H
+
+#include "miner.h"
+
+extern int decred_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
+extern void decred_prepare_work(dev_blk_ctx *blk, uint32_t *state, uint32_t *pdata);
+extern void decred_midstate(struct work *work);
+extern void decred_regenhash(struct work *work);
+
+#endif /* DECRED_H */
\ No newline at end of file
diff --git a/kernel/decred.cl b/kernel/decred.cl
new file mode 100644
index 00000000..0cdf30b2
--- /dev/null
+++ b/kernel/decred.cl
@@ -0,0 +1,161 @@
+/**
+ * BLAKE256 14-round kernel
+ *
+ * Copyright 2015 Company Zero
+ * A complete kernel re-write
+ * with inspiration from the Golang BLAKE256 repo (github.com/dchest/blake256)
+ */
+
+/**
+ * optimized by tpruvot 02/2016 :
+ *
+ * GTX 960 | (5s):735.3M (avg):789.3Mh/s
+ * GTX 750 | (5s):443.3M (avg):476.8Mh/s
+ * to
+ * GTX 960 | (5s):875.0M (avg):899.2Mh/s
+ * GTX 750 | (5s):523.1M (avg):536.8Mh/s
+ */
+#define ROTR(v,n) rotate(v,(uint)(32U-n))
+#define ROTL(v,n) rotate(v, n)
+
+#ifdef _AMD_OPENCL
+#define SWAP(v)   rotate(v, 16U)
+#define ROTR8(v)  rotate(v, 24U)
+#else
+#define SWAP(v)  as_uint(as_uchar4(v).zwxy)
+#define ROTR8(v) as_uint(as_uchar4(v).yzwx)
+#endif
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(
+	volatile __global uint * restrict output,
+	// Midstate
+	const uint h0,
+	const uint h1,
+	const uint h2,
+	const uint h3,
+	const uint h4,
+	const uint h5,
+	const uint h6,
+	const uint h7,
+
+	// last 52 bytes of data
+	const uint M0,
+	const uint M1,
+	const uint M2,
+	// const uint M3 : nonce
+	const uint M4,
+	const uint M5,
+	const uint M6,
+	const uint M7,
+	const uint M8,
+	const uint M9,
+	const uint MA,
+	const uint MB,
+	const uint MC
+)
+{
+	/* Load the block header and padding */
+	const uint M3 = get_global_id(0);
+	const uint MD = 0x80000001UL;
+	const uint ME = 0x00000000UL;
+	const uint MF = 0x000005a0UL;
+
+	const uint cst0 = 0x243F6A88UL;
+	const uint cst1 = 0x85A308D3UL;
+	const uint cst2 = 0x13198A2EUL;
+	const uint cst3 = 0x03707344UL;
+	const uint cst4 = 0xA4093822UL;
+	const uint cst5 = 0x299F31D0UL;
+	const uint cst6 = 0x082EFA98UL;
+	const uint cst7 = 0xEC4E6C89UL;
+	const uint cst8 = 0x452821E6UL;
+	const uint cst9 = 0x38D01377UL;
+	const uint cstA = 0xBE5466CFUL;
+	const uint cstB = 0x34E90C6CUL;
+	const uint cstC = 0xC0AC29B7UL;
+	const uint cstD = 0xC97C50DDUL;
+	const uint cstE = 0x3F84D5B5UL;
+	const uint cstF = 0xB5470917UL;
+
+	uint V0, V1, V2, V3, V4, V5, V6, V7;
+	uint V8, V9, VA, VB, VC, VD, VE, VF;
+	uint pre7;
+
+	/* Load the midstate and initialize */
+	V0 = h0;
+	V1 = h1;
+	V2 = h2;
+	V3 = h3;
+	V4 = h4;
+	V5 = h5;
+	V6 = h6;
+	pre7 = V7 = h7;
+
+	V8 = cst0;
+	V9 = cst1;
+	VA = cst2;
+	VB = cst3;
+	VC = 0xA4093D82UL;
+	VD = 0x299F3470UL;
+	VE = cst6;
+	VF = cst7;
+
+	/* 14 rounds */
+
+	V0 = V0 + (M0 ^ cst1); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M2 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M4 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M6 ^ cst7); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M5 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M7 ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M3 ^ cst2); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M1 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M8 ^ cst9); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MA ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (ME ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MF ^ cstE); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MB ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M9 ^ cst8); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (ME ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M9 ^ cstF); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MD ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MF ^ cst9); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M6 ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M1 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M0 ^ cst2); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MB ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M5 ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M7 ^ cstB); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M3 ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M2 ^ cst0); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MC ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MB ^ cst8); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (MC ^ cst0); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M5 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MF ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M2 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MD ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M0 ^ cstC); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M8 ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M7 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M9 ^ cst4); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M1 ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M4 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (ME ^ cstA); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M7 ^ cst9); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M3 ^ cst1); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MB ^ cstE); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (ME ^ cstB); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M1 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M9 ^ cst7); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M2 ^ cst6); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M5 ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M4 ^ cst0); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MF ^ cst8); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M0 ^ cst4); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M8 ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MA ^ cst5); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M6 ^ cst2); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M9 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M5 ^ cst7); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M2 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MA ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M4 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MF ^ cstA); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M7 ^ cst5); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M0 ^ cst9); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (ME ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MB ^ cstC); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M6 ^ cst8); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M3 ^ cstD); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M8 ^ cst6); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MD ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MC ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M1 ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M2 ^ cstC); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M6 ^ cstA); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M0 ^ cstB); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M8 ^ cst3); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MB ^ cst0); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M3 ^ cst8); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (MA ^ cst6); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MC ^ cst2); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M4 ^ cstD); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M7 ^ cst5); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MF ^ cstE); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M1 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (ME ^ cstF); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M9 ^ cst1); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M5 ^ cst7); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MD ^ cst4); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MC ^ cst5); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M1 ^ cstF); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (ME ^ cstD); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M4 ^ cstA); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MD ^ cstE); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MA ^ cst4); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (MF ^ cst1); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M5 ^ cstC); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M0 ^ cst7); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M9 ^ cst2); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M8 ^ cstB); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M2 ^ cst9); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MB ^ cst8); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M7 ^ cst0); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MD ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M7 ^ cstE); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MC ^ cst1); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M3 ^ cst9); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M1 ^ cstC); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M9 ^ cst3); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (ME ^ cst7); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MB ^ cstD); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M5 ^ cst0); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MF ^ cst4); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M8 ^ cst6); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M2 ^ cstA); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M6 ^ cst8); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MA ^ cst2); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M4 ^ cstF); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M0 ^ cst5); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M6 ^ cstF); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (ME ^ cst9); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MB ^ cst3); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M0 ^ cst8); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M3 ^ cstB); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M8 ^ cst0); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M9 ^ cstE); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MF ^ cst6); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MC ^ cst2); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MD ^ cst7); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M1 ^ cst4); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MA ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M4 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M5 ^ cstA); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M7 ^ cstD); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M2 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MA ^ cst2); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M7 ^ cst6); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M1 ^ cst5); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M6 ^ cst7); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M5 ^ cst1); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M2 ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MF ^ cstB); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M9 ^ cstE); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M3 ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MD ^ cst0); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MC ^ cst3); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M0 ^ cstD); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (ME ^ cst9); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MB ^ cstF); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M0 ^ cst1); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M2 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M4 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M6 ^ cst7); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M5 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M7 ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M3 ^ cst2); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M1 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M8 ^ cst9); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MA ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (ME ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MF ^ cstE); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MB ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M9 ^ cst8); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (ME ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M9 ^ cstF); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MD ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MF ^ cst9); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M6 ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M1 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M0 ^ cst2); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MB ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M5 ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M7 ^ cstB); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M3 ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M2 ^ cst0); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MC ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MB ^ cst8); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (MC ^ cst0); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M5 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MF ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M2 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MD ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M0 ^ cstC); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M8 ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M7 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M9 ^ cst4); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M1 ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M4 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (ME ^ cstA); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M7 ^ cst9); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M3 ^ cst1); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MB ^ cstE); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (ME ^ cstB); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M1 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M9 ^ cst7); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M2 ^ cst6); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M5 ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M4 ^ cst0); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MF ^ cst8); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M0 ^ cst4); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M8 ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MA ^ cst5); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M6 ^ cst2); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF);/*VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);*/
+
+	/* The final chunks of the hash
+	 * are calculated as:
+	 * h0 = h0 ^ V0 ^ V8;
+	 * h1 = h1 ^ V1 ^ V9;
+	 * h2 = h2 ^ V2 ^ VA;
+	 * h3 = h3 ^ V3 ^ VB;
+	 * h4 = h4 ^ V4 ^ VC;
+	 * h5 = h5 ^ V5 ^ VD;
+	 * h6 = h6 ^ V6 ^ VE;
+	 * h7 = h7 ^ V7 ^ VF;
+	 *
+	 * We just check if the last byte
+	 * is zeroed and if it is, we tell
+	 * cgminer that we've found a
+	 * and to check it against the
+	 * target.
+	*/
+
+	/* Debug code to help you assess the correctness
+	 * of your hashing function in case someone decides
+	 * to try to optimize.
+	if (!((pre7 ^ V7 ^ VF) & 0xFFFF0000)) {
+		printf("hash on gpu %x %x %x %x %x %x %x %x\n",
+			h0 ^ V0 ^ V8,
+			h1 ^ V1 ^ V9,
+			h2 ^ V2 ^ VA,
+			h3 ^ V3 ^ VB,
+			h4 ^ V4 ^ VC,
+			h5 ^ V5 ^ VD,
+			h6 ^ V6 ^ VE,
+			h7 ^ V7 ^ VF);
+		printf("nonce for hash on gpu %x\n",
+			nonce);
+	}
+	*/
+
+	if (pre7 ^ V7 ^ VF) return;
+
+	/* Push this share */
+	output[output[0xFF]++] = M3;
+}
diff --git a/miner.h b/miner.h
index 6d08f246..3d6ce429 100644
--- a/miner.h
+++ b/miner.h
@@ -736,6 +736,16 @@ static inline void flip168(void *dest_p, const void *src_p)
 		dest[i] = swab32(src[i]);
 }
 
+static inline void flip180(void *dest_p, const void *src_p)
+{
+	uint32_t *dest = (uint32_t *)dest_p;
+	const uint32_t *src = (uint32_t *)src_p;
+	int i;
+
+	for (i = 0; i < 45; i++)
+		dest[i] = swab32(src[i]);
+}
+
 /*
  * Encode a length len/4 vector of (uint32_t) into a length len vector of
  * (unsigned char) in big-endian form.  Assumes len is a multiple of 4.
@@ -762,7 +772,11 @@ static inline void endian_flip128(void *dest_p, const void *src_p)
 }
 static inline void endian_flip168(void *dest_p, const void *src_p)
 {
-	flip168(dest_p, src_p);
+  flip168(dest_p, src_p);
+}
+static inline void endian_flip180(void *dest_p, const void *src_p)
+{
+  flip180(dest_p, src_p);
 }
 
 #else
@@ -779,6 +793,10 @@ static inline void
 endian_flip168(void __maybe_unused *dest_p, const void __maybe_unused *src_p)
 {
 }
+static inline void
+endian_flip180(void __maybe_unused *dest_p, const void __maybe_unused *src_p)
+{
+}
 #endif
 
 
@@ -1232,6 +1250,8 @@ typedef struct _dev_blk_ctx {
   cl_uint ctx_e; cl_uint ctx_f; cl_uint ctx_g; cl_uint ctx_h;
   cl_uint cty_a; cl_uint cty_b; cl_uint cty_c; cl_uint cty_d;
   cl_uint cty_e; cl_uint cty_f; cl_uint cty_g; cl_uint cty_h;
+  cl_uint cty_i; cl_uint cty_j; cl_uint cty_k; cl_uint cty_l;
+  cl_uint cty_m; cl_uint cty_n; cl_uint cty_o; cl_uint cty_p;
   cl_uint merkle; cl_uint ntime; cl_uint nbits; cl_uint nonce;
   cl_uint fW0; cl_uint fW1; cl_uint fW2; cl_uint fW3; cl_uint fW15;
   cl_uint fW01r; cl_uint fcty_e; cl_uint fcty_e2;
@@ -1442,7 +1462,7 @@ struct pool {
 #define GETWORK_MODE_GBT 'G'
 
 struct work {
-  unsigned char data[168];
+  unsigned char data[256];
   unsigned char midstate[32];
   unsigned char target[32];
   unsigned char hash[32];
diff --git a/ocl.c b/ocl.c
index ecc383c8..c34f191b 100644
--- a/ocl.c
+++ b/ocl.c
@@ -758,7 +758,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
   size_t buf1size;
   size_t buf3size;
   size_t buf2size;
-  size_t readbufsize = (algorithm->type == ALGO_CRE) ? 168 : 128;
+  size_t readbufsize = 128;
+  if (algorithm->type == ALGO_CRE) readbufsize = 168;
+  else if (algorithm->type == ALGO_DECRED) readbufsize = 192;
 
   if (algorithm->rw_buffer_size < 0) {
     // calc buffer size for neoscrypt
diff --git a/ocl.h b/ocl.h
index 5d423249..311db29e 100644
--- a/ocl.h
+++ b/ocl.h
@@ -17,7 +17,7 @@ typedef struct __clState {
   cl_mem buffer1;
   cl_mem buffer2;
   cl_mem buffer3;
-  unsigned char cldata[168];
+  unsigned char cldata[256];
   bool goffset;
   cl_uint vwidth;
   size_t max_work_size;
diff --git a/sgminer.c b/sgminer.c
index f317da7b..a208f510 100644
--- a/sgminer.c
+++ b/sgminer.c
@@ -2064,17 +2064,31 @@ static void update_gbt(struct pool *pool)
 /* Return the work coin/network difficulty */
 static double get_work_blockdiff(const struct work *work)
 {
+  uint32_t* data = (uint32_t*) work->data;
   uint64_t diff64;
   double numerator;
+  int powdiff;
+  uint8_t shift;
 
   // Neoscrypt has the data reversed
   if (work->pool->algorithm.type == ALGO_NEOSCRYPT) {
     diff64 = bswap_64(((uint64_t)(be32toh(*((uint32_t *)(work->data + 72))) & 0xFFFFFF00)) << 8);
     numerator = (double)work->pool->algorithm.diff_numerator;
   }
+  else if (work->pool->algorithm.type == ALGO_DECRED) {
+    shift = work->data[116+3];
+    powdiff = (8 * (0x1d - 3)) - (8 * (shift - 3));
+    diff64 = data[29] & 0xFFFFFF;
+    if (!diff64) diff64 = 1;
+    double d = (double)work->pool->algorithm.diff_numerator / (double)diff64;
+    for (int m = shift; m < 29; m++) d *= 256.0;
+    for (int m = 29; m < shift; m++) d /= 256.0;
+    if (shift == 28) d *= 256.0; // testnet
+    return d;
+  }
   else {
-    uint8_t pow = work->data[72];
-    int powdiff = (8 * (0x1d - 3)) - (8 * (pow - 3));;
+    shift = work->data[72];
+    powdiff = (8 * (0x1d - 3)) - (8 * (shift - 3));;
     diff64 = be32toh(*((uint32_t *)(work->data + 72))) & 0x0000000000FFFFFF;
     numerator = work->pool->algorithm.diff_numerator << powdiff;
   }
@@ -2245,15 +2259,15 @@ static bool gbt_decode(struct pool *pool, json_t *res_val)
 static bool getwork_decode(json_t *res_val, struct work *work)
 {
   size_t worklen = 128;
-  worklen = ((work->pool->algorithm.type == ALGO_CRE) ? sizeof(work->data) : worklen);
+  if (work->pool->algorithm.type == ALGO_CRE) worklen = 168;
+  else if (work->pool->algorithm.type == ALGO_DECRED) worklen = 192;
   if (unlikely(!jobj_binary(res_val, "data", work->data, worklen, true))) {
     if (opt_morenotices)
       applog(LOG_ERR, "%s: JSON inval data", isnull(get_pool_name(work->pool), ""));
     return false;
   }
 
-  // Neoscrypt doesn't calc midstate
-  if (work->pool->algorithm.type != ALGO_NEOSCRYPT) {
+  if (work->pool->algorithm.type == ALGO_CRE || work->pool->algorithm.type == ALGO_SCRYPT) {
     if (!jobj_binary(res_val, "midstate", work->midstate, sizeof(work->midstate), false)) {
       // Calculate it ourselves
       if (opt_morenotices) {
@@ -2268,6 +2282,11 @@ static bool getwork_decode(json_t *res_val, struct work *work)
       applog(LOG_ERR, "%s: JSON inval target", isnull(get_pool_name(work->pool), ""));
     return false;
   }
+  if (work->pool->algorithm.type == ALGO_DECRED) {
+    // some random extradata to make it unique
+    ((uint32_t*)work->data)[36] = (rand()*4);
+    ((uint32_t*)work->data)[37] = (rand()*4) << 8 | work->thr_id;
+  }
   return true;
 }
 
@@ -3004,7 +3023,9 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s
 
   cgpu = get_thr_cgpu(thr_id);
 
-  if (work->pool->algorithm.type == ALGO_CRE) {
+  if (work->pool->algorithm.type == ALGO_DECRED) {
+    endian_flip180(work->data, work->data);
+  } else if (work->pool->algorithm.type == ALGO_CRE) {
     endian_flip168(work->data, work->data);
   } else {
     endian_flip128(work->data, work->data);
@@ -3014,6 +3035,13 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s
   int datasize = 128;
   if (work->pool->algorithm.type == ALGO_NEOSCRYPT) datasize = 80;
   else if (work->pool->algorithm.type == ALGO_CRE) datasize = 168;
+  else if (work->pool->algorithm.type == ALGO_DECRED) {
+    datasize = 192;
+    ((uint32_t*)work->data)[45] = 0x80000001UL;
+    ((uint32_t*)work->data)[46] = 0;
+    ((uint32_t*)work->data)[47] = 0x000005a0UL;
+  }
+
   hexstr = bin2hex(work->data, datasize);
 
   /* build JSON-RPC request */
@@ -3670,15 +3698,44 @@ static inline bool can_roll(struct work *work)
     work->rolls < 7000 && !stale_work(work, false));
 }
 
+static uint32_t _get_work_time(struct work *work)
+{
+  uint32_t *data = (uint32_t*) work->data;
+  uint32_t work_ntime = data[17];
+  if (work->pool && work->pool->algorithm.type == ALGO_DECRED) {
+    work_ntime = data[34];
+  }
+  return work_ntime;
+}
+
+static void _set_work_time(struct work *work, uint32_t ntime)
+{
+  uint32_t *data = (uint32_t*) work->data;
+  uint32_t *work_ntime = &data[17];
+  if (work->pool && work->pool->algorithm.type == ALGO_DECRED) {
+    work_ntime = &data[34];
+  }
+  (*work_ntime) = ntime;
+}
+
 static void roll_work(struct work *work)
 {
-  uint32_t *work_ntime;
+  uint32_t work_ntime;
   uint32_t ntime;
 
-  work_ntime = (uint32_t *)(work->data + 68);
-  ntime = be32toh(*work_ntime);
+  work_ntime = _get_work_time(work);
+  ntime = be32toh(work_ntime);
   ntime++;
-  *work_ntime = htobe32(ntime);
+
+  if (work->pool->algorithm.type == ALGO_DECRED) {
+    uint32_t* data = (uint32_t*) work->data;
+    // dont mess with ntime, use extranonce
+    data[36]++;
+    data[37] = ((rand()*4) << 8) | work->thr_id;
+  } else {
+    _set_work_time(work, htobe32(ntime));
+  }
+
   local_work++;
   work->rolls++;
   work->blk.nonce = 0;
@@ -3736,6 +3793,12 @@ static struct work *make_clone(struct work *work)
 {
   struct work *work_clone = copy_work(work);
 
+  if (work->pool->algorithm.type == ALGO_DECRED) {
+    // maybe not useful here
+    ((uint32_t*)work->data)[36] = (rand()*4);
+    ((uint32_t*)work->data)[37] = (rand()*4) << 8;
+  }
+
   work_clone->clone = true;
   cgtime((struct timeval *)&(work_clone->tv_cloned));
   work_clone->longpoll = false;
@@ -3854,20 +3917,18 @@ static void _copy_work(struct work *work, const struct work *base_work, int noff
     /* If we are passed an noffset the binary work->data ntime and
      * the work->ntime hex string need to be adjusted. */
     if (noffset) {
-      uint32_t *work_ntime = (uint32_t *)(work->data + 68);
-      uint32_t ntime = be32toh(*work_ntime);
-
+      uint32_t work_ntime = _get_work_time(work);
+      uint32_t ntime = be32toh(work_ntime);
       ntime += noffset;
-      *work_ntime = htobe32(ntime);
+      _set_work_time(work, htobe32(ntime));
       work->ntime = offset_ntime(base_work->ntime, noffset);
     } else
       work->ntime = strdup(base_work->ntime);
   } else if (noffset) {
-    uint32_t *work_ntime = (uint32_t *)(work->data + 68);
-    uint32_t ntime = be32toh(*work_ntime);
-
+    uint32_t work_ntime = _get_work_time(work);
+    uint32_t ntime = be32toh(work_ntime);
     ntime += noffset;
-    *work_ntime = htobe32(ntime);
+    _set_work_time(work, htobe32(ntime));
   }
   if (base_work->coinbase)
     work->coinbase = strdup(base_work->coinbase);
@@ -5522,11 +5583,10 @@ static void *stratum_sthread(void *userdata)
     quit(1, "Failed to create stratum_q in stratum_sthread");
 
   while (42) {
-    char noncehex[12], nonce2hex[20], s[1024];
+    char noncehex[12], nonce2hex[33], s[1024] = { 0 };
     struct stratum_share *sshare;
     uint32_t *hash32, nonce;
-    unsigned char nonce2[8];
-    uint64_t *nonce2_64;
+    unsigned char nonce2[16];
     struct work *work;
     bool submitted;
 
@@ -5538,7 +5598,7 @@ static void *stratum_sthread(void *userdata)
     if (unlikely(!work))
       quit(1, "Stratum q returned empty work");
 
-    if (unlikely(work->nonce2_len > 8)) {
+    if ((pool->algorithm.type != ALGO_DECRED && unlikely(work->nonce2_len > 8)) || (pool->algorithm.type == ALGO_DECRED && unlikely(work->nonce2_len > 16))) {
       applog(LOG_ERR, "%s asking for inappropriately long nonce2 length %d", get_pool_name(pool), (int)work->nonce2_len);
       applog(LOG_ERR, "Not attempting to submit shares");
       free_work(work);
@@ -5561,12 +5621,19 @@ static void *stratum_sthread(void *userdata)
       nonce = htobe32(*((uint32_t *)(work->data + 76)));
       //*((uint32_t *)nonce2) = htole32(work->nonce2);
     }
+    else if (pool->algorithm.type == ALGO_DECRED) {
+      nonce = *((uint32_t *)(work->data + 140));
+    }
     else {
       nonce = *((uint32_t *)(work->data + 76));
     }
     __bin2hex(noncehex, (const unsigned char *)&nonce, 4);
 
-    *((uint64_t *)nonce2) = htole64(work->nonce2);
+    if (pool->algorithm.type == ALGO_DECRED) {
+      memcpy(nonce2, work->data + 144, work->nonce2_len);
+    } else {
+      *((uint64_t *)nonce2) = htole64(work->nonce2);
+    }
     __bin2hex(nonce2hex, nonce2, work->nonce2_len);
     memset(s, 0, 1024);
 
@@ -6033,29 +6100,38 @@ void set_target_neoscrypt(unsigned char *target, double diff, const int thr_id)
 static void gen_stratum_work(struct pool *pool, struct work *work)
 {
   unsigned char merkle_root[32], merkle_sha[64];
+  int datasize = 128;
   uint32_t *data32, *swap32;
   uint64_t nonce2le;
   int i, j;
 
   cg_wlock(&pool->data_lock);
 
-  /* Update coinbase. Always use an LE encoded nonce2 to fill in values
-  * from left to right and prevent overflow errors with small n2sizes */
-  nonce2le = htole64(pool->nonce2);
-  memcpy(pool->coinbase + pool->nonce2_offset, &nonce2le, pool->n2size);
-  work->nonce2 = pool->nonce2++;
-  work->nonce2_len = pool->n2size;
+  if (pool->algorithm.type == ALGO_DECRED) {
+    datasize = 180;
+    work->nonce2 = pool->nonce2++;
+    work->nonce2_len = pool->n2size;
+  } else {
+    /* Update coinbase. Always use an LE encoded nonce2 to fill in values
+    * from left to right and prevent overflow errors with small n2sizes */
+    nonce2le = htole64(pool->nonce2);
+    memcpy(pool->coinbase + pool->nonce2_offset, &nonce2le, pool->n2size);
+    work->nonce2 = pool->nonce2++;
+    work->nonce2_len = pool->n2size;
+  }
 
   /* Downgrade to a read lock to read off the pool variables */
   cg_dwlock(&pool->data_lock);
 
-  /* Generate merkle root */
-  pool->algorithm.gen_hash(pool->coinbase, pool->swork.cb_len, merkle_root);
-  memcpy(merkle_sha, merkle_root, 32);
-  for (i = 0; i < pool->swork.merkles; i++) {
-    memcpy(merkle_sha + 32, pool->swork.merkle_bin[i], 32);
-    gen_hash(merkle_sha, 64, merkle_root);
+  if (pool->algorithm.type != ALGO_DECRED) {
+    /* Generate merkle root */
+    pool->algorithm.gen_hash(pool->coinbase, pool->swork.cb_len, merkle_root);
     memcpy(merkle_sha, merkle_root, 32);
+    for (i = 0; i < pool->swork.merkles; i++) {
+      memcpy(merkle_sha + 32, pool->swork.merkle_bin[i], 32);
+      gen_hash(merkle_sha, 64, merkle_root);
+      memcpy(merkle_sha, merkle_root, 32);
+    }
   }
 
   applog(LOG_DEBUG, "[THR%d] gen_stratum_work() - algorithm = %s", work->thr_id, pool->algorithm.name);
@@ -6089,6 +6165,18 @@ static void gen_stratum_work(struct pool *pool, struct work *work)
     ((uint32_t *)work->data)[20] = 0x80000000;
     ((uint32_t *)work->data)[31] = 0x00000280;
   }
+  else if (pool->algorithm.type == ALGO_DECRED) {
+    memcpy(work->data, pool->header_bin, 4); // version
+    flip32(work->data + 4, pool->header_bin + 4); // prevhash
+    memcpy(work->data + 4 + 32, pool->coinbase, MIN((int)pool->swork.cb_len, 108));
+    memcpy(work->data + 144, pool->nonce1bin, MIN(pool->n1_len, 36));
+    ((uint32_t *)work->data)[36] = work->nonce2;
+//    ((uint32_t *)work->data)[36] = 2;
+    ((uint32_t *)work->data)[37] = ((rand() * 4) << 8) | work->thr_id;
+//    ((uint32_t *)work->data)[37] = 0x0000a400;
+    for (i = 39; i < 45; i++)
+      ((uint32_t *)work->data)[i] = 0;
+  }
   else {
     data32 = (uint32_t *)merkle_sha;
     swap32 = (uint32_t *)merkle_root;
@@ -6112,14 +6200,16 @@ static void gen_stratum_work(struct pool *pool, struct work *work)
   if (opt_debug) {
     char *header, *merkle_hash;
 
-    header = bin2hex(work->data, 128);
-    merkle_hash = bin2hex((const unsigned char *)merkle_root, 32);
-    applog(LOG_DEBUG, "[THR%d] Generated stratum merkle %s", work->thr_id, merkle_hash);
+    header = bin2hex(work->data, datasize);
+    if (pool->algorithm.type != ALGO_DECRED) {
+        merkle_hash = bin2hex((const unsigned char *)merkle_root, 32);
+        applog(LOG_DEBUG, "[THR%d] Generated stratum merkle %s", work->thr_id, merkle_hash);
+        free(merkle_hash);
+    }
     applog(LOG_DEBUG, "[THR%d] Generated stratum header %s", work->thr_id, header);
     applog(LOG_DEBUG, "[THR%d] Work job_id %s nonce2 %"PRIu64" ntime %s", work->thr_id, work->job_id,
            work->nonce2, work->ntime);
     free(header);
-    free(merkle_hash);
   }
 
   // For Neoscrypt use set_target_neoscrypt() function
@@ -7055,6 +7145,7 @@ static void rebuild_nonce(struct work *work, uint32_t nonce)
 {
   uint32_t nonce_pos = 76;
   if (work->pool->algorithm.type == ALGO_CRE) nonce_pos = 140;
+  else if (work->pool->algorithm.type == ALGO_DECRED) nonce_pos = 140;
 
   uint32_t *work_nonce = (uint32_t *)(work->data + nonce_pos);