pluck: adaptation from djm repo

remains the cpu validation check to do... throughput for this algo is divided by 128 to keep same kind of intensity values (default 18.0)
2015-03-08 14:51:40 +01:00 · 2015-03-08 14:51:40 +01:00 · 7939dce0aa
commit 7939dce0aa
parent 10d4f4a96a
9 changed files with 1164 additions and 5 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -46,6 +46,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
 			  sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
+			  pluck/pluck.cu pluck/cuda_pluck.cu \
 			  qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/doom.cu \
 			  x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -97,6 +97,7 @@ enum sha_algos {
 	ALGO_MYR_GR,
 	ALGO_NIST5,
 	ALGO_PENTABLAKE,
+	ALGO_PLUCK,
 	ALGO_QUARK,
 	ALGO_QUBIT,
 	ALGO_S3,
@ -128,6 +129,7 @@ static const char *algo_names[] = {
 	"myr-gr",
 	"nist5",
 	"penta",
+	"pluck",
 	"quark",
 	"qubit",
 	"s3",
@ -237,6 +239,7 @@ Options:\n\
 			myr-gr      Myriad-Groestl\n\
 			nist5       NIST5 (TalkCoin)\n\
 			penta       Pentablake hash (5x Blake 512)\n\
+			pluck       SupCoin\n\
 			quark       Quark\n\
 			qubit       Qubit\n\
 			s3          S3 (1Coin)\n\
@ -1059,6 +1062,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)

 	switch (opt_algo) {
 		case ALGO_JACKPOT:
+		case ALGO_PLUCK:
 			diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
 			break;
 		case ALGO_DMD_GR:
@ -1272,6 +1276,9 @@ static void *miner_thread(void *userdata)
 			case ALGO_LYRA2:
 				minmax = 0x100000;
 				break;
+			case ALGO_PLUCK:
+				minmax = 0x2000;
+				break;
 			}
 			max64 = max(minmax-1, max64);
 		}
@ -1397,6 +1404,11 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;

+		case ALGO_PLUCK:
+			rc = scanhash_pluck(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done);
+			break;
+
 		case ALGO_S3:
 			rc = scanhash_s3(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
@ -2237,8 +2249,8 @@ int main(int argc, char *argv[])
 	printf("    Built with the nVidia CUDA SDK 6.5\n\n");
 #endif
 	printf("  Originally based on cudaminer by Christian Buchner and Christian H.,\n");
-	printf("  Include some of djm34 additions and sp optimisations\n");
-	printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n");
+	printf("  Include some work of djm34, sp, tsiv and klausT\n\n");
+	printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n");

 	rpc_user = strdup("");
 	rpc_pass = strdup("");
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -313,6 +313,7 @@
    <ClInclude Include="cpuminer-config.h" />
    <ClInclude Include="cuda_groestlcoin.h" />
    <ClInclude Include="cuda_helper.h" />
+    <ClInclude Include="cuda_vector.h" />
    <ClInclude Include="elist.h" />
    <ClInclude Include="heavy\heavy.h" />
    <ClInclude Include="hefty1.h" />
@ -416,6 +417,8 @@
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
+    <CudaCompile Include="pluck\pluck.cu" />
+    <CudaCompile Include="pluck\cuda_pluck.cu" />
    <CudaCompile Include="quark\cuda_quark_groestl512.cu">
    </CudaCompile>
    <CudaCompile Include="quark\cuda_quark_keccak512.cu">
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -308,6 +308,9 @@
    <ClInclude Include="cuda_helper.h">
      <Filter>Header Files\CUDA</Filter>
    </ClInclude>
+    <ClInclude Include="cuda_vector.h">
+      <Filter>Header Files\CUDA</Filter>
+    </ClInclude>
    <ClInclude Include="sph\sph_hamsi.h">
      <Filter>Header Files\sph</Filter>
    </ClInclude>
@ -394,6 +397,12 @@
    <CudaCompile Include="JHA\cuda_jha_compactionTest.cu">
      <Filter>Source Files\CUDA\JHA</Filter>
    </CudaCompile>
+    <CudaCompile Include="pluck\pluck.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
+    <CudaCompile Include="pluck\cuda_pluck.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
    <CudaCompile Include="quark\cuda_jh512.cu">
      <Filter>Source Files\CUDA\quark</Filter>
    </CudaCompile>
@ -573,4 +582,4 @@
      <Filter>Ressources</Filter>
    </ResourceCompile>
  </ItemGroup>
-</Project>
+</Project>
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -57,11 +57,13 @@ extern const uint3 threadIdx;
 #endif

 #if __CUDA_ARCH__ < 320
-// Kepler (Compute 3.0)
+// Host and Compute 3.0
 #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
 #else
-// Kepler (Compute 3.5, 5.0)
+// Compute 3.2+
 #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
+#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
 #endif

 __device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
@ -285,6 +287,45 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
 	return result;
 }

+// device asm 32 for pluck
+__device__ __forceinline__
+uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) {
+	uint32_t result;
+	asm("{ .reg .u32 m,n,o;\n\t"
+		"and.b32 m,  %1, %2;\n\t"
+		" or.b32 n,  %1, %2;\n\t"
+		"and.b32 o,   n, %3;\n\t"
+		" or.b32 %0,  m, o ;\n\t"
+		"}\n\t"
+		: "=r"(result) : "r"(a), "r"(b), "r"(c));
+	return result;
+}
+
+__device__ __forceinline__
+uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) {
+	uint32_t result;
+	asm("{ .reg .u32 t1;\n\t"
+		"xor.b32 t1, %2, %3;\n\t"
+		"xor.b32 %0, %1, t1;\n\t"
+		"}"
+		: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
+	return result;
+}
+
+__device__ __forceinline__
+uint32_t shr_t32(uint32_t x,uint32_t n) {
+	uint32_t result;
+	asm("shr.b32 %0,%1,%2;"	: "=r"(result) : "r"(x), "r"(n));
+	return result;
+}
+
+__device__ __forceinline__
+uint32_t shl_t32(uint32_t x,uint32_t n) {
+	uint32_t result;
+	asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
+	return result;
+}
+
 #ifndef USE_ROT_ASM_OPT
 #define USE_ROT_ASM_OPT 1
 #endif
--- a/cuda_vector.h
+++ b/cuda_vector.h
@ -0,0 +1,245 @@
+#ifndef CUDA_VECTOR_H
+#define CUDA_VECTOR_H
+
+
+///////////////////////////////////////////////////////////////////////////////////
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
+#define __LDG_PTR   "l"
+#else
+#define __LDG_PTR   "r"
+#endif
+
+#include "cuda_helper.h"
+
+//typedef __device_builtin__ struct ulong16 ulong16;
+
+typedef struct __align__(32) uint8
+{
+	unsigned int s0, s1, s2, s3, s4, s5, s6, s7;
+} uint8;
+
+typedef struct __align__(64) uint16
+{
+	union {
+		struct {unsigned int  s0, s1, s2, s3, s4, s5, s6, s7;};
+		uint8 lo;
+	};
+	union {
+		struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;};
+		uint8 hi;
+	};
+} uint16;
+
+
+static __inline__ __host__ __device__ uint16 make_uint16(
+	unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7,
+	unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf)
+{
+	uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+	t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf;
+	return t;
+}
+
+static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b)
+{
+	uint16 t; t.lo=a; t.hi=b; return t;
+}
+
+static __inline__ __host__ __device__ uint8 make_uint8(
+	unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7)
+{
+	uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+	return t;
+}
+
+
+static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
+static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+
+
+static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
+static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+
+
+static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
+static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); }
+
+
+static __forceinline__ __device__  __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); }
+
+static __forceinline__ __device__  __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); }
+
+static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) {
+	return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7,
+		a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf);
+}
+
+static __forceinline__ __device__  __host__ uint16 operator+ (const uint16 &a, const uint16 &b) {
+	return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7,
+		a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf);
+}
+
+static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; }
+static __forceinline__ __device__  __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; }
+static __forceinline__ __device__  __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; }
+
+
+static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
+static __forceinline__ __device__  __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
+static __forceinline__ __device__  __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
+
+
+static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift)
+{
+	uint32_t ret;
+	asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
+	return ret;
+}
+
+
+static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift)
+{
+	uint32_t ret;
+	asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
+	return ret;
+}
+
+
+static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr)
+{
+	uint8 test;
+	asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr));
+	asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr));
+	return (test);
+}
+
+
+static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr)
+{
+	uint32_t test;
+	asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr));
+	return (test);
+}
+
+static __device__ __inline__ uint32_t __ldgtoint64(const uint8_t *ptr)
+{
+	uint64_t test;
+	asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr));
+	return (test);
+}
+
+
+static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr)
+{
+	uint32_t test;
+	asm volatile ("{\n\t"
+		".reg .u8 a,b,c,d; \n\t"
+	"ld.global.nc.u8 a,[%1]; \n\t"
+	"ld.global.nc.u8 b,[%1+1]; \n\t"
+	"ld.global.nc.u8 c,[%1+2]; \n\t"
+	"ld.global.nc.u8 d,[%1+3]; \n\t"
+	"mov.b32 %0,{a,b,c,d}; }\n\t"
+		: "=r"(test) : __LDG_PTR(ptr));
+	return (test);
+}
+
+static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr)
+{
+	uint64_t test;
+	asm volatile ("{\n\t"
+		".reg .u8 a,b,c,d,e,f,g,h; \n\t"
+		".reg .u32 i,j; \n\t"
+		"ld.global.nc.u8 a,[%1]; \n\t"
+		"ld.global.nc.u8 b,[%1+1]; \n\t"
+		"ld.global.nc.u8 c,[%1+2]; \n\t"
+		"ld.global.nc.u8 d,[%1+3]; \n\t"
+		"ld.global.nc.u8 e,[%1+4]; \n\t"
+		"ld.global.nc.u8 f,[%1+5]; \n\t"
+		"ld.global.nc.u8 g,[%1+6]; \n\t"
+		"ld.global.nc.u8 h,[%1+7]; \n\t"
+		 "mov.b32 i,{a,b,c,d}; \n\t"
+         "mov.b32 j,{e,f,g,h}; \n\t"
+		 "mov.b64 %0,{i,j}; }\n\t"
+		: "=l"(test) : __LDG_PTR(ptr));
+	return (test);
+}
+
+
+static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr)
+{
+	uint32_t zero = 0;
+	uint64_t test;
+	asm volatile ("{\n\t"
+		".reg .u8 a,b,c,d; \n\t"
+		".reg .u32 i; \n\t"
+		"ld.global.nc.u8 a,[%1]; \n\t"
+		"ld.global.nc.u8 b,[%1+1]; \n\t"
+		"ld.global.nc.u8 c,[%1+2]; \n\t"
+		"ld.global.nc.u8 d,[%1+3]; \n\t"
+		"mov.b32 i,{a,b,c,d}; \n\t"
+		"mov.b64 %0,{i,%1}; }\n\t"
+		: "=l"(test) : __LDG_PTR(ptr), "r"(zero));
+	return (test);
+}
+
+
+
+static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr)
+{
+	uint32_t test;
+	asm("{\n\t"
+		".reg .u8 e,b,c,d; \n\t"
+		"ld.global.nc.u8 e,[%1]; \n\t"
+		"ld.global.nc.u8 b,[%1+1]; \n\t"
+		"ld.global.nc.u8 c,[%1+2]; \n\t"
+		"ld.global.nc.u8 d,[%1+3]; \n\t"
+		"mov.b32 %0,{e,b,c,d}; }\n\t"
+		: "=r"(test) : __LDG_PTR(ptr));
+	return (test);
+}
+
+
+
+
+static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
+{
+	uint8 vec;
+	vec.s0 = cuda_swab32(buf[0].s0);
+	vec.s1 = cuda_swab32(buf[0].s1);
+	vec.s2 = cuda_swab32(buf[0].s2);
+	vec.s3 = cuda_swab32(buf[0].s3);
+	vec.s4 = cuda_swab32(buf[0].s4);
+	vec.s5 = cuda_swab32(buf[0].s5);
+	vec.s6 = cuda_swab32(buf[0].s6);
+	vec.s7 = cuda_swab32(buf[0].s7);
+	return vec;
+}
+
+static __forceinline__ __device__ uint16 swapvec(const uint16 *buf)
+{
+	uint16 vec;
+	vec.s0 = cuda_swab32(buf[0].s0);
+	vec.s1 = cuda_swab32(buf[0].s1);
+	vec.s2 = cuda_swab32(buf[0].s2);
+	vec.s3 = cuda_swab32(buf[0].s3);
+	vec.s4 = cuda_swab32(buf[0].s4);
+	vec.s5 = cuda_swab32(buf[0].s5);
+	vec.s6 = cuda_swab32(buf[0].s6);
+	vec.s7 = cuda_swab32(buf[0].s7);
+	vec.s8 = cuda_swab32(buf[0].s8);
+	vec.s9 = cuda_swab32(buf[0].s9);
+	vec.sa = cuda_swab32(buf[0].sa);
+	vec.sb = cuda_swab32(buf[0].sb);
+	vec.sc = cuda_swab32(buf[0].sc);
+	vec.sd = cuda_swab32(buf[0].sd);
+	vec.se = cuda_swab32(buf[0].se);
+	vec.sf = cuda_swab32(buf[0].sf);
+	return vec;
+}
+#endif // #ifndef CUDA_VECTOR_H
--- a/miner.h
+++ b/miner.h
@ -334,6 +334,10 @@ extern int scanhash_pentablake(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);

+extern int scanhash_pluck(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 extern int scanhash_qubit(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
--- a/pluck/cuda_pluck.cu
+++ b/pluck/cuda_pluck.cu
@ -0,0 +1,574 @@
+/*
+ * "pluck" kernel implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2015  djm34
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author djm34
+ * @author tpruvot
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+#include "cuda_vector.h"
+
+uint32_t *d_PlNonce[MAX_GPUS];
+
+__device__  uint8_t *  hashbuffer;
+__constant__  uint32_t pTarget[8];
+__constant__  uint32_t c_data[20];
+
+#define HASH_MEMORY_8bit 131072
+#define HASH_MEMORY_32bit 32768
+#define HASH_MEMORY 4096
+
+static __constant__  uint32_t H256[8] = {
+	0x6A09E667, 0xBB67AE85, 0x3C6EF372,
+	0xA54FF53A, 0x510E527F, 0x9B05688C,
+	0x1F83D9AB, 0x5BE0CD19
+};
+
+static  __constant__  uint32_t Ksha[64] = {
+	0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+	0x3956C25B, 0x59F111F1,	0x923F82A4, 0xAB1C5ED5,
+	0xD807AA98, 0x12835B01,	0x243185BE, 0x550C7DC3,
+	0x72BE5D74, 0x80DEB1FE,	0x9BDC06A7, 0xC19BF174,
+	0xE49B69C1, 0xEFBE4786,	0x0FC19DC6, 0x240CA1CC,
+	0x2DE92C6F, 0x4A7484AA,	0x5CB0A9DC, 0x76F988DA,
+	0x983E5152, 0xA831C66D,	0xB00327C8, 0xBF597FC7,
+	0xC6E00BF3, 0xD5A79147,	0x06CA6351, 0x14292967,
+	0x27B70A85, 0x2E1B2138,	0x4D2C6DFC, 0x53380D13,
+	0x650A7354, 0x766A0ABB,	0x81C2C92E, 0x92722C85,
+	0xA2BFE8A1, 0xA81A664B,	0xC24B8B70, 0xC76C51A3,
+	0xD192E819, 0xD6990624,	0xF40E3585, 0x106AA070,
+	0x19A4C116, 0x1E376C08,	0x2748774C, 0x34B0BCB5,
+	0x391C0CB3, 0x4ED8AA4A,	0x5B9CCA4F, 0x682E6FF3,
+	0x748F82EE, 0x78A5636F,	0x84C87814, 0x8CC70208,
+	0x90BEFFFA, 0xA4506CEB,	0xBEF9A3F7, 0xC67178F2
+};
+
+
+#define SALSA(a,b,c,d) { \
+    t = a+d; b^=rotate(t,  7); \
+    t = b+a; c^=rotate(t,  9); \
+    t = c+b; d^=rotate(t, 13); \
+    t = d+c; a^=rotate(t, 18); \
+}
+
+#define SALSA_CORE(state) { \
+	SALSA(state.s0,state.s4,state.s8,state.sc); \
+	SALSA(state.s5,state.s9,state.sd,state.s1); \
+	SALSA(state.sa,state.se,state.s2,state.s6); \
+	SALSA(state.sf,state.s3,state.s7,state.sb); \
+	SALSA(state.s0,state.s1,state.s2,state.s3); \
+	SALSA(state.s5,state.s6,state.s7,state.s4); \
+	SALSA(state.sa,state.sb,state.s8,state.s9); \
+	SALSA(state.sf,state.sc,state.sd,state.se); \
+}
+
+static __device__ __forceinline__ uint16 xor_salsa8(const uint16 &Bx)
+{
+	uint32_t t;
+	uint16 state = Bx;
+	SALSA_CORE(state);
+	SALSA_CORE(state);
+	SALSA_CORE(state);
+	SALSA_CORE(state);
+	return(state+Bx);
+}
+
+
+// sha256
+
+static __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
+{
+	uint32_t r1 = ROTR32(x, 2);
+	uint32_t r2 = ROTR32(x, 13);
+	uint32_t r3 = ROTR32(x, 22);
+	return xor3b(r1, r2, r3);
+}
+
+static __device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
+{
+	uint32_t r1 = ROTR32(x, 6);
+	uint32_t r2 = ROTR32(x, 11);
+	uint32_t r3 = ROTR32(x, 25);
+	return xor3b(r1, r2, r3);
+}
+
+static __device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
+{
+	uint64_t r1 = ROTR32(x, 7);
+	uint64_t r2 = ROTR32(x, 18);
+	uint64_t r3 = shr_t32(x, 3);
+	return xor3b(r1, r2, r3);
+}
+
+static __device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
+{
+	uint64_t r1 = ROTR32(x, 17);
+	uint64_t r2 = ROTR32(x, 19);
+	uint64_t r3 = shr_t32(x, 10);
+	return xor3b(r1, r2, r3);
+}
+
+static __device__ __forceinline__ void sha2_step1(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
+	const uint32_t f, const uint32_t g, uint32_t &h, const uint32_t in, const uint32_t Kshared)
+{
+	uint32_t t1, t2;
+	uint32_t vxandx = xandx(e, f, g);
+	uint32_t bsg21 = bsg2_1(e);
+	uint32_t bsg20 = bsg2_0(a);
+	uint32_t andorv = andor32(a, b, c);
+
+	t1 = h + bsg21 + vxandx + Kshared + in;
+	t2 = bsg20 + andorv;
+	d = d + t1;
+	h = t1 + t2;
+}
+
+static __device__ __forceinline__ void sha2_step2(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
+	const uint32_t f, const uint32_t g, uint32_t &h, uint32_t* in, const uint32_t pc, const uint32_t Kshared)
+{
+	uint32_t t1, t2;
+
+	int pcidx1 = (pc - 2) & 0xF;
+	int pcidx2 = (pc - 7) & 0xF;
+	int pcidx3 = (pc - 15) & 0xF;
+	uint32_t inx0 = in[pc];
+	uint32_t inx1 = in[pcidx1];
+	uint32_t inx2 = in[pcidx2];
+	uint32_t inx3 = in[pcidx3];
+
+	uint32_t ssg21 = ssg2_1(inx1);
+	uint32_t ssg20 = ssg2_0(inx3);
+	uint32_t vxandx = xandx(e, f, g);
+	uint32_t bsg21 = bsg2_1(e);
+	uint32_t bsg20 = bsg2_0(a);
+	uint32_t andorv = andor32(a, b, c);
+
+	in[pc] = ssg21 + inx2 + ssg20 + inx0;
+
+	t1 = h + bsg21 + vxandx + Kshared + in[pc];
+	t2 = bsg20 + andorv;
+	d = d + t1;
+	h = t1 + t2;
+}
+
+static __device__ __forceinline__
+void sha2_round_body(uint32_t* in, uint32_t* r)
+{
+	uint32_t a = r[0];
+	uint32_t b = r[1];
+	uint32_t c = r[2];
+	uint32_t d = r[3];
+	uint32_t e = r[4];
+	uint32_t f = r[5];
+	uint32_t g = r[6];
+	uint32_t h = r[7];
+
+	sha2_step1(a, b, c, d, e, f, g, h, in[0], Ksha[0]);
+	sha2_step1(h, a, b, c, d, e, f, g, in[1], Ksha[1]);
+	sha2_step1(g, h, a, b, c, d, e, f, in[2], Ksha[2]);
+	sha2_step1(f, g, h, a, b, c, d, e, in[3], Ksha[3]);
+	sha2_step1(e, f, g, h, a, b, c, d, in[4], Ksha[4]);
+	sha2_step1(d, e, f, g, h, a, b, c, in[5], Ksha[5]);
+	sha2_step1(c, d, e, f, g, h, a, b, in[6], Ksha[6]);
+	sha2_step1(b, c, d, e, f, g, h, a, in[7], Ksha[7]);
+	sha2_step1(a, b, c, d, e, f, g, h, in[8], Ksha[8]);
+	sha2_step1(h, a, b, c, d, e, f, g, in[9], Ksha[9]);
+	sha2_step1(g, h, a, b, c, d, e, f, in[10], Ksha[10]);
+	sha2_step1(f, g, h, a, b, c, d, e, in[11], Ksha[11]);
+	sha2_step1(e, f, g, h, a, b, c, d, in[12], Ksha[12]);
+	sha2_step1(d, e, f, g, h, a, b, c, in[13], Ksha[13]);
+	sha2_step1(c, d, e, f, g, h, a, b, in[14], Ksha[14]);
+	sha2_step1(b, c, d, e, f, g, h, a, in[15], Ksha[15]);
+
+	#pragma unroll 3
+	for (int i = 0; i<3; i++) {
+
+		sha2_step2(a, b, c, d, e, f, g, h, in, 0, Ksha[16 + 16 * i]);
+		sha2_step2(h, a, b, c, d, e, f, g, in, 1, Ksha[17 + 16 * i]);
+		sha2_step2(g, h, a, b, c, d, e, f, in, 2, Ksha[18 + 16 * i]);
+		sha2_step2(f, g, h, a, b, c, d, e, in, 3, Ksha[19 + 16 * i]);
+		sha2_step2(e, f, g, h, a, b, c, d, in, 4, Ksha[20 + 16 * i]);
+		sha2_step2(d, e, f, g, h, a, b, c, in, 5, Ksha[21 + 16 * i]);
+		sha2_step2(c, d, e, f, g, h, a, b, in, 6, Ksha[22 + 16 * i]);
+		sha2_step2(b, c, d, e, f, g, h, a, in, 7, Ksha[23 + 16 * i]);
+		sha2_step2(a, b, c, d, e, f, g, h, in, 8, Ksha[24 + 16 * i]);
+		sha2_step2(h, a, b, c, d, e, f, g, in, 9, Ksha[25 + 16 * i]);
+		sha2_step2(g, h, a, b, c, d, e, f, in, 10, Ksha[26 + 16 * i]);
+		sha2_step2(f, g, h, a, b, c, d, e, in, 11, Ksha[27 + 16 * i]);
+		sha2_step2(e, f, g, h, a, b, c, d, in, 12, Ksha[28 + 16 * i]);
+		sha2_step2(d, e, f, g, h, a, b, c, in, 13, Ksha[29 + 16 * i]);
+		sha2_step2(c, d, e, f, g, h, a, b, in, 14, Ksha[30 + 16 * i]);
+		sha2_step2(b, c, d, e, f, g, h, a, in, 15, Ksha[31 + 16 * i]);
+
+	}
+
+	r[0] += a;
+	r[1] += b;
+	r[2] += c;
+	r[3] += d;
+	r[4] += e;
+	r[5] += f;
+	r[6] += g;
+	r[7] += h;
+}
+
+
+static __device__ __forceinline__ uint8 sha256_64(uint32_t *data)
+{
+	uint32_t __align__(64) in[16];
+	uint32_t __align__(32) buf[8];
+
+	((uint16 *)in)[0] = swapvec((uint16*)data);
+
+	((uint8*)buf)[0] = ((uint8*)H256)[0];
+
+	sha2_round_body(in, buf);
+
+	#pragma unroll 14
+	for (int i = 0; i<14; i++) { in[i + 1] = 0; }
+
+	in[0] = 0x80000000;
+	in[15] = 0x200;
+
+	sha2_round_body(in, buf);
+	return swapvec((uint8*)buf);
+}
+
+static __device__ __forceinline__ uint8 sha256_80(uint32_t nonce)
+{
+//	uint32_t in[16], buf[8];
+	uint32_t __align__(64) in[16];
+	uint32_t __align__(32) buf[8];
+
+	((uint16 *)in)[0] = swapvec((uint16*)c_data);
+	((uint8*)buf)[0] = ((uint8*)H256)[0];
+
+	sha2_round_body(in, buf);
+
+	#pragma unroll 3
+	for (int i = 0; i<3; i++) { in[i] = cuda_swab32(c_data[i + 16]); }
+
+//	in[3] = cuda_swab32(nonce);
+    in[3] = nonce;
+	in[4] = 0x80000000;
+	in[15] = 0x280;
+
+	#pragma unroll
+	for (int i = 5; i<15; i++) { in[i] = 0; }
+
+	sha2_round_body(in, buf);
+	return swapvec((uint8*)buf);
+}
+
+#define SHIFT 32 * 1024 * 4
+
+__global__ __launch_bounds__(256, 1)
+void pluck_gpu_hash0_v50(uint32_t threads, uint32_t startNonce)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nonce = startNonce + thread;
+
+		uint32_t shift = SHIFT * thread;
+		((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
+		((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
+		for (int i = 2; i < 5; i++)
+		{
+			uint32_t randmax = i * 32 - 4;
+			uint32_t randseed[16];
+			uint32_t randbuffer[16];
+			uint32_t joint[16];
+			uint8 Buffbuffer[2];
+
+			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
+			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
+
+			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+//			((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
+			((uint8*)joint)[0] = ((uint8*)randseed)[1];
+
+			#pragma unroll
+			for (int j = 0; j < 8; j++) {
+				uint32_t rand = randbuffer[j] % (randmax - 32);
+				joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
+			}
+
+			uint8 truc = sha256_64(joint);
+			((uint8*)(hashbuffer + shift))[i] = truc;
+			((uint8*)randseed)[0] = ((uint8*)joint)[0];
+			((uint8*)randseed)[1] = truc;
+
+			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+			for (int j = 0; j < 32; j += 2)
+			{
+				uint32_t rand = randbuffer[j / 2] % randmax;
+				(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
+				(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
+				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
+				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
+			}
+
+		} // main loop
+	}
+}
+
+__global__ __launch_bounds__(256, 1)
+void pluck_gpu_hash_v50(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
+{
+
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nonce = startNonce + thread;
+
+		uint32_t shift = SHIFT * thread;
+
+		for (int i = 5; i < HASH_MEMORY - 1; i++)
+		{
+			uint32_t randmax = i*32-4;
+			uint32_t randseed[16];
+			uint32_t randbuffer[16];
+			uint32_t joint[16];
+			uint8 Buffbuffer[2];
+
+			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32*i-64]);
+			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32*i-32]);
+
+
+			Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32*i - 128]);
+			Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32*i - 96]);
+
+			((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
+			((uint16*)randbuffer)[0]= xor_salsa8(((uint16*)randseed)[0]);
+			((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i-1)<<5]);
+
+			#pragma unroll
+			for (int j = 0; j < 8; j++) {
+				uint32_t rand = randbuffer[j] % (randmax - 32);
+				joint[j+8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
+			}
+
+			uint8 truc =  sha256_64(joint);
+			((uint8*)(hashbuffer + shift))[i] = truc;
+			((uint8*)randseed)[0] = ((uint8*)joint)[0];
+			((uint8*)randseed)[1] = truc;
+
+			((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
+			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+			for (int j = 0; j < 32; j += 2)
+			{
+				uint32_t rand = randbuffer[j / 2] % randmax;
+
+				(hashbuffer+shift)[rand] =       __ldg(&(hashbuffer+shift)[randmax+j]);
+				(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
+				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
+				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
+			}
+
+		} // main loop
+
+		uint32_t outbuf =  __ldgtoint(&(hashbuffer + shift)[28]);
+
+		if (outbuf <= pTarget[7]) {
+			nonceVector[0] = nonce;
+		}
+
+	}
+}
+
+__global__ __launch_bounds__(128, 3)
+void pluck_gpu_hash0(uint32_t threads, uint32_t startNonce)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nonce = startNonce + thread;
+
+		uint32_t shift = SHIFT * thread;
+		((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
+		((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
+		for (int i = 2; i < 5; i++)
+		{
+			uint32_t randmax = i * 32 - 4;
+			uint32_t randseed[16];
+			uint32_t randbuffer[16];
+			uint32_t joint[16];
+			uint8 Buffbuffer[2];
+
+			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
+			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
+
+			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+//			((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
+			((uint8*)joint)[0] = ((uint8*)randseed)[1];
+
+			#pragma unroll
+			for (int j = 0; j < 8; j++) {
+				uint32_t rand = randbuffer[j] % (randmax - 32);
+				joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
+			}
+
+			uint8 truc = sha256_64(joint);
+			((uint8*)(hashbuffer + shift))[i] = truc;
+			((uint8*)randseed)[0] = ((uint8*)joint)[0];
+			((uint8*)randseed)[1] = truc;
+
+			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+			for (int j = 0; j < 32; j += 2)
+			{
+				uint32_t rand = randbuffer[j / 2] % randmax;
+				(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
+				(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
+				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
+				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
+			}
+
+		} // main loop
+
+	}
+}
+
+__global__ __launch_bounds__(128, 3)
+void pluck_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nonce = startNonce + thread;
+
+		uint32_t shift = SHIFT * thread;
+
+		for (int i = 5; i < HASH_MEMORY - 1; i++)
+		{
+			uint32_t randmax = i * 32 - 4;
+			uint32_t randseed[16];
+			uint32_t randbuffer[16];
+			uint32_t joint[16];
+			uint8 Buffbuffer[2];
+
+			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
+			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
+
+
+			Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32 * i - 128]);
+			Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32 * i - 96]);
+			((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
+
+			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+			((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
+
+			#pragma unroll
+			for (int j = 0; j < 8; j++)
+			{
+				uint32_t rand = randbuffer[j] % (randmax - 32);
+				joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
+			}
+
+			uint8 truc = sha256_64(joint);
+			((uint8*)(hashbuffer + shift))[i] = truc;
+			((uint8*)randseed)[0] = ((uint8*)joint)[0];
+			((uint8*)randseed)[1] = truc;
+
+
+			((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
+			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+			for (int j = 0; j < 32; j += 2)
+			{
+				uint32_t rand = randbuffer[j / 2] % randmax;
+
+				(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
+				(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
+				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
+				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
+			}
+
+		} // main loop
+
+		uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]);
+
+		if (outbuf <= pTarget[7]) {
+			nonceVector[0] = nonce;
+		}
+
+	}
+}
+
+void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t* hash)
+{
+	cudaMemcpyToSymbol(hashbuffer, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
+	cudaMalloc(&d_PlNonce[thr_id], sizeof(uint32_t));
+}
+
+__host__
+uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce,  int order)
+{
+	uint32_t result[8] = {0xffffffff};
+	cudaMemset(d_PlNonce[thr_id], 0xffffffff, sizeof(uint32_t));
+
+	const uint32_t threadsperblock = 128;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+	dim3 grid50((threads + 256 - 1) / 256);
+	dim3 block50(256);
+
+	if (device_sm[device_map[thr_id]] >= 500) {
+		pluck_gpu_hash0_v50 <<< grid50, block50 >>>(threads, startNounce);
+		pluck_gpu_hash_v50  <<< grid50, block50 >>>(threads, startNounce, d_PlNonce[thr_id]);
+	} else {
+		pluck_gpu_hash0 <<< grid, block >>>(threads, startNounce);
+		pluck_gpu_hash <<< grid, block >>>(threads, startNounce, d_PlNonce[thr_id]);
+	}
+
+	MyStreamSynchronize(NULL, order, thr_id);
+	cudaMemcpy(&result[thr_id], d_PlNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+	return result[thr_id];
+}
+
+__host__
+void pluck_setBlockTarget(const void *pdata, const void *ptarget)
+{
+	unsigned char PaddedMessage[80];
+	memcpy(PaddedMessage, pdata, 80);
+
+	cudaMemcpyToSymbol(c_data, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(pTarget, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+}
--- a/pluck/pluck.cu
+++ b/pluck/pluck.cu
@ -0,0 +1,270 @@
+/* Based on djm code */
+
+extern "C" {
+#include "miner.h"
+}
+
+#include <stdint.h>
+
+static uint32_t *d_hash[MAX_GPUS] ;
+
+extern void pluck_setBlockTarget(const void* data, const void *ptarget);
+extern void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t *d_outputHash);
+extern uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order);
+
+extern float tp_coef[MAX_GPUS];
+
+#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
+//note, this is 64 bytes
+static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
+{
+#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
+	uint32_t x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11, x12, x13, x14, x15;
+	int i;
+
+	x00 = (B[0] ^= Bx[0]);
+	x01 = (B[1] ^= Bx[1]);
+	x02 = (B[2] ^= Bx[2]);
+	x03 = (B[3] ^= Bx[3]);
+	x04 = (B[4] ^= Bx[4]);
+	x05 = (B[5] ^= Bx[5]);
+	x06 = (B[6] ^= Bx[6]);
+	x07 = (B[7] ^= Bx[7]);
+	x08 = (B[8] ^= Bx[8]);
+	x09 = (B[9] ^= Bx[9]);
+	x10 = (B[10] ^= Bx[10]);
+	x11 = (B[11] ^= Bx[11]);
+	x12 = (B[12] ^= Bx[12]);
+	x13 = (B[13] ^= Bx[13]);
+	x14 = (B[14] ^= Bx[14]);
+	x15 = (B[15] ^= Bx[15]);
+	for (i = 0; i < 8; i += 2) {
+		/* Operate on columns. */
+		x04 ^= ROTL(x00 + x12, 7);  x09 ^= ROTL(x05 + x01, 7);
+		x14 ^= ROTL(x10 + x06, 7);  x03 ^= ROTL(x15 + x11, 7);
+
+		x08 ^= ROTL(x04 + x00, 9);  x13 ^= ROTL(x09 + x05, 9);
+		x02 ^= ROTL(x14 + x10, 9);  x07 ^= ROTL(x03 + x15, 9);
+
+		x12 ^= ROTL(x08 + x04, 13);  x01 ^= ROTL(x13 + x09, 13);
+		x06 ^= ROTL(x02 + x14, 13);  x11 ^= ROTL(x07 + x03, 13);
+
+		x00 ^= ROTL(x12 + x08, 18);  x05 ^= ROTL(x01 + x13, 18);
+		x10 ^= ROTL(x06 + x02, 18);  x15 ^= ROTL(x11 + x07, 18);
+
+		/* Operate on rows. */
+		x01 ^= ROTL(x00 + x03, 7);  x06 ^= ROTL(x05 + x04, 7);
+		x11 ^= ROTL(x10 + x09, 7);  x12 ^= ROTL(x15 + x14, 7);
+
+		x02 ^= ROTL(x01 + x00, 9);  x07 ^= ROTL(x06 + x05, 9);
+		x08 ^= ROTL(x11 + x10, 9);  x13 ^= ROTL(x12 + x15, 9);
+
+		x03 ^= ROTL(x02 + x01, 13);  x04 ^= ROTL(x07 + x06, 13);
+		x09 ^= ROTL(x08 + x11, 13);  x14 ^= ROTL(x13 + x12, 13);
+
+		x00 ^= ROTL(x03 + x02, 18);  x05 ^= ROTL(x04 + x07, 18);
+		x10 ^= ROTL(x09 + x08, 18);  x15 ^= ROTL(x14 + x13, 18);
+	}
+	B[0] += x00;
+	B[1] += x01;
+	B[2] += x02;
+	B[3] += x03;
+	B[4] += x04;
+	B[5] += x05;
+	B[6] += x06;
+	B[7] += x07;
+	B[8] += x08;
+	B[9] += x09;
+	B[10] += x10;
+	B[11] += x11;
+	B[12] += x12;
+	B[13] += x13;
+	B[14] += x14;
+	B[15] += x15;
+#undef ROTL
+}
+
+static void sha256_hash(unsigned char *hash, const unsigned char *data, int len)
+{
+	uint32_t S[16], T[16];
+	int i, r;
+
+	sha256_init(S);
+	for (r = len; r > -9; r -= 64) {
+		if (r < 64)
+			memset(T, 0, 64);
+		memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
+		if (r >= 0 && r < 64)
+			((unsigned char *)T)[r] = 0x80;
+		for (i = 0; i < 16; i++)
+			T[i] = be32dec(T + i);
+
+		if (r < 56)
+			T[15] = 8 * len;
+		sha256_transform(S, T, 0);
+	}
+	for (i = 0; i < 8; i++)
+		be32enc((uint32_t *)hash + i, S[i]);
+}
+
+static void sha256_hash512(unsigned char *hash, const unsigned char *data)
+{
+	uint32_t S[16], T[16];
+	int i;
+
+	sha256_init(S);
+
+	memcpy(T, data, 64);
+	for (i = 0; i < 16; i++)
+		T[i] = be32dec(T + i);
+	sha256_transform(S, T, 0);
+
+	memset(T, 0, 64);
+	//memcpy(T, data + 64, 0);
+	((unsigned char *)T)[0] = 0x80;
+	for (i = 0; i < 16; i++)
+		T[i] = be32dec(T + i);
+	T[15] = 8 * 64;
+	sha256_transform(S, T, 0);
+
+	for (i = 0; i < 8; i++)
+		be32enc((uint32_t *)hash + i, S[i]);
+}
+
+void pluckhash(uint32_t *hash, uint32_t *input)
+{
+
+	uint32_t data[20];
+	//uint32_t midstate[8];
+
+	const int HASH_MEMORY = 128 * 1024;
+	uint8_t * scratchbuf = (uint8_t*)malloc(HASH_MEMORY);
+
+	for (int k = 0; k<20; k++) { data[k] = input[k]; }
+
+		uint8_t *hashbuffer = scratchbuf; //don't allocate this on stack, since it's huge..
+		int size = HASH_MEMORY;
+		memset(hashbuffer, 0, 64);
+
+		sha256_hash(&hashbuffer[0], (uint8_t*)data, 80);
+		for (int i = 64; i < size - 32; i += 32)
+		{
+			//i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area
+			int randmax = i - 4; //we could use size here, but then it's probable to use 0 as the value in most cases
+			uint32_t joint[16];
+			uint32_t randbuffer[16];
+
+			uint32_t randseed[16];
+			memcpy(randseed, &hashbuffer[i - 64], 64);
+			if (i>128)
+			{
+				memcpy(randbuffer, &hashbuffer[i - 128], 64);
+			}
+			else
+			{
+				memset(&randbuffer, 0, 64);
+			}
+
+			xor_salsa8(randbuffer, randseed);
+
+			memcpy(joint, &hashbuffer[i - 32], 32);
+			//use the last hash value as the seed
+			for (int j = 32; j < 64; j += 4)
+			{
+				uint32_t rand = randbuffer[(j - 32) / 4] % (randmax - 32); //randmax - 32 as otherwise we go beyond memory that's already been written to
+				joint[j / 4] = *((uint32_t*)&hashbuffer[rand]);
+			}
+			sha256_hash512(&hashbuffer[i], (uint8_t*)joint);
+//			for (int k = 0; k<8; k++) { printf("sha hashbuffer %d %08x\n", k, ((uint32_t*)(hashbuffer+i))[k]); }
+			memcpy(randseed, &hashbuffer[i - 32], 64); //use last hash value and previous hash value(post-mixing)
+			if (i>128)
+			{
+				memcpy(randbuffer, &hashbuffer[i - 128], 64);
+			}
+			else
+			{
+				memset(randbuffer, 0, 64);
+			}
+			xor_salsa8(randbuffer, randseed);
+			for (int j = 0; j < 32; j += 2)
+			{
+				uint32_t rand = randbuffer[j / 2] % randmax;
+				*((uint32_t*)&hashbuffer[rand]) = *((uint32_t*)&hashbuffer[j + i - 4]);
+			}
+		}
+
+//		for (int k = 0; k<8; k++) { printf("cpu final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }
+
+		//note: off-by-one error is likely here...
+/*
+		for (int i = size - 64 - 1; i >= 64; i -= 64)
+		{
+			sha256_hash512(&hashbuffer[i - 64], &hashbuffer[i]);
+		}
+
+		for (int k = 0; k<8; k++) { printf("cpu after of by one final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }
+*/
+		memcpy((unsigned char*)hash, hashbuffer, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
+{
+	const uint32_t first_nonce = pdata[19];
+	uint32_t endiandata[20];
+
+	int intensity = 18; /* beware > 20 could work and create diff problems later */
+	uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
+	// divide by 128 for this algo which require a lot of memory
+	throughput = throughput / 128 - 256;
+	throughput = min(throughput, max_nonce - first_nonce + 1);
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x0000ff;
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+		//cudaDeviceReset();
+		//cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+
+		cudaMalloc(&d_hash[thr_id], 32 * 1024 * sizeof(uint32_t) * throughput);
+
+		pluck_cpu_init(thr_id, throughput, d_hash[thr_id]);
+		init[thr_id] = true;
+	}
+
+
+	for (int k = 0; k < 20; k++)
+		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+
+	pluck_setBlockTarget(endiandata,ptarget);
+
+	do {
+		uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0);
+		if (foundNonce != UINT32_MAX)
+		{
+//			const uint32_t Htarg = ptarget[7];
+//			uint32_t vhash64[8];
+//			be32enc(&endiandata[19], foundNonce);
+//			pluckhash(vhash64,endiandata);
+//			printf("target %08x vhash64 %08x", ptarget[7], vhash64[7]);
+//			if (vhash64[7] <= Htarg) { // && fulltest(vhash64, ptarget)) {
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				pdata[19] = foundNonce;
+				return 1;
+//			} else {
+//				applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
+//			}
+		}
+
+		pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+	return 0;
+}