Browse Source

pluck: adaptation from djm repo

The CPU-side validation check remains to be done...

Throughput for this algorithm is divided by 128 to keep the same kind of intensity values (default 18.0).
2upstream
Tanguy Pruvot 10 years ago
parent
commit
7939dce0aa
  1. 1
      Makefile.am
  2. 16
      ccminer.cpp
  3. 3
      ccminer.vcxproj
  4. 9
      ccminer.vcxproj.filters
  5. 45
      cuda_helper.h
  6. 245
      cuda_vector.h
  7. 4
      miner.h
  8. 574
      pluck/cuda_pluck.cu
  9. 270
      pluck/pluck.cu

1
Makefile.am

@ -46,6 +46,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \ sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
pluck/pluck.cu pluck/cuda_pluck.cu \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/doom.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/doom.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \

16
ccminer.cpp

@ -97,6 +97,7 @@ enum sha_algos {
ALGO_MYR_GR, ALGO_MYR_GR,
ALGO_NIST5, ALGO_NIST5,
ALGO_PENTABLAKE, ALGO_PENTABLAKE,
ALGO_PLUCK,
ALGO_QUARK, ALGO_QUARK,
ALGO_QUBIT, ALGO_QUBIT,
ALGO_S3, ALGO_S3,
@ -128,6 +129,7 @@ static const char *algo_names[] = {
"myr-gr", "myr-gr",
"nist5", "nist5",
"penta", "penta",
"pluck",
"quark", "quark",
"qubit", "qubit",
"s3", "s3",
@ -237,6 +239,7 @@ Options:\n\
myr-gr Myriad-Groestl\n\ myr-gr Myriad-Groestl\n\
nist5 NIST5 (TalkCoin)\n\ nist5 NIST5 (TalkCoin)\n\
penta Pentablake hash (5x Blake 512)\n\ penta Pentablake hash (5x Blake 512)\n\
pluck SupCoin\n\
quark Quark\n\ quark Quark\n\
qubit Qubit\n\ qubit Qubit\n\
s3 S3 (1Coin)\n\ s3 S3 (1Coin)\n\
@ -1059,6 +1062,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
switch (opt_algo) { switch (opt_algo) {
case ALGO_JACKPOT: case ALGO_JACKPOT:
case ALGO_PLUCK:
diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
break; break;
case ALGO_DMD_GR: case ALGO_DMD_GR:
@ -1272,6 +1276,9 @@ static void *miner_thread(void *userdata)
case ALGO_LYRA2: case ALGO_LYRA2:
minmax = 0x100000; minmax = 0x100000;
break; break;
case ALGO_PLUCK:
minmax = 0x2000;
break;
} }
max64 = max(minmax-1, max64); max64 = max(minmax-1, max64);
} }
@ -1397,6 +1404,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done); max_nonce, &hashes_done);
break; break;
case ALGO_PLUCK:
rc = scanhash_pluck(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_S3: case ALGO_S3:
rc = scanhash_s3(thr_id, work.data, work.target, rc = scanhash_s3(thr_id, work.data, work.target,
max_nonce, &hashes_done); max_nonce, &hashes_done);
@ -2237,8 +2249,8 @@ int main(int argc, char *argv[])
printf(" Built with the nVidia CUDA SDK 6.5\n\n"); printf(" Built with the nVidia CUDA SDK 6.5\n\n");
#endif #endif
printf(" Originally based on cudaminer by Christian Buchner and Christian H.,\n"); printf(" Originally based on cudaminer by Christian Buchner and Christian H.,\n");
printf(" Include some of djm34 additions and sp optimisations\n"); printf(" Include some work of djm34, sp, tsiv and klausT\n\n");
printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n"); printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n");
rpc_user = strdup(""); rpc_user = strdup("");
rpc_pass = strdup(""); rpc_pass = strdup("");

3
ccminer.vcxproj

@ -313,6 +313,7 @@
<ClInclude Include="cpuminer-config.h" /> <ClInclude Include="cpuminer-config.h" />
<ClInclude Include="cuda_groestlcoin.h" /> <ClInclude Include="cuda_groestlcoin.h" />
<ClInclude Include="cuda_helper.h" /> <ClInclude Include="cuda_helper.h" />
<ClInclude Include="cuda_vector.h" />
<ClInclude Include="elist.h" /> <ClInclude Include="elist.h" />
<ClInclude Include="heavy\heavy.h" /> <ClInclude Include="heavy\heavy.h" />
<ClInclude Include="hefty1.h" /> <ClInclude Include="hefty1.h" />
@ -416,6 +417,8 @@
<AdditionalOptions Condition="'$(Configuration)'=='Release'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Release'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Debug'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile> </CudaCompile>
<CudaCompile Include="pluck\pluck.cu" />
<CudaCompile Include="pluck\cuda_pluck.cu" />
<CudaCompile Include="quark\cuda_quark_groestl512.cu"> <CudaCompile Include="quark\cuda_quark_groestl512.cu">
</CudaCompile> </CudaCompile>
<CudaCompile Include="quark\cuda_quark_keccak512.cu"> <CudaCompile Include="quark\cuda_quark_keccak512.cu">

9
ccminer.vcxproj.filters

@ -308,6 +308,9 @@
<ClInclude Include="cuda_helper.h"> <ClInclude Include="cuda_helper.h">
<Filter>Header Files\CUDA</Filter> <Filter>Header Files\CUDA</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="cuda_vector.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
<ClInclude Include="sph\sph_hamsi.h"> <ClInclude Include="sph\sph_hamsi.h">
<Filter>Header Files\sph</Filter> <Filter>Header Files\sph</Filter>
</ClInclude> </ClInclude>
@ -394,6 +397,12 @@
<CudaCompile Include="JHA\cuda_jha_compactionTest.cu"> <CudaCompile Include="JHA\cuda_jha_compactionTest.cu">
<Filter>Source Files\CUDA\JHA</Filter> <Filter>Source Files\CUDA\JHA</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="pluck\pluck.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="pluck\cuda_pluck.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="quark\cuda_jh512.cu"> <CudaCompile Include="quark\cuda_jh512.cu">
<Filter>Source Files\CUDA\quark</Filter> <Filter>Source Files\CUDA\quark</Filter>
</CudaCompile> </CudaCompile>

45
cuda_helper.h

@ -57,11 +57,13 @@ extern const uint3 threadIdx;
#endif #endif
#if __CUDA_ARCH__ < 320 #if __CUDA_ARCH__ < 320
// Kepler (Compute 3.0) // Host and Compute 3.0
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#else #else
// Kepler (Compute 3.5, 5.0) // Compute 3.2+
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
#endif #endif
__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI) __device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
@ -285,6 +287,45 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
return result; return result;
} }
// 32-bit device asm helpers used by the pluck kernels.

// SHA-256 majority-style combine: returns (a & b) | ((a | b) & c),
// which is algebraically equal to the standard Maj(a,b,c) function.
__device__ __forceinline__
uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) {
uint32_t result;
asm("{ .reg .u32 m,n,o;\n\t"
"and.b32 m, %1, %2;\n\t"
" or.b32 n, %1, %2;\n\t"
"and.b32 o, n, %3;\n\t"
" or.b32 %0, m, o ;\n\t"
"}\n\t"
: "=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
}
// Three-input XOR: returns a ^ b ^ c in a single PTX sequence.
__device__ __forceinline__
uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) {
uint32_t result;
asm("{ .reg .u32 t1;\n\t"
"xor.b32 t1, %2, %3;\n\t"
"xor.b32 %0, %1, t1;\n\t"
"}"
: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
return result;
}
// Logical right shift x >> n via explicit PTX shr.b32
// (PTX clamps shift amounts larger than the register width).
__device__ __forceinline__
uint32_t shr_t32(uint32_t x,uint32_t n) {
uint32_t result;
asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
return result;
}
// Logical left shift x << n via explicit PTX shl.b32
// (PTX clamps shift amounts larger than the register width).
__device__ __forceinline__
uint32_t shl_t32(uint32_t x,uint32_t n) {
uint32_t result;
asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
return result;
}
#ifndef USE_ROT_ASM_OPT #ifndef USE_ROT_ASM_OPT
#define USE_ROT_ASM_OPT 1 #define USE_ROT_ASM_OPT 1
#endif #endif

245
cuda_vector.h

@ -0,0 +1,245 @@
#ifndef CUDA_VECTOR_H
#define CUDA_VECTOR_H
///////////////////////////////////////////////////////////////////////////////////
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
#define __LDG_PTR "l"
#else
#define __LDG_PTR "r"
#endif
#include "cuda_helper.h"
//typedef __device_builtin__ struct ulong16 ulong16;
// 8 x 32-bit vector, 32-byte aligned (OpenCL-style uint8).
typedef struct __align__(32) uint8
{
unsigned int s0, s1, s2, s3, s4, s5, s6, s7;
} uint8;
// 16 x 32-bit vector, 64-byte aligned. The anonymous unions expose each
// half both as named lanes (s0..sf) and as the uint8 members lo / hi.
typedef struct __align__(64) uint16
{
union {
struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;};
uint8 lo;
};
union {
struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;};
uint8 hi;
};
} uint16;
// Build a uint16 from its sixteen scalar lanes s0..sf.
static __inline__ __host__ __device__ uint16 make_uint16(
unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7,
unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf)
{
	uint16 v;
	v.s0 = s0; v.s1 = s1; v.s2 = s2; v.s3 = s3;
	v.s4 = s4; v.s5 = s5; v.s6 = s6; v.s7 = s7;
	v.s8 = s8; v.s9 = s9; v.sa = sa; v.sb = sb;
	v.sc = sc; v.sd = sd; v.se = se; v.sf = sf;
	return v;
}
// Build a uint16 from two uint8 halves (a = low half, b = high half).
static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b)
{
	uint16 v;
	v.lo = a;
	v.hi = b;
	return v;
}
// Build a uint8 from its eight scalar lanes s0..s7.
static __inline__ __host__ __device__ uint8 make_uint8(
unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7)
{
	uint8 v;
	v.s0 = s0; v.s1 = s1; v.s2 = s2; v.s3 = s3;
	v.s4 = s4; v.s5 = s5; v.s6 = s6; v.s7 = s7;
	return v;
}
// Component-wise XOR / ADD operators (and their compound forms) for the
// CUDA builtin vector types and the custom uint8 / uint16 types above.
static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); }
static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); }
static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); }
static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) {
return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7,
a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf);
}
static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) {
return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7,
a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf);
}
static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; }
static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; }
static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; }
static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; }
static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; }
static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; }
static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
// 32-bit rotate-left using the funnel-shift instruction.
// NOTE(review): shf.l.wrap.b32 requires sm_32+ — confirm this header is
// only compiled for such architectures.
static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift)
{
uint32_t ret;
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
return ret;
}
// 32-bit rotate-right using the funnel-shift instruction (sm_32+).
static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift)
{
uint32_t ret;
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
return ret;
}
// Load 32 bytes through the read-only (non-coherent) cache as a uint8.
// The v4.u32 vector loads require ptr to be 16-byte aligned.
static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr)
{
uint8 test;
asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr));
asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr));
return (test);
}
// Load one 32-bit word through the read-only cache (ptr must be 4-byte aligned).
static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr)
{
uint32_t test;
asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr));
return (test);
}
// Load one 64-bit word through the read-only cache (ptr must be 8-byte aligned).
// Fixed: the return type was declared uint32_t, silently truncating the
// loaded 64-bit value to its low word despite the function's name.
static __device__ __inline__ uint64_t __ldgtoint64(const uint8_t *ptr)
{
uint64_t test;
asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr));
return (test);
}
// Load a little-endian 32-bit word from an UNALIGNED address using four
// single-byte read-only-cache loads assembled with mov.b32.
static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr)
{
uint32_t test;
asm volatile ("{\n\t"
".reg .u8 a,b,c,d; \n\t"
"ld.global.nc.u8 a,[%1]; \n\t"
"ld.global.nc.u8 b,[%1+1]; \n\t"
"ld.global.nc.u8 c,[%1+2]; \n\t"
"ld.global.nc.u8 d,[%1+3]; \n\t"
"mov.b32 %0,{a,b,c,d}; }\n\t"
: "=r"(test) : __LDG_PTR(ptr));
return (test);
}
// Load a little-endian 64-bit word from an UNALIGNED address using eight
// single-byte read-only-cache loads assembled into two 32-bit halves.
static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr)
{
uint64_t test;
asm volatile ("{\n\t"
".reg .u8 a,b,c,d,e,f,g,h; \n\t"
".reg .u32 i,j; \n\t"
"ld.global.nc.u8 a,[%1]; \n\t"
"ld.global.nc.u8 b,[%1+1]; \n\t"
"ld.global.nc.u8 c,[%1+2]; \n\t"
"ld.global.nc.u8 d,[%1+3]; \n\t"
"ld.global.nc.u8 e,[%1+4]; \n\t"
"ld.global.nc.u8 f,[%1+5]; \n\t"
"ld.global.nc.u8 g,[%1+6]; \n\t"
"ld.global.nc.u8 h,[%1+7]; \n\t"
"mov.b32 i,{a,b,c,d}; \n\t"
"mov.b32 j,{e,f,g,h}; \n\t"
"mov.b64 %0,{i,j}; }\n\t"
: "=l"(test) : __LDG_PTR(ptr));
return (test);
}
// Load 4 bytes from an unaligned address and zero-extend to 64 bits
// (low word = bytes at ptr, high word = 0).
// Fixed: the high word of the mov.b64 was taken from operand %1 (the
// pointer register) instead of %2 (the zero constant, previously declared
// but never referenced), corrupting the upper 32 bits of the result.
static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr)
{
uint32_t zero = 0;
uint64_t test;
asm volatile ("{\n\t"
".reg .u8 a,b,c,d; \n\t"
".reg .u32 i; \n\t"
"ld.global.nc.u8 a,[%1]; \n\t"
"ld.global.nc.u8 b,[%1+1]; \n\t"
"ld.global.nc.u8 c,[%1+2]; \n\t"
"ld.global.nc.u8 d,[%1+3]; \n\t"
"mov.b32 i,{a,b,c,d}; \n\t"
"mov.b64 %0,{i,%2}; }\n\t"
: "=l"(test) : __LDG_PTR(ptr), "r"(zero));
return (test);
}
// Variant of __ldgtoint_unaligned: same unaligned little-endian 32-bit
// load, but the asm is not marked volatile, so the compiler may reorder
// or common-subexpression-eliminate it.
static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr)
{
uint32_t test;
asm("{\n\t"
".reg .u8 e,b,c,d; \n\t"
"ld.global.nc.u8 e,[%1]; \n\t"
"ld.global.nc.u8 b,[%1+1]; \n\t"
"ld.global.nc.u8 c,[%1+2]; \n\t"
"ld.global.nc.u8 d,[%1+3]; \n\t"
"mov.b32 %0,{e,b,c,d}; }\n\t"
: "=r"(test) : __LDG_PTR(ptr));
return (test);
}
// Byte-swap (endian-reverse) each of the eight 32-bit lanes of *buf.
static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
{
	uint8 r;
	const uint32_t *src = (const uint32_t *)buf;
	uint32_t *dst = (uint32_t *)&r;
	#pragma unroll 8
	for (int k = 0; k < 8; k++)
		dst[k] = cuda_swab32(src[k]);
	return r;
}
// Byte-swap (endian-reverse) each of the sixteen 32-bit lanes of *buf.
static __forceinline__ __device__ uint16 swapvec(const uint16 *buf)
{
	uint16 r;
	const uint32_t *src = (const uint32_t *)buf;
	uint32_t *dst = (uint32_t *)&r;
	#pragma unroll 16
	for (int k = 0; k < 16; k++)
		dst[k] = cuda_swab32(src[k]);
	return r;
}
#endif // #ifndef CUDA_VECTOR_H

4
miner.h

@ -334,6 +334,10 @@ extern int scanhash_pentablake(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
extern int scanhash_pluck(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_qubit(int thr_id, uint32_t *pdata, extern int scanhash_qubit(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);

574
pluck/cuda_pluck.cu

@ -0,0 +1,574 @@
/*
* "pluck" kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2015 djm34
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
* @author tpruvot
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include "cuda_helper.h"
#include "cuda_vector.h"
// Per-GPU device pointer holding the winning-nonce result cell.
uint32_t *d_PlNonce[MAX_GPUS];
// Device-side scratch area (SHIFT bytes per thread); the host allocation
// is published to this symbol by pluck_cpu_init().
__device__ uint8_t * hashbuffer;
__constant__ uint32_t pTarget[8];  // target hash (only word 7 is compared)
__constant__ uint32_t c_data[20];  // 80-byte block header as 20 words
#define HASH_MEMORY_8bit 131072    // scratch size in bytes
#define HASH_MEMORY_32bit 32768    // scratch size in 32-bit words
#define HASH_MEMORY 4096           // scratch size in 32-byte hash slots
// SHA-256 initial hash state H0..H7 (FIPS 180-4).
static __constant__ uint32_t H256[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372,
0xA54FF53A, 0x510E527F, 0x9B05688C,
0x1F83D9AB, 0x5BE0CD19
};
// SHA-256 round constants K0..K63 (FIPS 180-4).
static __constant__ uint32_t Ksha[64] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
// One add-rotate-xor quarter-round of the Salsa20 core
// (expects a uint32_t temporary `t` in the enclosing scope).
#define SALSA(a,b,c,d) { \
t = a+d; b^=rotate(t, 7); \
t = b+a; c^=rotate(t, 9); \
t = c+b; d^=rotate(t, 13); \
t = d+c; a^=rotate(t, 18); \
}
// One Salsa20 double round over the 16-word state:
// four column quarter-rounds followed by four row quarter-rounds.
#define SALSA_CORE(state) { \
SALSA(state.s0,state.s4,state.s8,state.sc); \
SALSA(state.s5,state.s9,state.sd,state.s1); \
SALSA(state.sa,state.se,state.s2,state.s6); \
SALSA(state.sf,state.s3,state.s7,state.sb); \
SALSA(state.s0,state.s1,state.s2,state.s3); \
SALSA(state.s5,state.s6,state.s7,state.s4); \
SALSA(state.sa,state.sb,state.s8,state.s9); \
SALSA(state.sf,state.sc,state.sd,state.se); \
}
// Salsa20/8 core: four double rounds (= 8 rounds) over the 16-word input,
// then the feed-forward addition of the original input.
static __device__ __forceinline__ uint16 xor_salsa8(const uint16 &Bx)
{
uint32_t t;
uint16 state = Bx;
SALSA_CORE(state);
SALSA_CORE(state);
SALSA_CORE(state);
SALSA_CORE(state);
return(state+Bx);
}
// sha256
// SHA-256 big sigma 0: rotr(x,2) ^ rotr(x,13) ^ rotr(x,22).
static __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
	return xor3b(ROTR32(x, 2), ROTR32(x, 13), ROTR32(x, 22));
}
// SHA-256 big sigma 1: rotr(x,6) ^ rotr(x,11) ^ rotr(x,25).
static __device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
{
	return xor3b(ROTR32(x, 6), ROTR32(x, 11), ROTR32(x, 25));
}
// SHA-256 small sigma 0: rotr(x,7) ^ rotr(x,18) ^ (x >> 3).
// Fixed: the temporaries were declared uint64_t, forcing a needless
// 64-bit round-trip before xor3b() truncated them back to 32 bits.
static __device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
{
	uint32_t r1 = ROTR32(x, 7);
	uint32_t r2 = ROTR32(x, 18);
	uint32_t r3 = shr_t32(x, 3);
	return xor3b(r1, r2, r3);
}
// SHA-256 small sigma 1: rotr(x,17) ^ rotr(x,19) ^ (x >> 10).
// Fixed: the temporaries were declared uint64_t, forcing a needless
// 64-bit round-trip before xor3b() truncated them back to 32 bits.
static __device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
{
	uint32_t r1 = ROTR32(x, 17);
	uint32_t r2 = ROTR32(x, 19);
	uint32_t r3 = shr_t32(x, 10);
	return xor3b(r1, r2, r3);
}
// One SHA-256 compression step for rounds 0-15: the message word `in` is
// consumed directly (no schedule expansion). Updates d and h in place;
// the caller rotates the a..h register roles between calls.
static __device__ __forceinline__ void sha2_step1(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
const uint32_t f, const uint32_t g, uint32_t &h, const uint32_t in, const uint32_t Kshared)
{
uint32_t t1, t2;
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a, b, c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
// One SHA-256 compression step for rounds 16-63: first expands the message
// schedule in place (in[pc] is recomputed from the words at pc-2, pc-7 and
// pc-15, all taken modulo 16), then applies the round function as step1.
static __device__ __forceinline__ void sha2_step2(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
const uint32_t f, const uint32_t g, uint32_t &h, uint32_t* in, const uint32_t pc, const uint32_t Kshared)
{
uint32_t t1, t2;
int pcidx1 = (pc - 2) & 0xF;
int pcidx2 = (pc - 7) & 0xF;
int pcidx3 = (pc - 15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ssg2_1(inx1);
uint32_t ssg20 = ssg2_0(inx3);
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a, b, c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
// Full 64-round SHA-256 compression of one 16-word block `in` into the
// running state r[0..7]. `in` is clobbered: it doubles as the rolling
// 16-word message schedule for rounds 16-63.
static __device__ __forceinline__
void sha2_round_body(uint32_t* in, uint32_t* r)
{
uint32_t a = r[0];
uint32_t b = r[1];
uint32_t c = r[2];
uint32_t d = r[3];
uint32_t e = r[4];
uint32_t f = r[5];
uint32_t g = r[6];
uint32_t h = r[7];
// rounds 0-15: message words used directly
sha2_step1(a, b, c, d, e, f, g, h, in[0], Ksha[0]);
sha2_step1(h, a, b, c, d, e, f, g, in[1], Ksha[1]);
sha2_step1(g, h, a, b, c, d, e, f, in[2], Ksha[2]);
sha2_step1(f, g, h, a, b, c, d, e, in[3], Ksha[3]);
sha2_step1(e, f, g, h, a, b, c, d, in[4], Ksha[4]);
sha2_step1(d, e, f, g, h, a, b, c, in[5], Ksha[5]);
sha2_step1(c, d, e, f, g, h, a, b, in[6], Ksha[6]);
sha2_step1(b, c, d, e, f, g, h, a, in[7], Ksha[7]);
sha2_step1(a, b, c, d, e, f, g, h, in[8], Ksha[8]);
sha2_step1(h, a, b, c, d, e, f, g, in[9], Ksha[9]);
sha2_step1(g, h, a, b, c, d, e, f, in[10], Ksha[10]);
sha2_step1(f, g, h, a, b, c, d, e, in[11], Ksha[11]);
sha2_step1(e, f, g, h, a, b, c, d, in[12], Ksha[12]);
sha2_step1(d, e, f, g, h, a, b, c, in[13], Ksha[13]);
sha2_step1(c, d, e, f, g, h, a, b, in[14], Ksha[14]);
sha2_step1(b, c, d, e, f, g, h, a, in[15], Ksha[15]);
// rounds 16-63: 16 rounds per iteration, schedule expanded in place
#pragma unroll 3
for (int i = 0; i<3; i++) {
sha2_step2(a, b, c, d, e, f, g, h, in, 0, Ksha[16 + 16 * i]);
sha2_step2(h, a, b, c, d, e, f, g, in, 1, Ksha[17 + 16 * i]);
sha2_step2(g, h, a, b, c, d, e, f, in, 2, Ksha[18 + 16 * i]);
sha2_step2(f, g, h, a, b, c, d, e, in, 3, Ksha[19 + 16 * i]);
sha2_step2(e, f, g, h, a, b, c, d, in, 4, Ksha[20 + 16 * i]);
sha2_step2(d, e, f, g, h, a, b, c, in, 5, Ksha[21 + 16 * i]);
sha2_step2(c, d, e, f, g, h, a, b, in, 6, Ksha[22 + 16 * i]);
sha2_step2(b, c, d, e, f, g, h, a, in, 7, Ksha[23 + 16 * i]);
sha2_step2(a, b, c, d, e, f, g, h, in, 8, Ksha[24 + 16 * i]);
sha2_step2(h, a, b, c, d, e, f, g, in, 9, Ksha[25 + 16 * i]);
sha2_step2(g, h, a, b, c, d, e, f, in, 10, Ksha[26 + 16 * i]);
sha2_step2(f, g, h, a, b, c, d, e, in, 11, Ksha[27 + 16 * i]);
sha2_step2(e, f, g, h, a, b, c, d, in, 12, Ksha[28 + 16 * i]);
sha2_step2(d, e, f, g, h, a, b, c, in, 13, Ksha[29 + 16 * i]);
sha2_step2(c, d, e, f, g, h, a, b, in, 14, Ksha[30 + 16 * i]);
sha2_step2(b, c, d, e, f, g, h, a, in, 15, Ksha[31 + 16 * i]);
}
// feed-forward into the chaining state
r[0] += a;
r[1] += b;
r[2] += c;
r[3] += d;
r[4] += e;
r[5] += f;
r[6] += g;
r[7] += h;
}
// SHA-256 of exactly 64 bytes held in data[0..15]: one full data block
// followed by a padding-only block. Input and output words are
// byte-swapped, so `data` is little-endian in memory.
static __device__ __forceinline__ uint8 sha256_64(uint32_t *data)
{
uint32_t __align__(64) in[16];
uint32_t __align__(32) buf[8];
((uint16 *)in)[0] = swapvec((uint16*)data);
((uint8*)buf)[0] = ((uint8*)H256)[0];
sha2_round_body(in, buf);
#pragma unroll 14
for (int i = 0; i<14; i++) { in[i + 1] = 0; }  // zero in[1..14]
in[0] = 0x80000000;  // padding bit
in[15] = 0x200;      // message length: 512 bits
sha2_round_body(in, buf);
return swapvec((uint8*)buf);
}
// SHA-256 of the 80-byte block header in c_data, with the header's last
// word replaced by `nonce` (two compression calls; length = 640 bits).
// NOTE(review): the nonce is used as-is — the cuda_swab32() variant is
// commented out — so the caller appears to pass it pre-swapped; confirm.
static __device__ __forceinline__ uint8 sha256_80(uint32_t nonce)
{
// uint32_t in[16], buf[8];
uint32_t __align__(64) in[16];
uint32_t __align__(32) buf[8];
((uint16 *)in)[0] = swapvec((uint16*)c_data);
((uint8*)buf)[0] = ((uint8*)H256)[0];
sha2_round_body(in, buf);
#pragma unroll 3
for (int i = 0; i<3; i++) { in[i] = cuda_swab32(c_data[i + 16]); }
// in[3] = cuda_swab32(nonce);
in[3] = nonce;
in[4] = 0x80000000;  // padding bit
in[15] = 0x280;      // message length: 640 bits
#pragma unroll
for (int i = 5; i<15; i++) { in[i] = 0; }
sha2_round_body(in, buf);
return swapvec((uint8*)buf);
}
// Per-thread scratch stride in bytes: 4096 slots * 32 bytes.
#define SHIFT 32 * 1024 * 4
// Pluck phase 1, sm_50+ flavor (256-thread blocks): seeds each thread's
// scratch region with sha256(header|nonce) and runs build iterations 2..4.
// One thread processes one nonce.
// NOTE(review): `shift` is 32-bit, so SHIFT*thread overflows for thread
// indices >= 32768 — confirm launches never exceed that many threads.
__global__ __launch_bounds__(256, 1)
void pluck_gpu_hash0_v50(uint32_t threads, uint32_t startNonce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t shift = SHIFT * thread;
((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
for (int i = 2; i < 5; i++)
{
uint32_t randmax = i * 32 - 4;
uint32_t randseed[16];
uint32_t randbuffer[16];
uint32_t joint[16];
uint8 Buffbuffer[2];  // unused in this early phase
((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
// ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
((uint8*)joint)[0] = ((uint8*)randseed)[1];
#pragma unroll
for (int j = 0; j < 8; j++) {
uint32_t rand = randbuffer[j] % (randmax - 32);
joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
}
uint8 truc = sha256_64(joint);
((uint8*)(hashbuffer + shift))[i] = truc;
((uint8*)randseed)[0] = ((uint8*)joint)[0];
((uint8*)randseed)[1] = truc;
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
// pseudo-random 4-byte copies back into earlier scratch positions
for (int j = 0; j < 32; j += 2)
{
uint32_t rand = randbuffer[j / 2] % randmax;
(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
}
} // main loop
}
}
// Pluck phase 2, sm_50+ flavor: build iterations 5..HASH_MEMORY-2, then
// compare word 7 of the final 32-byte slot against the target.
// NOTE(review): every qualifying thread writes nonceVector[0]; if several
// nonces match within one launch, only the last writer survives.
__global__ __launch_bounds__(256, 1)
void pluck_gpu_hash_v50(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t shift = SHIFT * thread;
for (int i = 5; i < HASH_MEMORY - 1; i++)
{
uint32_t randmax = i*32-4;
uint32_t randseed[16];
uint32_t randbuffer[16];
uint32_t joint[16];
uint8 Buffbuffer[2];
((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32*i-64]);
((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32*i-32]);
Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32*i - 128]);
Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32*i - 96]);
((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
((uint16*)randbuffer)[0]= xor_salsa8(((uint16*)randseed)[0]);
((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i-1)<<5]);
#pragma unroll
for (int j = 0; j < 8; j++) {
uint32_t rand = randbuffer[j] % (randmax - 32);
joint[j+8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
}
uint8 truc = sha256_64(joint);
((uint8*)(hashbuffer + shift))[i] = truc;
((uint8*)randseed)[0] = ((uint8*)joint)[0];
((uint8*)randseed)[1] = truc;
((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
// pseudo-random 4-byte copies back into earlier scratch positions
for (int j = 0; j < 32; j += 2)
{
uint32_t rand = randbuffer[j / 2] % randmax;
(hashbuffer+shift)[rand] = __ldg(&(hashbuffer+shift)[randmax+j]);
(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
}
} // main loop
// final check: compare the high word of the last hash against the target
uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]);
if (outbuf <= pTarget[7]) {
nonceVector[0] = nonce;
}
}
}
// Pluck phase 1 for pre-Maxwell devices (128-thread blocks, 3 blocks/SM);
// logic is identical to pluck_gpu_hash0_v50.
__global__ __launch_bounds__(128, 3)
void pluck_gpu_hash0(uint32_t threads, uint32_t startNonce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t shift = SHIFT * thread;
((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
for (int i = 2; i < 5; i++)
{
uint32_t randmax = i * 32 - 4;
uint32_t randseed[16];
uint32_t randbuffer[16];
uint32_t joint[16];
uint8 Buffbuffer[2];  // unused in this early phase
((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
// ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
((uint8*)joint)[0] = ((uint8*)randseed)[1];
#pragma unroll
for (int j = 0; j < 8; j++) {
uint32_t rand = randbuffer[j] % (randmax - 32);
joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
}
uint8 truc = sha256_64(joint);
((uint8*)(hashbuffer + shift))[i] = truc;
((uint8*)randseed)[0] = ((uint8*)joint)[0];
((uint8*)randseed)[1] = truc;
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
for (int j = 0; j < 32; j += 2)
{
uint32_t rand = randbuffer[j / 2] % randmax;
(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
}
} // main loop
}
}
// Pluck phase 2 for pre-Maxwell devices (128-thread blocks, 3 blocks/SM);
// logic is identical to pluck_gpu_hash_v50, including the last-writer-wins
// store into nonceVector[0].
__global__ __launch_bounds__(128, 3)
void pluck_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t shift = SHIFT * thread;
for (int i = 5; i < HASH_MEMORY - 1; i++)
{
uint32_t randmax = i * 32 - 4;
uint32_t randseed[16];
uint32_t randbuffer[16];
uint32_t joint[16];
uint8 Buffbuffer[2];
((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32 * i - 128]);
Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32 * i - 96]);
((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
#pragma unroll
for (int j = 0; j < 8; j++)
{
uint32_t rand = randbuffer[j] % (randmax - 32);
joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
}
uint8 truc = sha256_64(joint);
((uint8*)(hashbuffer + shift))[i] = truc;
((uint8*)randseed)[0] = ((uint8*)joint)[0];
((uint8*)randseed)[1] = truc;
((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
for (int j = 0; j < 32; j += 2)
{
uint32_t rand = randbuffer[j / 2] % randmax;
(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
}
} // main loop
uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]);
if (outbuf <= pTarget[7]) {
nonceVector[0] = nonce;
}
}
}
// One-time per-GPU setup: publish the pre-allocated scratch buffer to the device
// and allocate the single-word result slot for this thread.
// 'threads' is currently unused (sizing is done by the caller's cudaMalloc).
void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t* hash)
{
	// sizeof(hash) is the size of the POINTER, so this copies the device
	// pointer value into the 'hashbuffer' symbol.
	// NOTE(review): this is only correct if 'hashbuffer' is declared as a
	// __device__ uint32_t* (pointer symbol), not as an array — confirm in
	// cuda_pluck.cu's globals; return codes are also unchecked here.
	cudaMemcpyToSymbol(hashbuffer, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
	cudaMalloc(&d_PlNonce[thr_id], sizeof(uint32_t));
}
__host__
// Launch the two pluck kernels for 'threads' nonces starting at 'startNounce'
// and return the found nonce, or 0xffffffff when nothing matched the target.
// Fix: the original declared 'uint32_t result[8]' and indexed it with thr_id,
// which reads/writes out of bounds for thr_id >= 8 (and the {0xffffffff}
// initializer only covered element 0). A single scalar is all that is needed.
uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order)
{
	uint32_t result = 0xffffffff;
	// reset the device result slot (byte-wise fill, 0xff per byte == 0xffffffff)
	cudaMemset(d_PlNonce[thr_id], 0xff, sizeof(uint32_t));
	const uint32_t threadsperblock = 128;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	// SM 5.0+ uses larger 256-thread blocks
	dim3 grid50((threads + 256 - 1) / 256);
	dim3 block50(256);
	if (device_sm[device_map[thr_id]] >= 500) {
		pluck_gpu_hash0_v50 <<< grid50, block50 >>>(threads, startNounce);
		pluck_gpu_hash_v50 <<< grid50, block50 >>>(threads, startNounce, d_PlNonce[thr_id]);
	} else {
		pluck_gpu_hash0 <<< grid, block >>>(threads, startNounce);
		pluck_gpu_hash <<< grid, block >>>(threads, startNounce, d_PlNonce[thr_id]);
	}
	MyStreamSynchronize(NULL, order, thr_id);
	cudaMemcpy(&result, d_PlNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
	return result;
}
__host__
// Copy the 80-byte block header into the c_data constant (10 x 64-bit words)
// and the 32-byte target into pTarget, ahead of a kernel launch.
// Fix: the original staged the header through a local PaddedMessage[80] buffer
// before the upload; cudaMemcpyToSymbol can read the caller's buffer directly,
// so the extra 80-byte memcpy was redundant.
void pluck_setBlockTarget(const void *pdata, const void *ptarget)
{
	cudaMemcpyToSymbol(c_data, pdata, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(pTarget, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
}

270
pluck/pluck.cu

@ -0,0 +1,270 @@
/* Based on djm code */
extern "C" {
#include "miner.h"
}
#include <stdint.h>
static uint32_t *d_hash[MAX_GPUS] ;
extern void pluck_setBlockTarget(const void* data, const void *ptarget);
extern void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t *d_outputHash);
extern uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order);
extern float tp_coef[MAX_GPUS];
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
// Salsa20/8 core operating on a 64-byte (16-word) state:
// B ^= Bx, then 8 salsa rounds over the xored state, then B += scrambled state.
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{
	uint32_t x[16];
	int i;

	// fold the second block into the first; keep a working copy for the rounds
	for (i = 0; i < 16; i++)
		x[i] = (B[i] ^= Bx[i]);

#define R(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
	// standard salsa quarter-round on state indices a,b,c,d
#define QR(a, b, c, d) \
	x[b] ^= R(x[a] + x[d], 7);  \
	x[c] ^= R(x[b] + x[a], 9);  \
	x[d] ^= R(x[c] + x[b], 13); \
	x[a] ^= R(x[d] + x[c], 18);

	// 8 rounds = 4 double-rounds (column round followed by row round)
	for (i = 0; i < 8; i += 2) {
		/* Operate on columns. */
		QR( 0,  4,  8, 12)
		QR( 5,  9, 13,  1)
		QR(10, 14,  2,  6)
		QR(15,  3,  7, 11)
		/* Operate on rows. */
		QR( 0,  1,  2,  3)
		QR( 5,  6,  7,  4)
		QR(10, 11,  8,  9)
		QR(15, 12, 13, 14)
	}
#undef QR
#undef R

	// feed-forward: add the scrambled words back into the (xored) input
	for (i = 0; i < 16; i++)
		B[i] += x[i];
// keep the net preprocessor state of the original block (it #undef'd ROTL)
#undef ROTL
}
// SHA-256 of an arbitrary-length message (CPU reference path).
// hash: out, 32 bytes big-endian; data/len: input message.
// The loop walks the message in 64-byte chunks; 'r' is the number of input
// bytes still unconsumed, and the 'r > -9' bound guarantees one extra block
// when the 0x80 pad byte or the 64-bit length field doesn't fit (r in [56,64)).
static void sha256_hash(unsigned char *hash, const unsigned char *data, int len)
{
	uint32_t S[16], T[16];
	int i, r;

	sha256_init(S);
	for (r = len; r > -9; r -= 64) {
		// partial/empty final blocks start zeroed
		if (r < 64)
			memset(T, 0, 64);
		memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
		// first padding byte goes right after the last message byte
		if (r >= 0 && r < 64)
			((unsigned char *)T)[r] = 0x80;
		// sha256_transform expects host-order words; decode from big-endian bytes
		for (i = 0; i < 16; i++)
			T[i] = be32dec(T + i);
		// bit length fits in the last word only when fewer than 56 data bytes
		// remain in this block. NOTE(review): length is stored as 32-bit only
		// (fine for the <= 80-byte inputs used here); full SHA-256 uses 64 bits.
		if (r < 56)
			T[15] = 8 * len;
		sha256_transform(S, T, 0);
	}
	// emit the state big-endian
	for (i = 0; i < 8; i++)
		be32enc((uint32_t *)hash + i, S[i]);
}
// SHA-256 of exactly 64 bytes of input (one full message block plus one
// padding block), specialized fast path used by pluckhash.
static void sha256_hash512(unsigned char *hash, const unsigned char *data)
{
	uint32_t S[16], T[16];
	int i;

	sha256_init(S);
	// block 1: the 64 message bytes, decoded big-endian to host order
	memcpy(T, data, 64);
	for (i = 0; i < 16; i++)
		T[i] = be32dec(T + i);
	sha256_transform(S, T, 0);

	// block 2: pure padding — 0x80 then zeros, then the bit length
	memset(T, 0, 64);
	//memcpy(T, data + 64, 0);
	((unsigned char *)T)[0] = 0x80;
	// be32dec over the zeroed block turns the leading 0x80 byte into
	// T[0] == 0x80000000 (byte 0 is the most-significant byte)
	for (i = 0; i < 16; i++)
		T[i] = be32dec(T + i);
	// message length in bits (512), written after decoding so it stays host-order
	T[15] = 8 * 64;
	sha256_transform(S, T, 0);

	for (i = 0; i < 8; i++)
		be32enc((uint32_t *)hash + i, S[i]);
}
// CPU reference implementation of the pluck hash, used to validate GPU results.
// hash : out, first 32 bytes of the final scratch buffer
// input: in, 80-byte block header as 20 uint32 words
// Fix: the original leaked the 128 KiB scratch buffer on every call (malloc
// without free) and never checked the allocation.
void pluckhash(uint32_t *hash, uint32_t *input)
{
	uint32_t data[20];
	const int HASH_MEMORY = 128 * 1024;
	uint8_t * scratchbuf = (uint8_t*)malloc(HASH_MEMORY);
	if (!scratchbuf) {
		// allocation failed: report an all-zero hash rather than crashing
		memset(hash, 0, 32);
		return;
	}

	for (int k = 0; k < 20; k++) { data[k] = input[k]; }

	uint8_t *hashbuffer = scratchbuf; //don't allocate this on stack, since it's huge..
	int size = HASH_MEMORY;
	memset(hashbuffer, 0, 64);
	// seed the buffer with sha256(header)
	sha256_hash(&hashbuffer[0], (uint8_t*)data, 80);

	for (int i = 64; i < size - 32; i += 32)
	{
		//i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area
		int randmax = i - 4; //we could use size here, but then it's probable to use 0 as the value in most cases
		uint32_t joint[16];
		uint32_t randbuffer[16];
		uint32_t randseed[16];

		memcpy(randseed, &hashbuffer[i - 64], 64);
		if (i > 128)
			memcpy(randbuffer, &hashbuffer[i - 128], 64);
		else
			memset(randbuffer, 0, 64);
		xor_salsa8(randbuffer, randseed);

		//use the last hash value as the seed
		memcpy(joint, &hashbuffer[i - 32], 32);
		for (int j = 32; j < 64; j += 4)
		{
			uint32_t rand = randbuffer[(j - 32) / 4] % (randmax - 32); //randmax - 32 as otherwise we go beyond memory that's already been written to
			joint[j / 4] = *((uint32_t*)&hashbuffer[rand]);
		}
		sha256_hash512(&hashbuffer[i], (uint8_t*)joint);

		//use last hash value and previous hash value(post-mixing)
		memcpy(randseed, &hashbuffer[i - 32], 64);
		if (i > 128)
			memcpy(randbuffer, &hashbuffer[i - 128], 64);
		else
			memset(randbuffer, 0, 64);
		xor_salsa8(randbuffer, randseed);

		// scatter the freshly produced words to pseudo-random offsets
		for (int j = 0; j < 32; j += 2)
		{
			uint32_t rand = randbuffer[j / 2] % randmax;
			*((uint32_t*)&hashbuffer[rand]) = *((uint32_t*)&hashbuffer[j + i - 4]);
		}
	}

	memcpy((unsigned char*)hash, hashbuffer, 32);
	free(scratchbuf); // fix: original leaked this buffer on every call
}
static bool init[MAX_GPUS] = { 0 };
// Scan a nonce range on the GPU for the pluck algorithm.
// Returns 1 with pdata[19] = found nonce, or 0 when the range is exhausted.
// Fix: the original computed 'throughput / 128 - 256' unconditionally; for
// intensities below ~15 the subtraction wraps the unsigned value to ~4G and
// the subsequent cudaMalloc of 128 KiB per thread is absurd. Guard the
// subtraction so low intensities stay usable.
extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done)
{
	const uint32_t first_nonce = pdata[19];
	uint32_t endiandata[20];
	int intensity = 18; /* beware > 20 could work and create diff problems later */
	uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);

	// divide by 128 for this algo which requires a lot of memory (128 KiB/thread)
	throughput = throughput / 128;
	if (throughput > 256)
		throughput -= 256; // original tuning offset
	// else: keep the small value; subtracting would wrap below zero
	throughput = min(throughput, max_nonce - first_nonce + 1);

	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0x0000ff;

	if (!init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);
		//cudaDeviceReset();
		//cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
		//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);

		// 32 KiB of uint32 scratch per scanned nonce
		cudaMalloc(&d_hash[thr_id], 32 * 1024 * sizeof(uint32_t) * throughput);
		pluck_cpu_init(thr_id, throughput, d_hash[thr_id]);
		init[thr_id] = true;
	}

	// kernels expect the header big-endian encoded
	for (int k = 0; k < 20; k++)
		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

	pluck_setBlockTarget(endiandata, ptarget);

	do {
		uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0);
		if (foundNonce != UINT32_MAX)
		{
			// CPU validation still disabled (see pluckhash) — trust GPU result
			// const uint32_t Htarg = ptarget[7];
			// uint32_t vhash64[8];
			// be32enc(&endiandata[19], foundNonce);
			// pluckhash(vhash64,endiandata);
			// if (vhash64[7] <= Htarg) { // && fulltest(vhash64, ptarget)) {
			*hashes_done = pdata[19] - first_nonce + throughput;
			pdata[19] = foundNonce;
			return 1;
			// } else {
			//	applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
			// }
		}
		pdata[19] += throughput;
	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}
Loading…
Cancel
Save