diff --git a/Makefile.am b/Makefile.am
index 5483ac8..21641f9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -46,6 +46,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
+ pluck/pluck.cu pluck/cuda_pluck.cu \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/doom.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
diff --git a/ccminer.cpp b/ccminer.cpp
index 784b85c..daf472a 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -97,6 +97,7 @@ enum sha_algos {
ALGO_MYR_GR,
ALGO_NIST5,
ALGO_PENTABLAKE,
+ ALGO_PLUCK,
ALGO_QUARK,
ALGO_QUBIT,
ALGO_S3,
@@ -128,6 +129,7 @@ static const char *algo_names[] = {
"myr-gr",
"nist5",
"penta",
+ "pluck",
"quark",
"qubit",
"s3",
@@ -237,6 +239,7 @@ Options:\n\
myr-gr Myriad-Groestl\n\
nist5 NIST5 (TalkCoin)\n\
penta Pentablake hash (5x Blake 512)\n\
+ pluck SupCoin\n\
quark Quark\n\
qubit Qubit\n\
s3 S3 (1Coin)\n\
@@ -1059,6 +1062,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
switch (opt_algo) {
case ALGO_JACKPOT:
+ case ALGO_PLUCK:
diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
break;
case ALGO_DMD_GR:
@@ -1272,6 +1276,9 @@ static void *miner_thread(void *userdata)
case ALGO_LYRA2:
minmax = 0x100000;
break;
+ case ALGO_PLUCK:
+ minmax = 0x2000;
+ break;
}
max64 = max(minmax-1, max64);
}
@@ -1397,6 +1404,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
+ case ALGO_PLUCK:
+ rc = scanhash_pluck(thr_id, work.data, work.target,
+ max_nonce, &hashes_done);
+ break;
+
case ALGO_S3:
rc = scanhash_s3(thr_id, work.data, work.target,
max_nonce, &hashes_done);
@@ -2237,8 +2249,8 @@ int main(int argc, char *argv[])
printf(" Built with the nVidia CUDA SDK 6.5\n\n");
#endif
printf(" Originally based on cudaminer by Christian Buchner and Christian H.,\n");
- printf(" Include some of djm34 additions and sp optimisations\n");
- printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n");
+ printf(" Includes some work of djm34, sp, tsiv and klausT\n\n");
+ printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n");
rpc_user = strdup("");
rpc_pass = strdup("");
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 0d6f3ad..a49d90e 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -313,6 +313,7 @@
+    <ClInclude Include="cuda_vector.h" />
@@ -416,6 +417,8 @@
-Xptxas "-abi=yes" %(AdditionalOptions)
-Xptxas "-abi=yes" %(AdditionalOptions)
+    <CudaCompile Include="pluck\cuda_pluck.cu" />
+    <CudaCompile Include="pluck\pluck.cu" />
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index aafa68d..1625fa3 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -308,6 +308,9 @@
Header Files\CUDA
+    <ClInclude Include="cuda_vector.h">
+      <Filter>Header Files\CUDA</Filter>
+    </ClInclude>
Header Files\sph
@@ -394,6 +397,12 @@
Source Files\CUDA\JHA
+    <CudaCompile Include="pluck\cuda_pluck.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
+    <CudaCompile Include="pluck\pluck.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
Source Files\CUDA\quark
@@ -573,4 +582,4 @@
Ressources
-
+
diff --git a/cuda_helper.h b/cuda_helper.h
index 892ae0a..6b1ce24 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -57,11 +57,13 @@ extern const uint3 threadIdx;
#endif
#if __CUDA_ARCH__ < 320
-// Kepler (Compute 3.0)
+// Host and Compute 3.0
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#else
-// Kepler (Compute 3.5, 5.0)
+// Compute 3.2+
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
+#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
#endif
__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
@@ -285,6 +287,45 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
return result;
}
+// device asm 32 for pluck
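+// andor32(a, b, c) = (a & b) | ((a | b) & c), i.e. the SHA-256 Maj() function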
+__device__ __forceinline__
+uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) {
+ uint32_t result;
+ asm("{ .reg .u32 m,n,o;\n\t"
+ "and.b32 m, %1, %2;\n\t"
+ " or.b32 n, %1, %2;\n\t"
+ "and.b32 o, n, %3;\n\t"
+ " or.b32 %0, m, o ;\n\t"
+ "}\n\t"
+ : "=r"(result) : "r"(a), "r"(b), "r"(c));
+ return result;
+}
+
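+// xor3b(a, b, c) = a ^ b ^ c, the three-input XOR used by the SHA-256 sigma functions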
+__device__ __forceinline__
+uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) {
+ uint32_t result;
+ asm("{ .reg .u32 t1;\n\t"
+ "xor.b32 t1, %2, %3;\n\t"
+ "xor.b32 %0, %1, t1;\n\t"
+ "}"
+ : "=r"(result) : "r"(a) ,"r"(b),"r"(c));
+ return result;
+}
+
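+// force the logical 32-bit shifts to a single PTX shr.b32 / shl.b32 instruction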
+__device__ __forceinline__
+uint32_t shr_t32(uint32_t x,uint32_t n) {
+ uint32_t result;
+ asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
+ return result;
+}
+
+__device__ __forceinline__
+uint32_t shl_t32(uint32_t x,uint32_t n) {
+ uint32_t result;
+ asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
+ return result;
+}
+
#ifndef USE_ROT_ASM_OPT
#define USE_ROT_ASM_OPT 1
#endif
diff --git a/cuda_vector.h b/cuda_vector.h
new file mode 100644
index 0000000..683b893
--- /dev/null
+++ b/cuda_vector.h
@@ -0,0 +1,245 @@
+#ifndef CUDA_VECTOR_H
+#define CUDA_VECTOR_H
+
+
+///////////////////////////////////////////////////////////////////////////////////
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
+#define __LDG_PTR "l"
+#else
+#define __LDG_PTR "r"
+#endif
+
+#include "cuda_helper.h"
+
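+// OpenCL-style uint8/uint16 vector types, their element-wise operators and
+// read-only-cache load helpers used by the pluck kernels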
+//typedef __device_builtin__ struct ulong16 ulong16;
+
+typedef struct __align__(32) uint8
+{
+ unsigned int s0, s1, s2, s3, s4, s5, s6, s7;
+} uint8;
+
+typedef struct __align__(64) uint16
+{
+ union {
+ struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;};
+ uint8 lo;
+ };
+ union {
+ struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;};
+ uint8 hi;
+ };
+} uint16;
+
+
+static __inline__ __host__ __device__ uint16 make_uint16(
+ unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7,
+ unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf)
+{
+ uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+ t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf;
+ return t;
+}
+
+static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b)
+{
+ uint16 t; t.lo=a; t.hi=b; return t;
+}
+
+static __inline__ __host__ __device__ uint8 make_uint8(
+ unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7)
+{
+ uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+ return t;
+}
+
+
+static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
+static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+
+
+static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
+static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+
+
+static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
+static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); }
+
+
+static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); }
+
+static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); }
+
+static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) {
+ return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7,
+ a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf);
+}
+
+static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) {
+ return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7,
+ a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf);
+}
+
+static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; }
+static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; }
+static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; }
+
+
+static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
+
+
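+// 32-bit rotate left/right via the funnel shifter (shf.*.wrap.b32, Compute 3.2+)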
+static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift)
+{
+ uint32_t ret;
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
+ return ret;
+}
+
+
+static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift)
+{
+ uint32_t ret;
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
+ return ret;
+}
+
+
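+// load 32 bytes (one uint8) through the read-only cache as two 128-bit
+// ld.global.nc reads; ptr is expected to be 16-byte aligned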
+static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr)
+{
+ uint8 test;
+ asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr));
+ asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr));
+ return (test);
+}
+
+
+static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr)
+{
+ uint32_t test;
+ asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+static __device__ __inline__ uint64_t __ldgtoint64(const uint8_t *ptr)
+{
+ uint64_t test;
+ asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+
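+// read a 32-bit word from a possibly unaligned address, one byte at a time
+// through the read-only cache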
+static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr)
+{
+ uint32_t test;
+ asm volatile ("{\n\t"
+ ".reg .u8 a,b,c,d; \n\t"
+ "ld.global.nc.u8 a,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ "ld.global.nc.u8 d,[%1+3]; \n\t"
+ "mov.b32 %0,{a,b,c,d}; }\n\t"
+ : "=r"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr)
+{
+ uint64_t test;
+ asm volatile ("{\n\t"
+ ".reg .u8 a,b,c,d,e,f,g,h; \n\t"
+ ".reg .u32 i,j; \n\t"
+ "ld.global.nc.u8 a,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ "ld.global.nc.u8 d,[%1+3]; \n\t"
+ "ld.global.nc.u8 e,[%1+4]; \n\t"
+ "ld.global.nc.u8 f,[%1+5]; \n\t"
+ "ld.global.nc.u8 g,[%1+6]; \n\t"
+ "ld.global.nc.u8 h,[%1+7]; \n\t"
+ "mov.b32 i,{a,b,c,d}; \n\t"
+ "mov.b32 j,{e,f,g,h}; \n\t"
+ "mov.b64 %0,{i,j}; }\n\t"
+ : "=l"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+
+static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr)
+{
+ uint32_t zero = 0;
+ uint64_t test;
+ asm volatile ("{\n\t"
+ ".reg .u8 a,b,c,d; \n\t"
+ ".reg .u32 i; \n\t"
+ "ld.global.nc.u8 a,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ "ld.global.nc.u8 d,[%1+3]; \n\t"
+ "mov.b32 i,{a,b,c,d}; \n\t"
+ "mov.b64 %0,{i,%2}; }\n\t"
+ : "=l"(test) : __LDG_PTR(ptr), "r"(zero));
+ return (test);
+}
+
+
+
+static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr)
+{
+ uint32_t test;
+ asm("{\n\t"
+ ".reg .u8 e,b,c,d; \n\t"
+ "ld.global.nc.u8 e,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ "ld.global.nc.u8 d,[%1+3]; \n\t"
+ "mov.b32 %0,{e,b,c,d}; }\n\t"
+ : "=r"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+
+
+
+static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
+{
+ uint8 vec;
+ vec.s0 = cuda_swab32(buf[0].s0);
+ vec.s1 = cuda_swab32(buf[0].s1);
+ vec.s2 = cuda_swab32(buf[0].s2);
+ vec.s3 = cuda_swab32(buf[0].s3);
+ vec.s4 = cuda_swab32(buf[0].s4);
+ vec.s5 = cuda_swab32(buf[0].s5);
+ vec.s6 = cuda_swab32(buf[0].s6);
+ vec.s7 = cuda_swab32(buf[0].s7);
+ return vec;
+}
+
+static __forceinline__ __device__ uint16 swapvec(const uint16 *buf)
+{
+ uint16 vec;
+ vec.s0 = cuda_swab32(buf[0].s0);
+ vec.s1 = cuda_swab32(buf[0].s1);
+ vec.s2 = cuda_swab32(buf[0].s2);
+ vec.s3 = cuda_swab32(buf[0].s3);
+ vec.s4 = cuda_swab32(buf[0].s4);
+ vec.s5 = cuda_swab32(buf[0].s5);
+ vec.s6 = cuda_swab32(buf[0].s6);
+ vec.s7 = cuda_swab32(buf[0].s7);
+ vec.s8 = cuda_swab32(buf[0].s8);
+ vec.s9 = cuda_swab32(buf[0].s9);
+ vec.sa = cuda_swab32(buf[0].sa);
+ vec.sb = cuda_swab32(buf[0].sb);
+ vec.sc = cuda_swab32(buf[0].sc);
+ vec.sd = cuda_swab32(buf[0].sd);
+ vec.se = cuda_swab32(buf[0].se);
+ vec.sf = cuda_swab32(buf[0].sf);
+ return vec;
+}
+#endif // #ifndef CUDA_VECTOR_H
diff --git a/miner.h b/miner.h
index 2a2e297..f3f51e4 100644
--- a/miner.h
+++ b/miner.h
@@ -334,6 +334,10 @@ extern int scanhash_pentablake(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
+extern int scanhash_pluck(int thr_id, uint32_t *pdata,
+ const uint32_t *ptarget, uint32_t max_nonce,
+ unsigned long *hashes_done);
+
extern int scanhash_qubit(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
diff --git a/pluck/cuda_pluck.cu b/pluck/cuda_pluck.cu
new file mode 100644
index 0000000..0bbcbba
--- /dev/null
+++ b/pluck/cuda_pluck.cu
@@ -0,0 +1,574 @@
+/*
+ * "pluck" kernel implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2015 djm34
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author djm34
+ * @author tpruvot
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+#include "cuda_vector.h"
+
+uint32_t *d_PlNonce[MAX_GPUS];
+
+__device__ uint8_t * hashbuffer;
+__constant__ uint32_t pTarget[8];
+__constant__ uint32_t c_data[20];
+
+#define HASH_MEMORY_8bit 131072
+#define HASH_MEMORY_32bit 32768
+#define HASH_MEMORY 4096
+
+static __constant__ uint32_t H256[8] = {
+ 0x6A09E667, 0xBB67AE85, 0x3C6EF372,
+ 0xA54FF53A, 0x510E527F, 0x9B05688C,
+ 0x1F83D9AB, 0x5BE0CD19
+};
+
+static __constant__ uint32_t Ksha[64] = {
+ 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+ 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+ 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+ 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+ 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+ 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+ 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+ 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+ 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+ 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+ 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+ 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+ 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+ 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+ 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+ 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
+};
+
+
+#define SALSA(a,b,c,d) { \
+ t = a+d; b^=rotate(t, 7); \
+ t = b+a; c^=rotate(t, 9); \
+ t = c+b; d^=rotate(t, 13); \
+ t = d+c; a^=rotate(t, 18); \
+}
+
+#define SALSA_CORE(state) { \
+ SALSA(state.s0,state.s4,state.s8,state.sc); \
+ SALSA(state.s5,state.s9,state.sd,state.s1); \
+ SALSA(state.sa,state.se,state.s2,state.s6); \
+ SALSA(state.sf,state.s3,state.s7,state.sb); \
+ SALSA(state.s0,state.s1,state.s2,state.s3); \
+ SALSA(state.s5,state.s6,state.s7,state.s4); \
+ SALSA(state.sa,state.sb,state.s8,state.s9); \
+ SALSA(state.sf,state.sc,state.sd,state.se); \
+}
+
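+// Salsa20/8 core: four column/row double rounds on the 16-word input,
+// with the input added back at the end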
+static __device__ __forceinline__ uint16 xor_salsa8(const uint16 &Bx)
+{
+ uint32_t t;
+ uint16 state = Bx;
+ SALSA_CORE(state);
+ SALSA_CORE(state);
+ SALSA_CORE(state);
+ SALSA_CORE(state);
+ return(state+Bx);
+}
+
+
+// sha256
+
+static __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
+{
+ uint32_t r1 = ROTR32(x, 2);
+ uint32_t r2 = ROTR32(x, 13);
+ uint32_t r3 = ROTR32(x, 22);
+ return xor3b(r1, r2, r3);
+}
+
+static __device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
+{
+ uint32_t r1 = ROTR32(x, 6);
+ uint32_t r2 = ROTR32(x, 11);
+ uint32_t r3 = ROTR32(x, 25);
+ return xor3b(r1, r2, r3);
+}
+
+static __device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
+{
+ uint32_t r1 = ROTR32(x, 7);
+ uint32_t r2 = ROTR32(x, 18);
+ uint32_t r3 = shr_t32(x, 3);
+ return xor3b(r1, r2, r3);
+}
+
+static __device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
+{
+ uint32_t r1 = ROTR32(x, 17);
+ uint32_t r2 = ROTR32(x, 19);
+ uint32_t r3 = shr_t32(x, 10);
+ return xor3b(r1, r2, r3);
+}
+
+static __device__ __forceinline__ void sha2_step1(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
+ const uint32_t f, const uint32_t g, uint32_t &h, const uint32_t in, const uint32_t Kshared)
+{
+ uint32_t t1, t2;
+ uint32_t vxandx = xandx(e, f, g);
+ uint32_t bsg21 = bsg2_1(e);
+ uint32_t bsg20 = bsg2_0(a);
+ uint32_t andorv = andor32(a, b, c);
+
+ t1 = h + bsg21 + vxandx + Kshared + in;
+ t2 = bsg20 + andorv;
+ d = d + t1;
+ h = t1 + t2;
+}
+
+static __device__ __forceinline__ void sha2_step2(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
+ const uint32_t f, const uint32_t g, uint32_t &h, uint32_t* in, const uint32_t pc, const uint32_t Kshared)
+{
+ uint32_t t1, t2;
+
+ int pcidx1 = (pc - 2) & 0xF;
+ int pcidx2 = (pc - 7) & 0xF;
+ int pcidx3 = (pc - 15) & 0xF;
+ uint32_t inx0 = in[pc];
+ uint32_t inx1 = in[pcidx1];
+ uint32_t inx2 = in[pcidx2];
+ uint32_t inx3 = in[pcidx3];
+
+ uint32_t ssg21 = ssg2_1(inx1);
+ uint32_t ssg20 = ssg2_0(inx3);
+ uint32_t vxandx = xandx(e, f, g);
+ uint32_t bsg21 = bsg2_1(e);
+ uint32_t bsg20 = bsg2_0(a);
+ uint32_t andorv = andor32(a, b, c);
+
+ in[pc] = ssg21 + inx2 + ssg20 + inx0;
+
+ t1 = h + bsg21 + vxandx + Kshared + in[pc];
+ t2 = bsg20 + andorv;
+ d = d + t1;
+ h = t1 + t2;
+}
+
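+// full 64-round SHA-256 compression of one 16-word message block into the running state r[0..7]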
+static __device__ __forceinline__
+void sha2_round_body(uint32_t* in, uint32_t* r)
+{
+ uint32_t a = r[0];
+ uint32_t b = r[1];
+ uint32_t c = r[2];
+ uint32_t d = r[3];
+ uint32_t e = r[4];
+ uint32_t f = r[5];
+ uint32_t g = r[6];
+ uint32_t h = r[7];
+
+ sha2_step1(a, b, c, d, e, f, g, h, in[0], Ksha[0]);
+ sha2_step1(h, a, b, c, d, e, f, g, in[1], Ksha[1]);
+ sha2_step1(g, h, a, b, c, d, e, f, in[2], Ksha[2]);
+ sha2_step1(f, g, h, a, b, c, d, e, in[3], Ksha[3]);
+ sha2_step1(e, f, g, h, a, b, c, d, in[4], Ksha[4]);
+ sha2_step1(d, e, f, g, h, a, b, c, in[5], Ksha[5]);
+ sha2_step1(c, d, e, f, g, h, a, b, in[6], Ksha[6]);
+ sha2_step1(b, c, d, e, f, g, h, a, in[7], Ksha[7]);
+ sha2_step1(a, b, c, d, e, f, g, h, in[8], Ksha[8]);
+ sha2_step1(h, a, b, c, d, e, f, g, in[9], Ksha[9]);
+ sha2_step1(g, h, a, b, c, d, e, f, in[10], Ksha[10]);
+ sha2_step1(f, g, h, a, b, c, d, e, in[11], Ksha[11]);
+ sha2_step1(e, f, g, h, a, b, c, d, in[12], Ksha[12]);
+ sha2_step1(d, e, f, g, h, a, b, c, in[13], Ksha[13]);
+ sha2_step1(c, d, e, f, g, h, a, b, in[14], Ksha[14]);
+ sha2_step1(b, c, d, e, f, g, h, a, in[15], Ksha[15]);
+
+ #pragma unroll 3
+ for (int i = 0; i<3; i++) {
+
+ sha2_step2(a, b, c, d, e, f, g, h, in, 0, Ksha[16 + 16 * i]);
+ sha2_step2(h, a, b, c, d, e, f, g, in, 1, Ksha[17 + 16 * i]);
+ sha2_step2(g, h, a, b, c, d, e, f, in, 2, Ksha[18 + 16 * i]);
+ sha2_step2(f, g, h, a, b, c, d, e, in, 3, Ksha[19 + 16 * i]);
+ sha2_step2(e, f, g, h, a, b, c, d, in, 4, Ksha[20 + 16 * i]);
+ sha2_step2(d, e, f, g, h, a, b, c, in, 5, Ksha[21 + 16 * i]);
+ sha2_step2(c, d, e, f, g, h, a, b, in, 6, Ksha[22 + 16 * i]);
+ sha2_step2(b, c, d, e, f, g, h, a, in, 7, Ksha[23 + 16 * i]);
+ sha2_step2(a, b, c, d, e, f, g, h, in, 8, Ksha[24 + 16 * i]);
+ sha2_step2(h, a, b, c, d, e, f, g, in, 9, Ksha[25 + 16 * i]);
+ sha2_step2(g, h, a, b, c, d, e, f, in, 10, Ksha[26 + 16 * i]);
+ sha2_step2(f, g, h, a, b, c, d, e, in, 11, Ksha[27 + 16 * i]);
+ sha2_step2(e, f, g, h, a, b, c, d, in, 12, Ksha[28 + 16 * i]);
+ sha2_step2(d, e, f, g, h, a, b, c, in, 13, Ksha[29 + 16 * i]);
+ sha2_step2(c, d, e, f, g, h, a, b, in, 14, Ksha[30 + 16 * i]);
+ sha2_step2(b, c, d, e, f, g, h, a, in, 15, Ksha[31 + 16 * i]);
+
+ }
+
+ r[0] += a;
+ r[1] += b;
+ r[2] += c;
+ r[3] += d;
+ r[4] += e;
+ r[5] += f;
+ r[6] += g;
+ r[7] += h;
+}
+
+
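+// SHA-256 of a 64-byte message: compress the data block, then the padding block
+// (0x80 marker, 512-bit length), byte-swapping the words in and out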
+static __device__ __forceinline__ uint8 sha256_64(uint32_t *data)
+{
+ uint32_t __align__(64) in[16];
+ uint32_t __align__(32) buf[8];
+
+ ((uint16 *)in)[0] = swapvec((uint16*)data);
+
+ ((uint8*)buf)[0] = ((uint8*)H256)[0];
+
+ sha2_round_body(in, buf);
+
+ #pragma unroll 14
+ for (int i = 0; i<14; i++) { in[i + 1] = 0; }
+
+ in[0] = 0x80000000;
+ in[15] = 0x200;
+
+ sha2_round_body(in, buf);
+ return swapvec((uint8*)buf);
+}
+
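+// SHA-256 of the 80-byte block header held in c_data, with the given nonce as word 19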
+static __device__ __forceinline__ uint8 sha256_80(uint32_t nonce)
+{
+// uint32_t in[16], buf[8];
+ uint32_t __align__(64) in[16];
+ uint32_t __align__(32) buf[8];
+
+ ((uint16 *)in)[0] = swapvec((uint16*)c_data);
+ ((uint8*)buf)[0] = ((uint8*)H256)[0];
+
+ sha2_round_body(in, buf);
+
+ #pragma unroll 3
+ for (int i = 0; i<3; i++) { in[i] = cuda_swab32(c_data[i + 16]); }
+
+// in[3] = cuda_swab32(nonce);
+ in[3] = nonce;
+ in[4] = 0x80000000;
+ in[15] = 0x280;
+
+ #pragma unroll
+ for (int i = 5; i<15; i++) { in[i] = 0; }
+
+ sha2_round_body(in, buf);
+ return swapvec((uint8*)buf);
+}
+
+#define SHIFT (32 * 1024 * 4)
+
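+// first pass (SM 5.0+ launch bounds): hash the header into row 0 of each thread's
+// scratchpad, zero row 1 and build rows 2..4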
+__global__ __launch_bounds__(256, 1)
+void pluck_gpu_hash0_v50(uint32_t threads, uint32_t startNonce)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ const uint32_t nonce = startNonce + thread;
+
+ uint32_t shift = SHIFT * thread;
+ ((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
+ ((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
+ for (int i = 2; i < 5; i++)
+ {
+ uint32_t randmax = i * 32 - 4;
+ uint32_t randseed[16];
+ uint32_t randbuffer[16];
+ uint32_t joint[16];
+ uint8 Buffbuffer[2];
+
+ ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
+ ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
+
+ ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+// ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
+ ((uint8*)joint)[0] = ((uint8*)randseed)[1];
+
+ #pragma unroll
+ for (int j = 0; j < 8; j++) {
+ uint32_t rand = randbuffer[j] % (randmax - 32);
+ joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
+ }
+
+ uint8 truc = sha256_64(joint);
+ ((uint8*)(hashbuffer + shift))[i] = truc;
+ ((uint8*)randseed)[0] = ((uint8*)joint)[0];
+ ((uint8*)randseed)[1] = truc;
+
+ ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+ for (int j = 0; j < 32; j += 2)
+ {
+ uint32_t rand = randbuffer[j / 2] % randmax;
+ (hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
+ (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
+ (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
+ (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
+ }
+
+ } // main loop
+ }
+}
+
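+// second pass (SM 5.0+ launch bounds): build the remaining scratchpad rows, then
+// record the nonce when the hash word at buffer offset 28 meets the target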
+__global__ __launch_bounds__(256, 1)
+void pluck_gpu_hash_v50(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
+{
+
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ const uint32_t nonce = startNonce + thread;
+
+ uint32_t shift = SHIFT * thread;
+
+ for (int i = 5; i < HASH_MEMORY - 1; i++)
+ {
+ uint32_t randmax = i*32-4;
+ uint32_t randseed[16];
+ uint32_t randbuffer[16];
+ uint32_t joint[16];
+ uint8 Buffbuffer[2];
+
+ ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32*i-64]);
+ ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32*i-32]);
+
+
+ Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32*i - 128]);
+ Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32*i - 96]);
+
+ ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
+ ((uint16*)randbuffer)[0]= xor_salsa8(((uint16*)randseed)[0]);
+ ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i-1)<<5]);
+
+ #pragma unroll
+ for (int j = 0; j < 8; j++) {
+ uint32_t rand = randbuffer[j] % (randmax - 32);
+ joint[j+8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
+ }
+
+ uint8 truc = sha256_64(joint);
+ ((uint8*)(hashbuffer + shift))[i] = truc;
+ ((uint8*)randseed)[0] = ((uint8*)joint)[0];
+ ((uint8*)randseed)[1] = truc;
+
+ ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
+ ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+ for (int j = 0; j < 32; j += 2)
+ {
+ uint32_t rand = randbuffer[j / 2] % randmax;
+
+ (hashbuffer+shift)[rand] = __ldg(&(hashbuffer+shift)[randmax+j]);
+ (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
+ (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
+ (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
+ }
+
+ } // main loop
+
+ uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]);
+
+ if (outbuf <= pTarget[7]) {
+ nonceVector[0] = nonce;
+ }
+
+ }
+}
+
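+// same first pass, with launch bounds tuned for pre-SM 5.0 devices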
+__global__ __launch_bounds__(128, 3)
+void pluck_gpu_hash0(uint32_t threads, uint32_t startNonce)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ const uint32_t nonce = startNonce + thread;
+
+ uint32_t shift = SHIFT * thread;
+ ((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
+ ((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
+ for (int i = 2; i < 5; i++)
+ {
+ uint32_t randmax = i * 32 - 4;
+ uint32_t randseed[16];
+ uint32_t randbuffer[16];
+ uint32_t joint[16];
+ uint8 Buffbuffer[2];
+
+ ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
+ ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
+
+ ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+// ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
+ ((uint8*)joint)[0] = ((uint8*)randseed)[1];
+
+ #pragma unroll
+ for (int j = 0; j < 8; j++) {
+ uint32_t rand = randbuffer[j] % (randmax - 32);
+ joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
+ }
+
+ uint8 truc = sha256_64(joint);
+ ((uint8*)(hashbuffer + shift))[i] = truc;
+ ((uint8*)randseed)[0] = ((uint8*)joint)[0];
+ ((uint8*)randseed)[1] = truc;
+
+ ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+ for (int j = 0; j < 32; j += 2)
+ {
+ uint32_t rand = randbuffer[j / 2] % randmax;
+ (hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
+ (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
+ (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
+ (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
+ }
+
+ } // main loop
+
+ }
+}
+
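+// same second pass, with launch bounds tuned for pre-SM 5.0 devices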
+__global__ __launch_bounds__(128, 3)
+void pluck_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ const uint32_t nonce = startNonce + thread;
+
+ uint32_t shift = SHIFT * thread;
+
+ for (int i = 5; i < HASH_MEMORY - 1; i++)
+ {
+ uint32_t randmax = i * 32 - 4;
+ uint32_t randseed[16];
+ uint32_t randbuffer[16];
+ uint32_t joint[16];
+ uint8 Buffbuffer[2];
+
+ ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
+ ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
+
+
+ Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32 * i - 128]);
+ Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32 * i - 96]);
+ ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
+
+ ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+ ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
+
+ #pragma unroll
+ for (int j = 0; j < 8; j++)
+ {
+ uint32_t rand = randbuffer[j] % (randmax - 32);
+ joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
+ }
+
+ uint8 truc = sha256_64(joint);
+ ((uint8*)(hashbuffer + shift))[i] = truc;
+ ((uint8*)randseed)[0] = ((uint8*)joint)[0];
+ ((uint8*)randseed)[1] = truc;
+
+
+ ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
+ ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
+
+ for (int j = 0; j < 32; j += 2)
+ {
+ uint32_t rand = randbuffer[j / 2] % randmax;
+
+ (hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
+ (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
+ (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
+ (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
+ }
+
+ } // main loop
+
+ uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]);
+
+ if (outbuf <= pTarget[7]) {
+ nonceVector[0] = nonce;
+ }
+
+ }
+}
+
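+// store the pre-allocated scratchpad pointer in the device symbol and allocate the result nonce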
+void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t* hash)
+{
+ cudaMemcpyToSymbol(hashbuffer, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
+ cudaMalloc(&d_PlNonce[thr_id], sizeof(uint32_t));
+}
+
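+// launch both passes with an SM-dependent block size and return the found nonce
+// (0xffffffff when none was found)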
+__host__
+uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order)
+{
+ uint32_t result[MAX_GPUS] = { 0xffffffff };
+ cudaMemset(d_PlNonce[thr_id], 0xffffffff, sizeof(uint32_t));
+
+ const uint32_t threadsperblock = 128;
+
+ dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+ dim3 block(threadsperblock);
+ dim3 grid50((threads + 256 - 1) / 256);
+ dim3 block50(256);
+
+ if (device_sm[device_map[thr_id]] >= 500) {
+ pluck_gpu_hash0_v50 <<< grid50, block50 >>>(threads, startNounce);
+ pluck_gpu_hash_v50 <<< grid50, block50 >>>(threads, startNounce, d_PlNonce[thr_id]);
+ } else {
+ pluck_gpu_hash0 <<< grid, block >>>(threads, startNounce);
+ pluck_gpu_hash <<< grid, block >>>(threads, startNounce, d_PlNonce[thr_id]);
+ }
+
+ MyStreamSynchronize(NULL, order, thr_id);
+ cudaMemcpy(&result[thr_id], d_PlNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+ return result[thr_id];
+}
+
+__host__
+void pluck_setBlockTarget(const void *pdata, const void *ptarget)
+{
+ unsigned char PaddedMessage[80];
+ memcpy(PaddedMessage, pdata, 80);
+
+ cudaMemcpyToSymbol(c_data, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(pTarget, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+}
diff --git a/pluck/pluck.cu b/pluck/pluck.cu
new file mode 100644
index 0000000..813a214
--- /dev/null
+++ b/pluck/pluck.cu
@@ -0,0 +1,270 @@
+/* Based on djm code */
+
+extern "C" {
+#include "miner.h"
+}
+
+#include <cuda_runtime.h>
+
+static uint32_t *d_hash[MAX_GPUS];
+
+extern void pluck_setBlockTarget(const void* data, const void *ptarget);
+extern void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t *d_outputHash);
+extern uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order);
+
+extern float tp_coef[MAX_GPUS];
+
+#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
+//note, this is 64 bytes
+static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
+{
+#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
+ uint32_t x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11, x12, x13, x14, x15;
+ int i;
+
+ x00 = (B[0] ^= Bx[0]);
+ x01 = (B[1] ^= Bx[1]);
+ x02 = (B[2] ^= Bx[2]);
+ x03 = (B[3] ^= Bx[3]);
+ x04 = (B[4] ^= Bx[4]);
+ x05 = (B[5] ^= Bx[5]);
+ x06 = (B[6] ^= Bx[6]);
+ x07 = (B[7] ^= Bx[7]);
+ x08 = (B[8] ^= Bx[8]);
+ x09 = (B[9] ^= Bx[9]);
+ x10 = (B[10] ^= Bx[10]);
+ x11 = (B[11] ^= Bx[11]);
+ x12 = (B[12] ^= Bx[12]);
+ x13 = (B[13] ^= Bx[13]);
+ x14 = (B[14] ^= Bx[14]);
+ x15 = (B[15] ^= Bx[15]);
+ for (i = 0; i < 8; i += 2) {
+ /* Operate on columns. */
+ x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7);
+ x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7);
+
+ x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9);
+ x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9);
+
+ x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13);
+ x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13);
+
+ x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18);
+ x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18);
+
+ /* Operate on rows. */
+ x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7);
+ x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7);
+
+ x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9);
+ x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9);
+
+ x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13);
+ x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13);
+
+ x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18);
+ x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18);
+ }
+ B[0] += x00;
+ B[1] += x01;
+ B[2] += x02;
+ B[3] += x03;
+ B[4] += x04;
+ B[5] += x05;
+ B[6] += x06;
+ B[7] += x07;
+ B[8] += x08;
+ B[9] += x09;
+ B[10] += x10;
+ B[11] += x11;
+ B[12] += x12;
+ B[13] += x13;
+ B[14] += x14;
+ B[15] += x15;
+#undef ROTL
+}
+
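+// host-side SHA-256 over an arbitrary-length buffer, built on sha256_init()/sha256_transform()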
+static void sha256_hash(unsigned char *hash, const unsigned char *data, int len)
+{
+ uint32_t S[16], T[16];
+ int i, r;
+
+ sha256_init(S);
+ for (r = len; r > -9; r -= 64) {
+ if (r < 64)
+ memset(T, 0, 64);
+ memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
+ if (r >= 0 && r < 64)
+ ((unsigned char *)T)[r] = 0x80;
+ for (i = 0; i < 16; i++)
+ T[i] = be32dec(T + i);
+
+ if (r < 56)
+ T[15] = 8 * len;
+ sha256_transform(S, T, 0);
+ }
+ for (i = 0; i < 8; i++)
+ be32enc((uint32_t *)hash + i, S[i]);
+}
+
+static void sha256_hash512(unsigned char *hash, const unsigned char *data)
+{
+ uint32_t S[16], T[16];
+ int i;
+
+ sha256_init(S);
+
+ memcpy(T, data, 64);
+ for (i = 0; i < 16; i++)
+ T[i] = be32dec(T + i);
+ sha256_transform(S, T, 0);
+
+ memset(T, 0, 64);
+ //memcpy(T, data + 64, 0);
+ ((unsigned char *)T)[0] = 0x80;
+ for (i = 0; i < 16; i++)
+ T[i] = be32dec(T + i);
+ T[15] = 8 * 64;
+ sha256_transform(S, T, 0);
+
+ for (i = 0; i < 8; i++)
+ be32enc((uint32_t *)hash + i, S[i]);
+}
+
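+// CPU reference implementation of the pluck hash over a 128 KiB scratchpad;
+// kept for result validation, which is currently commented out in scanhash_pluck()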
+void pluckhash(uint32_t *hash, uint32_t *input)
+{
+
+ uint32_t data[20];
+ //uint32_t midstate[8];
+
+ const int HASH_MEMORY = 128 * 1024;
+ uint8_t * scratchbuf = (uint8_t*)malloc(HASH_MEMORY);
+
+ for (int k = 0; k<20; k++) { data[k] = input[k]; }
+
+ uint8_t *hashbuffer = scratchbuf; //don't allocate this on stack, since it's huge..
+ int size = HASH_MEMORY;
+ memset(hashbuffer, 0, 64);
+
+ sha256_hash(&hashbuffer[0], (uint8_t*)data, 80);
+ for (int i = 64; i < size - 32; i += 32)
+ {
+ //i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area
+ int randmax = i - 4; //we could use size here, but then it's probable to use 0 as the value in most cases
+ uint32_t joint[16];
+ uint32_t randbuffer[16];
+
+ uint32_t randseed[16];
+ memcpy(randseed, &hashbuffer[i - 64], 64);
+ if (i>128)
+ {
+ memcpy(randbuffer, &hashbuffer[i - 128], 64);
+ }
+ else
+ {
+ memset(&randbuffer, 0, 64);
+ }
+
+ xor_salsa8(randbuffer, randseed);
+
+ memcpy(joint, &hashbuffer[i - 32], 32);
+ //use the last hash value as the seed
+ for (int j = 32; j < 64; j += 4)
+ {
+ uint32_t rand = randbuffer[(j - 32) / 4] % (randmax - 32); //randmax - 32 as otherwise we go beyond memory that's already been written to
+ joint[j / 4] = *((uint32_t*)&hashbuffer[rand]);
+ }
+ sha256_hash512(&hashbuffer[i], (uint8_t*)joint);
+// for (int k = 0; k<8; k++) { printf("sha hashbuffer %d %08x\n", k, ((uint32_t*)(hashbuffer+i))[k]); }
+ memcpy(randseed, &hashbuffer[i - 32], 64); //use last hash value and previous hash value(post-mixing)
+ if (i>128)
+ {
+ memcpy(randbuffer, &hashbuffer[i - 128], 64);
+ }
+ else
+ {
+ memset(randbuffer, 0, 64);
+ }
+ xor_salsa8(randbuffer, randseed);
+ for (int j = 0; j < 32; j += 2)
+ {
+ uint32_t rand = randbuffer[j / 2] % randmax;
+ *((uint32_t*)&hashbuffer[rand]) = *((uint32_t*)&hashbuffer[j + i - 4]);
+ }
+ }
+
+// for (int k = 0; k<8; k++) { printf("cpu final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }
+
+ //note: off-by-one error is likely here...
+/*
+ for (int i = size - 64 - 1; i >= 64; i -= 64)
+ {
+ sha256_hash512(&hashbuffer[i - 64], &hashbuffer[i]);
+ }
+
+ for (int k = 0; k<8; k++) { printf("cpu after of by one final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }
+*/
+ memcpy((unsigned char*)hash, hashbuffer, 32);
+ free(scratchbuf);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+ uint32_t max_nonce, unsigned long *hashes_done)
+{
+ const uint32_t first_nonce = pdata[19];
+ uint32_t endiandata[20];
+
+ int intensity = 18; /* beware: values > 20 might work but can cause difficulty problems later */
+ uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
+ // divide by 128 for this algo which requires a lot of memory
+ throughput = throughput / 128 - 256;
+ throughput = min(throughput, max_nonce - first_nonce + 1);
+
+ if (opt_benchmark)
+ ((uint32_t*)ptarget)[7] = 0x0000ff;
+
+ if (!init[thr_id])
+ {
+ cudaSetDevice(device_map[thr_id]);
+ //cudaDeviceReset();
+ //cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+ //cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+
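+ // 128 KiB of scratchpad per thread (32 * 1024 * sizeof(uint32_t)), matching SHIFT in cuda_pluck.cu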
+ cudaMalloc(&d_hash[thr_id], 32 * 1024 * sizeof(uint32_t) * throughput);
+
+ pluck_cpu_init(thr_id, throughput, d_hash[thr_id]);
+ init[thr_id] = true;
+ }
+
+
+ for (int k = 0; k < 20; k++)
+ be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+
+ pluck_setBlockTarget(endiandata,ptarget);
+
+ do {
+ uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0);
+ if (foundNonce != UINT32_MAX)
+ {
+// const uint32_t Htarg = ptarget[7];
+// uint32_t vhash64[8];
+// be32enc(&endiandata[19], foundNonce);
+// pluckhash(vhash64,endiandata);
+// printf("target %08x vhash64 %08x", ptarget[7], vhash64[7]);
+// if (vhash64[7] <= Htarg) { // && fulltest(vhash64, ptarget)) {
+ *hashes_done = pdata[19] - first_nonce + throughput;
+ pdata[19] = foundNonce;
+ return 1;
+// } else {
+// applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
+// }
+ }
+
+ pdata[19] += throughput;
+
+ } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+ *hashes_done = pdata[19] - first_nonce;
+ return 0;
+}