From 7939dce0aab507eccd18541b3285adca15a438fa Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 8 Mar 2015 14:51:40 +0100 Subject: [PATCH] pluck: adaptation from djm repo remains the cpu validation check to do... throughput for this algo is divided by 128 to keep same kind of intensity values (default 18.0) --- Makefile.am | 1 + ccminer.cpp | 16 +- ccminer.vcxproj | 3 + ccminer.vcxproj.filters | 11 +- cuda_helper.h | 45 +++- cuda_vector.h | 245 +++++++++++++++++ miner.h | 4 + pluck/cuda_pluck.cu | 574 ++++++++++++++++++++++++++++++++++++++++ pluck/pluck.cu | 270 +++++++++++++++++++ 9 files changed, 1164 insertions(+), 5 deletions(-) create mode 100644 cuda_vector.h create mode 100644 pluck/cuda_pluck.cu create mode 100644 pluck/pluck.cu diff --git a/Makefile.am b/Makefile.am index 5483ac8..21641f9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -46,6 +46,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \ + pluck/pluck.cu pluck/cuda_pluck.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/doom.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ diff --git a/ccminer.cpp b/ccminer.cpp index 784b85c..daf472a 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -97,6 +97,7 @@ enum sha_algos { ALGO_MYR_GR, ALGO_NIST5, ALGO_PENTABLAKE, + ALGO_PLUCK, ALGO_QUARK, ALGO_QUBIT, ALGO_S3, @@ -128,6 +129,7 @@ static const char *algo_names[] = { "myr-gr", "nist5", "penta", + "pluck", "quark", "qubit", "s3", @@ -237,6 +239,7 @@ Options:\n\ myr-gr Myriad-Groestl\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ + pluck SupCoin\n\ quark Quark\n\ qubit Qubit\n\ s3 S3 (1Coin)\n\ @@ -1059,6 +1062,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) switch (opt_algo) { case ALGO_JACKPOT: + case ALGO_PLUCK: diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); break; case ALGO_DMD_GR: @@ -1272,6 +1276,9 @@ static void *miner_thread(void *userdata) case ALGO_LYRA2: minmax = 0x100000; break; + case ALGO_PLUCK: + minmax = 0x2000; + break; } max64 = max(minmax-1, max64); } @@ -1397,6 +1404,11 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_PLUCK: + rc = scanhash_pluck(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_S3: rc = scanhash_s3(thr_id, work.data, work.target, max_nonce, &hashes_done); @@ -2237,8 +2249,8 @@ int main(int argc, char *argv[]) printf(" Built with the nVidia CUDA SDK 6.5\n\n"); #endif printf(" Originally based on cudaminer by Christian Buchner and Christian H.,\n"); - printf(" Include some of djm34 additions and sp optimisations\n"); - printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n"); + printf(" Include some work of djm34, sp, tsiv and klausT\n\n"); + printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n"); rpc_user = strdup(""); rpc_pass = strdup(""); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 0d6f3ad..a49d90e 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -313,6 +313,7 @@ + @@ -416,6 +417,8 @@ -Xptxas "-abi=yes" %(AdditionalOptions) -Xptxas "-abi=yes" %(AdditionalOptions) + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index aafa68d..1625fa3 100644 --- a/ccminer.vcxproj.filters +++ 
b/ccminer.vcxproj.filters @@ -308,6 +308,9 @@ Header Files\CUDA + + Header Files\CUDA + Header Files\sph @@ -394,6 +397,12 @@ Source Files\CUDA\JHA + + Source Files\CUDA + + + Source Files\CUDA + Source Files\CUDA\quark @@ -573,4 +582,4 @@ Ressources - + diff --git a/cuda_helper.h b/cuda_helper.h index 892ae0a..6b1ce24 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -57,11 +57,13 @@ extern const uint3 threadIdx; #endif #if __CUDA_ARCH__ < 320 -// Kepler (Compute 3.0) +// Host and Compute 3.0 #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #else -// Kepler (Compute 3.5, 5.0) +// Compute 3.2+ #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) +#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) #endif __device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI) @@ -285,6 +287,45 @@ uint64_t shl_t64(uint64_t x, uint32_t n) return result; } +// device asm 32 for pluck +__device__ __forceinline__ +uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; + asm("{ .reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); + return result; +} + +__device__ __forceinline__ +uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; + asm("{ .reg .u32 t1;\n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;\n\t" + "}" + : "=r"(result) : "r"(a) ,"r"(b),"r"(c)); + return result; +} + +__device__ __forceinline__ +uint32_t shr_t32(uint32_t x,uint32_t n) { + uint32_t result; + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + +__device__ __forceinline__ +uint32_t shl_t32(uint32_t x,uint32_t n) { + uint32_t result; + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + #ifndef USE_ROT_ASM_OPT #define USE_ROT_ASM_OPT 1 #endif diff --git a/cuda_vector.h b/cuda_vector.h new file mode 100644 index 0000000..683b893 --- /dev/null +++ b/cuda_vector.h @@ -0,0 +1,245 @@ +#ifndef CUDA_VECTOR_H +#define CUDA_VECTOR_H + + +/////////////////////////////////////////////////////////////////////////////////// +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +//typedef __device_builtin__ struct ulong16 ulong16; + +typedef struct __align__(32) uint8 +{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) uint16 +{ + union { + struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;}; + uint8 lo; + }; + union { + struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;}; + uint8 hi; + }; +} uint16; + + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo=a; t.hi=b; return t; +} + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, 
unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + + +static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } + + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; } +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; } + + +static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + + +static __forceinline__ 
__device__ uint32_t rotateR(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + + +static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr) +{ + uint8 test; + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr)); + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr) +{ + uint32_t test; + asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +static __device__ __inline__ uint32_t __ldgtoint64(const uint8_t *ptr) +{ + uint64_t test; + asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr) +{ + uint32_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 %0,{a,b,c,d}; }\n\t" + : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr) +{ + uint64_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d,e,f,g,h; \n\t" + ".reg .u32 i,j; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "ld.global.nc.u8 e,[%1+4]; \n\t" + "ld.global.nc.u8 f,[%1+5]; \n\t" + "ld.global.nc.u8 g,[%1+6]; \n\t" + "ld.global.nc.u8 h,[%1+7]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b32 j,{e,f,g,h}; \n\t" + "mov.b64 %0,{i,j}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr) +{ + uint32_t zero = 0; + uint64_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d; \n\t" + ".reg .u32 i; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b64 %0,{i,%1}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr), "r"(zero)); + return (test); +} + + + +static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr) +{ + uint32_t test; + asm("{\n\t" + ".reg .u8 e,b,c,d; \n\t" + "ld.global.nc.u8 e,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 %0,{e,b,c,d}; }\n\t" + : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + + + + +static __forceinline__ __device__ uint8 swapvec(const uint8 *buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 *buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + vec.s8 = 
cuda_swab32(buf[0].s8); + vec.s9 = cuda_swab32(buf[0].s9); + vec.sa = cuda_swab32(buf[0].sa); + vec.sb = cuda_swab32(buf[0].sb); + vec.sc = cuda_swab32(buf[0].sc); + vec.sd = cuda_swab32(buf[0].sd); + vec.se = cuda_swab32(buf[0].se); + vec.sf = cuda_swab32(buf[0].sf); + return vec; +} +#endif // #ifndef CUDA_VECTOR_H diff --git a/miner.h b/miner.h index 2a2e297..f3f51e4 100644 --- a/miner.h +++ b/miner.h @@ -334,6 +334,10 @@ extern int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_pluck(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + extern int scanhash_qubit(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); diff --git a/pluck/cuda_pluck.cu b/pluck/cuda_pluck.cu new file mode 100644 index 0000000..0bbcbba --- /dev/null +++ b/pluck/cuda_pluck.cu @@ -0,0 +1,574 @@ +/* + * "pluck" kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2015 djm34 + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author djm34 + * @author tpruvot + */ + +#include +#include +#include + +#include "cuda_helper.h" +#include "cuda_vector.h" + +uint32_t *d_PlNonce[MAX_GPUS]; + +__device__ uint8_t * hashbuffer; +__constant__ uint32_t pTarget[8]; +__constant__ uint32_t c_data[20]; + +#define HASH_MEMORY_8bit 131072 +#define HASH_MEMORY_32bit 32768 +#define HASH_MEMORY 4096 + +static __constant__ uint32_t H256[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, + 0xA54FF53A, 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + +static __constant__ uint32_t Ksha[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, + 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, + 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, + 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, + 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, + 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, + 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, + 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, + 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + + +#define SALSA(a,b,c,d) { \ + t = a+d; b^=rotate(t, 7); \ + t = b+a; c^=rotate(t, 9); \ + t = c+b; d^=rotate(t, 13); \ + t = d+c; a^=rotate(t, 18); \ +} + +#define SALSA_CORE(state) { \ + SALSA(state.s0,state.s4,state.s8,state.sc); \ + SALSA(state.s5,state.s9,state.sd,state.s1); \ + SALSA(state.sa,state.se,state.s2,state.s6); \ + SALSA(state.sf,state.s3,state.s7,state.sb); \ + SALSA(state.s0,state.s1,state.s2,state.s3); \ + SALSA(state.s5,state.s6,state.s7,state.s4); \ + SALSA(state.sa,state.sb,state.s8,state.s9); \ + SALSA(state.sf,state.sc,state.sd,state.se); \ +} + +static __device__ __forceinline__ uint16 xor_salsa8(const uint16 &Bx) +{ + uint32_t t; + uint16 state = Bx; + SALSA_CORE(state); + SALSA_CORE(state); + SALSA_CORE(state); + SALSA_CORE(state); + return(state+Bx); +} + + +// sha256 + +static __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) +{ + uint32_t r1 = ROTR32(x, 2); + uint32_t r2 = ROTR32(x, 13); + uint32_t r3 = ROTR32(x, 22); + return xor3b(r1, r2, r3); +} + +static __device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) +{ + uint32_t r1 = ROTR32(x, 6); + uint32_t r2 = ROTR32(x, 11); + uint32_t r3 = ROTR32(x, 25); + return xor3b(r1, r2, r3); +} + +static __device__ __forceinline__ uint32_t ssg2_0(const uint32_t x) +{ + uint64_t r1 = ROTR32(x, 7); + uint64_t r2 = ROTR32(x, 18); + uint64_t r3 = shr_t32(x, 3); + return xor3b(r1, r2, r3); +} + +static __device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) +{ + uint64_t r1 = ROTR32(x, 17); + uint64_t r2 = ROTR32(x, 19); + uint64_t r3 = shr_t32(x, 10); + return xor3b(r1, r2, r3); +} + +static __device__ __forceinline__ void sha2_step1(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e, + const uint32_t f, const uint32_t g, uint32_t &h, const uint32_t in, const uint32_t Kshared) +{ + uint32_t t1, t2; + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a, b, c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +static __device__ 
__forceinline__ void sha2_step2(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e, + const uint32_t f, const uint32_t g, uint32_t &h, uint32_t* in, const uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1, t2; + + int pcidx1 = (pc - 2) & 0xF; + int pcidx2 = (pc - 7) & 0xF; + int pcidx3 = (pc - 15) & 0xF; + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ssg2_1(inx1); + uint32_t ssg20 = ssg2_0(inx3); + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a, b, c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +static __device__ __forceinline__ +void sha2_round_body(uint32_t* in, uint32_t* r) +{ + uint32_t a = r[0]; + uint32_t b = r[1]; + uint32_t c = r[2]; + uint32_t d = r[3]; + uint32_t e = r[4]; + uint32_t f = r[5]; + uint32_t g = r[6]; + uint32_t h = r[7]; + + sha2_step1(a, b, c, d, e, f, g, h, in[0], Ksha[0]); + sha2_step1(h, a, b, c, d, e, f, g, in[1], Ksha[1]); + sha2_step1(g, h, a, b, c, d, e, f, in[2], Ksha[2]); + sha2_step1(f, g, h, a, b, c, d, e, in[3], Ksha[3]); + sha2_step1(e, f, g, h, a, b, c, d, in[4], Ksha[4]); + sha2_step1(d, e, f, g, h, a, b, c, in[5], Ksha[5]); + sha2_step1(c, d, e, f, g, h, a, b, in[6], Ksha[6]); + sha2_step1(b, c, d, e, f, g, h, a, in[7], Ksha[7]); + sha2_step1(a, b, c, d, e, f, g, h, in[8], Ksha[8]); + sha2_step1(h, a, b, c, d, e, f, g, in[9], Ksha[9]); + sha2_step1(g, h, a, b, c, d, e, f, in[10], Ksha[10]); + sha2_step1(f, g, h, a, b, c, d, e, in[11], Ksha[11]); + sha2_step1(e, f, g, h, a, b, c, d, in[12], Ksha[12]); + sha2_step1(d, e, f, g, h, a, b, c, in[13], Ksha[13]); + sha2_step1(c, d, e, f, g, h, a, b, in[14], Ksha[14]); + sha2_step1(b, c, d, e, f, g, h, a, in[15], Ksha[15]); + + #pragma unroll 3 + for (int i = 0; i<3; i++) { + + sha2_step2(a, b, c, d, e, f, g, h, in, 0, Ksha[16 + 16 * i]); + sha2_step2(h, a, b, c, d, e, f, g, in, 1, Ksha[17 + 16 * i]); + sha2_step2(g, h, a, b, c, d, e, f, in, 2, Ksha[18 + 16 * i]); + sha2_step2(f, g, h, a, b, c, d, e, in, 3, Ksha[19 + 16 * i]); + sha2_step2(e, f, g, h, a, b, c, d, in, 4, Ksha[20 + 16 * i]); + sha2_step2(d, e, f, g, h, a, b, c, in, 5, Ksha[21 + 16 * i]); + sha2_step2(c, d, e, f, g, h, a, b, in, 6, Ksha[22 + 16 * i]); + sha2_step2(b, c, d, e, f, g, h, a, in, 7, Ksha[23 + 16 * i]); + sha2_step2(a, b, c, d, e, f, g, h, in, 8, Ksha[24 + 16 * i]); + sha2_step2(h, a, b, c, d, e, f, g, in, 9, Ksha[25 + 16 * i]); + sha2_step2(g, h, a, b, c, d, e, f, in, 10, Ksha[26 + 16 * i]); + sha2_step2(f, g, h, a, b, c, d, e, in, 11, Ksha[27 + 16 * i]); + sha2_step2(e, f, g, h, a, b, c, d, in, 12, Ksha[28 + 16 * i]); + sha2_step2(d, e, f, g, h, a, b, c, in, 13, Ksha[29 + 16 * i]); + sha2_step2(c, d, e, f, g, h, a, b, in, 14, Ksha[30 + 16 * i]); + sha2_step2(b, c, d, e, f, g, h, a, in, 15, Ksha[31 + 16 * i]); + + } + + r[0] += a; + r[1] += b; + r[2] += c; + r[3] += d; + r[4] += e; + r[5] += f; + r[6] += g; + r[7] += h; +} + + +static __device__ __forceinline__ uint8 sha256_64(uint32_t *data) +{ + uint32_t __align__(64) in[16]; + uint32_t __align__(32) buf[8]; + + ((uint16 *)in)[0] = swapvec((uint16*)data); + + ((uint8*)buf)[0] = ((uint8*)H256)[0]; + + sha2_round_body(in, buf); + + #pragma unroll 14 + for (int i = 0; i<14; i++) { in[i + 1] = 0; } + + in[0] = 0x80000000; + in[15] = 0x200; + + sha2_round_body(in, buf); + return 
swapvec((uint8*)buf); +} + +static __device__ __forceinline__ uint8 sha256_80(uint32_t nonce) +{ +// uint32_t in[16], buf[8]; + uint32_t __align__(64) in[16]; + uint32_t __align__(32) buf[8]; + + ((uint16 *)in)[0] = swapvec((uint16*)c_data); + ((uint8*)buf)[0] = ((uint8*)H256)[0]; + + sha2_round_body(in, buf); + + #pragma unroll 3 + for (int i = 0; i<3; i++) { in[i] = cuda_swab32(c_data[i + 16]); } + +// in[3] = cuda_swab32(nonce); + in[3] = nonce; + in[4] = 0x80000000; + in[15] = 0x280; + + #pragma unroll + for (int i = 5; i<15; i++) { in[i] = 0; } + + sha2_round_body(in, buf); + return swapvec((uint8*)buf); +} + +#define SHIFT 32 * 1024 * 4 + +__global__ __launch_bounds__(256, 1) +void pluck_gpu_hash0_v50(uint32_t threads, uint32_t startNonce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t shift = SHIFT * thread; + ((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce); + ((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0); + for (int i = 2; i < 5; i++) + { + uint32_t randmax = i * 32 - 4; + uint32_t randseed[16]; + uint32_t randbuffer[16]; + uint32_t joint[16]; + uint8 Buffbuffer[2]; + + ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]); + ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]); + + ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); + +// ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]); + ((uint8*)joint)[0] = ((uint8*)randseed)[1]; + + #pragma unroll + for (int j = 0; j < 8; j++) { + uint32_t rand = randbuffer[j] % (randmax - 32); + joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]); + } + + uint8 truc = sha256_64(joint); + ((uint8*)(hashbuffer + shift))[i] = truc; + ((uint8*)randseed)[0] = ((uint8*)joint)[0]; + ((uint8*)randseed)[1] = truc; + + ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); + + for (int j = 0; j < 32; j += 2) + { + uint32_t rand = randbuffer[j / 2] % randmax; + (hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]); + (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]); + (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]); + (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]); + } + + } // main loop + } +} + +__global__ __launch_bounds__(256, 1) +void pluck_gpu_hash_v50(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector) +{ + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t shift = SHIFT * thread; + + for (int i = 5; i < HASH_MEMORY - 1; i++) + { + uint32_t randmax = i*32-4; + uint32_t randseed[16]; + uint32_t randbuffer[16]; + uint32_t joint[16]; + uint8 Buffbuffer[2]; + + ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32*i-64]); + ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32*i-32]); + + + Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32*i - 128]); + Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32*i - 96]); + + ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0]; + ((uint16*)randbuffer)[0]= xor_salsa8(((uint16*)randseed)[0]); + ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i-1)<<5]); + + #pragma unroll + for (int j = 0; j < 8; j++) { + uint32_t rand = randbuffer[j] % (randmax - 32); + joint[j+8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]); + } + + uint8 truc = sha256_64(joint); + ((uint8*)(hashbuffer + shift))[i] = 
truc; + ((uint8*)randseed)[0] = ((uint8*)joint)[0]; + ((uint8*)randseed)[1] = truc; + + ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0]; + ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); + + for (int j = 0; j < 32; j += 2) + { + uint32_t rand = randbuffer[j / 2] % randmax; + + (hashbuffer+shift)[rand] = __ldg(&(hashbuffer+shift)[randmax+j]); + (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]); + (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]); + (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]); + } + + } // main loop + + uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]); + + if (outbuf <= pTarget[7]) { + nonceVector[0] = nonce; + } + + } +} + +__global__ __launch_bounds__(128, 3) +void pluck_gpu_hash0(uint32_t threads, uint32_t startNonce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t shift = SHIFT * thread; + ((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce); + ((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0); + for (int i = 2; i < 5; i++) + { + uint32_t randmax = i * 32 - 4; + uint32_t randseed[16]; + uint32_t randbuffer[16]; + uint32_t joint[16]; + uint8 Buffbuffer[2]; + + ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]); + ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]); + + ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); + +// ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]); + ((uint8*)joint)[0] = ((uint8*)randseed)[1]; + + #pragma unroll + for (int j = 0; j < 8; j++) { + uint32_t rand = randbuffer[j] % (randmax - 32); + joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]); + } + + uint8 truc = sha256_64(joint); + ((uint8*)(hashbuffer + shift))[i] = truc; + ((uint8*)randseed)[0] = ((uint8*)joint)[0]; + ((uint8*)randseed)[1] = truc; + + ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); + + for (int j = 0; j < 32; j += 2) + { + uint32_t rand = randbuffer[j / 2] % randmax; + (hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]); + (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]); + (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]); + (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]); + } + + } // main loop + + } +} + +__global__ __launch_bounds__(128, 3) +void pluck_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t shift = SHIFT * thread; + + for (int i = 5; i < HASH_MEMORY - 1; i++) + { + uint32_t randmax = i * 32 - 4; + uint32_t randseed[16]; + uint32_t randbuffer[16]; + uint32_t joint[16]; + uint8 Buffbuffer[2]; + + ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]); + ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]); + + + Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32 * i - 128]); + Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32 * i - 96]); + ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0]; + + ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); + + ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]); + + #pragma unroll + for (int j = 0; j < 8; j++) + { + uint32_t rand = randbuffer[j] % (randmax - 32); + joint[j + 8] 
= __ldgtoint_unaligned(&(hashbuffer + shift)[rand]); + } + + uint8 truc = sha256_64(joint); + ((uint8*)(hashbuffer + shift))[i] = truc; + ((uint8*)randseed)[0] = ((uint8*)joint)[0]; + ((uint8*)randseed)[1] = truc; + + + ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0]; + ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); + + for (int j = 0; j < 32; j += 2) + { + uint32_t rand = randbuffer[j / 2] % randmax; + + (hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]); + (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]); + (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]); + (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]); + } + + } // main loop + + uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]); + + if (outbuf <= pTarget[7]) { + nonceVector[0] = nonce; + } + + } +} + +void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t* hash) +{ + cudaMemcpyToSymbol(hashbuffer, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice); + cudaMalloc(&d_PlNonce[thr_id], sizeof(uint32_t)); +} + +__host__ +uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order) +{ + uint32_t result[8] = {0xffffffff}; + cudaMemset(d_PlNonce[thr_id], 0xffffffff, sizeof(uint32_t)); + + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + dim3 grid50((threads + 256 - 1) / 256); + dim3 block50(256); + + if (device_sm[device_map[thr_id]] >= 500) { + pluck_gpu_hash0_v50 <<< grid50, block50 >>>(threads, startNounce); + pluck_gpu_hash_v50 <<< grid50, block50 >>>(threads, startNounce, d_PlNonce[thr_id]); + } else { + pluck_gpu_hash0 <<< grid, block >>>(threads, startNounce); + pluck_gpu_hash <<< grid, block >>>(threads, startNounce, d_PlNonce[thr_id]); + } + + MyStreamSynchronize(NULL, order, thr_id); + cudaMemcpy(&result[thr_id], d_PlNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + return result[thr_id]; +} + +__host__ +void pluck_setBlockTarget(const void *pdata, const void *ptarget) +{ + unsigned char PaddedMessage[80]; + memcpy(PaddedMessage, pdata, 80); + + cudaMemcpyToSymbol(c_data, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(pTarget, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); +} diff --git a/pluck/pluck.cu b/pluck/pluck.cu new file mode 100644 index 0000000..813a214 --- /dev/null +++ b/pluck/pluck.cu @@ -0,0 +1,270 @@ +/* Based on djm code */ + +extern "C" { +#include "miner.h" +} + +#include + +static uint32_t *d_hash[MAX_GPUS] ; + +extern void pluck_setBlockTarget(const void* data, const void *ptarget); +extern void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t *d_outputHash); +extern uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order); + +extern float tp_coef[MAX_GPUS]; + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) +//note, this is 64 bytes +static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) +{ +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + uint32_t x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11, x12, x13, x14, x15; + int i; + + x00 = (B[0] ^= Bx[0]); + x01 = (B[1] ^= Bx[1]); + x02 = (B[2] ^= Bx[2]); + x03 = (B[3] ^= Bx[3]); + x04 = (B[4] ^= Bx[4]); + x05 = (B[5] ^= Bx[5]); + x06 = (B[6] ^= Bx[6]); + x07 = (B[7] ^= Bx[7]); + x08 = (B[8] ^= Bx[8]); + x09 = (B[9] ^= Bx[9]); + x10 = (B[10] ^= Bx[10]); + x11 = (B[11] ^= Bx[11]); + 
x12 = (B[12] ^= Bx[12]); + x13 = (B[13] ^= Bx[13]); + x14 = (B[14] ^= Bx[14]); + x15 = (B[15] ^= Bx[15]); + for (i = 0; i < 8; i += 2) { + /* Operate on columns. */ + x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7); + x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7); + + x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9); + x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9); + + x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13); + x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13); + + x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18); + x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18); + + /* Operate on rows. */ + x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7); + x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7); + + x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9); + x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9); + + x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13); + x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13); + + x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18); + x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18); + } + B[0] += x00; + B[1] += x01; + B[2] += x02; + B[3] += x03; + B[4] += x04; + B[5] += x05; + B[6] += x06; + B[7] += x07; + B[8] += x08; + B[9] += x09; + B[10] += x10; + B[11] += x11; + B[12] += x12; + B[13] += x13; + B[14] += x14; + B[15] += x15; +#undef ROTL +} + +static void sha256_hash(unsigned char *hash, const unsigned char *data, int len) +{ + uint32_t S[16], T[16]; + int i, r; + + sha256_init(S); + for (r = len; r > -9; r -= 64) { + if (r < 64) + memset(T, 0, 64); + memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r)); + if (r >= 0 && r < 64) + ((unsigned char *)T)[r] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + + if (r < 56) + T[15] = 8 * len; + sha256_transform(S, T, 0); + } + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, S[i]); +} + +static void sha256_hash512(unsigned char *hash, const unsigned char *data) +{ + uint32_t S[16], T[16]; + int i; + + sha256_init(S); + + memcpy(T, data, 64); + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + sha256_transform(S, T, 0); + + memset(T, 0, 64); + //memcpy(T, data + 64, 0); + ((unsigned char *)T)[0] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + T[15] = 8 * 64; + sha256_transform(S, T, 0); + + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, S[i]); +} + +void pluckhash(uint32_t *hash, uint32_t *input) +{ + + uint32_t data[20]; + //uint32_t midstate[8]; + + const int HASH_MEMORY = 128 * 1024; + uint8_t * scratchbuf = (uint8_t*)malloc(HASH_MEMORY); + + for (int k = 0; k<20; k++) { data[k] = input[k]; } + + uint8_t *hashbuffer = scratchbuf; //don't allocate this on stack, since it's huge.. 
+ int size = HASH_MEMORY; + memset(hashbuffer, 0, 64); + + sha256_hash(&hashbuffer[0], (uint8_t*)data, 80); + for (int i = 64; i < size - 32; i += 32) + { + //i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area + int randmax = i - 4; //we could use size here, but then it's probable to use 0 as the value in most cases + uint32_t joint[16]; + uint32_t randbuffer[16]; + + uint32_t randseed[16]; + memcpy(randseed, &hashbuffer[i - 64], 64); + if (i>128) + { + memcpy(randbuffer, &hashbuffer[i - 128], 64); + } + else + { + memset(&randbuffer, 0, 64); + } + + xor_salsa8(randbuffer, randseed); + + memcpy(joint, &hashbuffer[i - 32], 32); + //use the last hash value as the seed + for (int j = 32; j < 64; j += 4) + { + uint32_t rand = randbuffer[(j - 32) / 4] % (randmax - 32); //randmax - 32 as otherwise we go beyond memory that's already been written to + joint[j / 4] = *((uint32_t*)&hashbuffer[rand]); + } + sha256_hash512(&hashbuffer[i], (uint8_t*)joint); +// for (int k = 0; k<8; k++) { printf("sha hashbuffer %d %08x\n", k, ((uint32_t*)(hashbuffer+i))[k]); } + memcpy(randseed, &hashbuffer[i - 32], 64); //use last hash value and previous hash value(post-mixing) + if (i>128) + { + memcpy(randbuffer, &hashbuffer[i - 128], 64); + } + else + { + memset(randbuffer, 0, 64); + } + xor_salsa8(randbuffer, randseed); + for (int j = 0; j < 32; j += 2) + { + uint32_t rand = randbuffer[j / 2] % randmax; + *((uint32_t*)&hashbuffer[rand]) = *((uint32_t*)&hashbuffer[j + i - 4]); + } + } + +// for (int k = 0; k<8; k++) { printf("cpu final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); } + + //note: off-by-one error is likely here... +/* + for (int i = size - 64 - 1; i >= 64; i -= 64) + { + sha256_hash512(&hashbuffer[i - 64], &hashbuffer[i]); + } + + for (int k = 0; k<8; k++) { printf("cpu after of by one final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); } +*/ + memcpy((unsigned char*)hash, hashbuffer, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + uint32_t endiandata[20]; + + int intensity = 18; /* beware > 20 could work and create diff problems later */ + uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); + // divide by 128 for this algo which require a lot of memory + throughput = throughput / 128 - 256; + throughput = min(throughput, max_nonce - first_nonce + 1); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + //cudaDeviceReset(); + //cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + //cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + + cudaMalloc(&d_hash[thr_id], 32 * 1024 * sizeof(uint32_t) * throughput); + + pluck_cpu_init(thr_id, throughput, d_hash[thr_id]); + init[thr_id] = true; + } + + + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + pluck_setBlockTarget(endiandata,ptarget); + + do { + uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0); + if (foundNonce != UINT32_MAX) + { +// const uint32_t Htarg = ptarget[7]; +// uint32_t vhash64[8]; +// be32enc(&endiandata[19], foundNonce); +// pluckhash(vhash64,endiandata); +// printf("target %08x vhash64 %08x", ptarget[7], vhash64[7]); +// if (vhash64[7] <= Htarg) { // && fulltest(vhash64, ptarget)) { + *hashes_done = pdata[19] - first_nonce + throughput; + 
pdata[19] = foundNonce; + return 1; +// } else { +// applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); +// } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +}
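
For reference, the commit note about dividing the throughput by 128 works out as follows at the default intensity of 18. This is a minimal host-side sketch of the sizing arithmetic in scanhash_pluck() and pluck_cpu_init() above, assuming device_intensity() returns a plain 1 << intensity (no user override) and ignoring the clamp against max_nonce:

// Standalone sketch (not part of the patch): launch sizing of the pluck algo.
#include <cstdint>
#include <cstdio>

int main()
{
    const int intensity = 18;                             // default in scanhash_pluck()
    uint32_t throughput = (1u << intensity) / 128 - 256;  // 262144/128 - 256 = 1792 nonces per launch

    // scanhash_pluck() allocates 32 * 1024 * sizeof(uint32_t) bytes of scratch per
    // thread (the SHIFT constant in cuda_pluck.cu), i.e. 128 KiB for each nonce slot.
    const unsigned long long per_thread = 32ULL * 1024 * sizeof(uint32_t);
    unsigned long long total = per_thread * throughput;

    printf("threads per launch : %u\n", (unsigned)throughput);    // 1792
    printf("scratch per thread : %llu KiB\n", per_thread >> 10);  // 128
    printf("total scratch      : %llu MiB\n", total >> 20);       // 224
    return 0;
}

At those numbers a single GPU instance reserves roughly 224 MiB of device memory for the hash buffer, which is why the raw 1 << 18 value is scaled down here while the -i defaults stay comparable to the other algos.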