From 15293d063feaa049b24c7f079b3d7b95cd1428a5 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 28 Jun 2015 20:45:38 +0200 Subject: [PATCH] remove pluck algo Supcoin seems.... dead and the algo was not supported on all devices --- Makefile.am | 1 - README.txt | 5 +- ccminer.cpp | 13 - ccminer.vcxproj | 2 - ccminer.vcxproj.filters | 6 - cuda_helper.h | 35 --- miner.h | 5 - pluck/cuda_pluck.cu | 573 ---------------------------------------- pluck/pluck.cu | 240 ----------------- util.cpp | 3 - 10 files changed, 4 insertions(+), 879 deletions(-) delete mode 100644 pluck/cuda_pluck.cu delete mode 100644 pluck/pluck.cu diff --git a/Makefile.am b/Makefile.am index b38738b..f5c0a0f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -47,7 +47,6 @@ ccminer_SOURCES = elist.h miner.h compat.h \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \ - pluck/pluck.cu pluck/cuda_pluck.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ diff --git a/README.txt b/README.txt index d748cae..b4d38c7 100644 --- a/README.txt +++ b/README.txt @@ -81,7 +81,6 @@ its command line interface and options. neoscrypt use to mine FeatherCoin nist5 use to mine TalkCoin penta use to mine Joincoin / Pentablake - pluck use to mine Supcoin quark use to mine Quarkcoin qubit use to mine Qubit scrypt use to mine Scrypt coins @@ -221,6 +220,10 @@ features. >>> RELEASE HISTORY <<< + July 2015... + Nvml api power limits + Remove pluck algo + June 23th 2015 v1.6.5 Handle Ziftrcoin PoK solo mining Basic compatibility with CUDA 7.0 (generally slower hashrate) diff --git a/ccminer.cpp b/ccminer.cpp index cca3b50..0bc893b 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -102,7 +102,6 @@ enum sha_algos { ALGO_NEOSCRYPT, ALGO_NIST5, ALGO_PENTABLAKE, - ALGO_PLUCK, ALGO_QUARK, ALGO_QUBIT, ALGO_SCRYPT, @@ -137,7 +136,6 @@ static const char *algo_names[] = { "neoscrypt", "nist5", "penta", - "pluck", "quark", "qubit", "scrypt", @@ -292,7 +290,6 @@ Options:\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ - pluck SupCoin\n\ quark Quark\n\ qubit Qubit\n\ scrypt Scrypt\n\ @@ -595,7 +592,6 @@ static void calc_network_diff(struct work *work) case ALGO_QUARK: diffone = 0xFFFFFF0000000000ull; break; - case ALGO_PLUCK: case ALGO_SCRYPT: case ALGO_SCRYPT_JANE: // cant get the right value on these 3 algos... @@ -1429,7 +1425,6 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) switch (opt_algo) { case ALGO_JACKPOT: case ALGO_NEOSCRYPT: - case ALGO_PLUCK: case ALGO_SCRYPT: case ALGO_SCRYPT_JANE: diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); @@ -1760,9 +1755,6 @@ static void *miner_thread(void *userdata) case ALGO_SCRYPT_JANE: minmax = 0x100000; break; - case ALGO_PLUCK: - minmax = 0x2000; - break; } max64 = max(minmax-1, max64); } @@ -1897,11 +1889,6 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; - case ALGO_PLUCK: - rc = scanhash_pluck(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - case ALGO_SCRYPT: rc = scanhash_scrypt(thr_id, work.data, work.target, NULL, max_nonce, &hashes_done, &tv_start, &tv_end); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 438675d..3475f07 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -421,8 +421,6 @@ -Xptxas "-abi=yes" %(AdditionalOptions) -Xptxas "-abi=yes" %(AdditionalOptions) - - diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index dd19768..e9e47a3 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -433,12 +433,6 @@ Source Files\CUDA\JHA - - Source Files\CUDA - - - Source Files\CUDA - Source Files\CUDA\quark diff --git a/cuda_helper.h b/cuda_helper.h index 3b86e91..f89014e 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -309,7 +309,6 @@ uint64_t shr_t64(uint64_t x, uint32_t n) #endif } -// device asm for ? __device__ __forceinline__ uint64_t shl_t64(uint64_t x, uint32_t n) { @@ -324,40 +323,6 @@ uint64_t shl_t64(uint64_t x, uint32_t n) #endif } -// device asm 32 for pluck -__device__ __forceinline__ -uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) { -#ifdef __CUDA_ARCH__ - uint32_t result; - asm("{ .reg .u32 m,n,o;\n\t" - "and.b32 m, %1, %2;\n\t" - " or.b32 n, %1, %2;\n\t" - "and.b32 o, n, %3;\n\t" - " or.b32 %0, m, o ;\n\t" - "}\n\t" - : "=r"(result) : "r"(a), "r"(b), "r"(c)); - return result; -#else - // unused on host... - return 0; -#endif -} - -__device__ __forceinline__ -uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) { -#ifdef __CUDA_ARCH__ - uint32_t result; - asm("{ .reg .u32 t1;\n\t" - "xor.b32 t1, %2, %3;\n\t" - "xor.b32 %0, %1, t1;\n\t" - "}" - : "=r"(result) : "r"(a) ,"r"(b),"r"(c)); - return result; -#else - return a^b^c; -#endif -} - __device__ __forceinline__ uint32_t shr_t32(uint32_t x,uint32_t n) { #ifdef __CUDA_ARCH__ diff --git a/miner.h b/miner.h index 26edf88..e74a83e 100644 --- a/miner.h +++ b/miner.h @@ -326,10 +326,6 @@ extern int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_pluck(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); - extern int scanhash_qubit(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -781,7 +777,6 @@ void myriadhash(void *state, const void *input); void neoscrypt(uchar *output, const uchar *input, uint32_t profile); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); -void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N); void quarkhash(void *state, const void *input); void qubithash(void *state, const void *input); void scrypthash(void* output, const void* input); diff --git a/pluck/cuda_pluck.cu b/pluck/cuda_pluck.cu deleted file mode 100644 index 5a48b73..0000000 --- a/pluck/cuda_pluck.cu +++ /dev/null @@ -1,573 +0,0 @@ -/* - * "pluck" kernel implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2015 djm34 - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author djm34 - * @author tpruvot - */ - -#include -#include -#include - -#include "cuda_vector.h" - -#include "miner.h" - -uint32_t *d_PlNonce[MAX_GPUS]; - -__device__ uint8_t * hashbuffer; -__constant__ uint32_t pTarget[8]; -__constant__ uint32_t c_data[20]; - -#define HASH_MEMORY_8bit 131072 -#define HASH_MEMORY_32bit 32768 -#define HASH_MEMORY 4096 - -static __constant__ uint32_t H256[8] = { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 -}; - -static __constant__ uint32_t Ksha[64] = { - 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, - 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, - 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, - 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, - 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, - 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, - 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, - 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, - 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, - 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, - 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, - 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, - 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, - 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, - 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, - 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 -}; - - -#define SALSA(a,b,c,d) { \ - t = a+d; b^=rotateL(t, 7); \ - t = b+a; c^=rotateL(t, 9); \ - t = c+b; d^=rotateL(t, 13); \ - t = d+c; a^=rotateL(t, 18); \ -} - -#define SALSA_CORE(state) { \ - SALSA(state.s0,state.s4,state.s8,state.sc); \ - SALSA(state.s5,state.s9,state.sd,state.s1); \ - SALSA(state.sa,state.se,state.s2,state.s6); \ - SALSA(state.sf,state.s3,state.s7,state.sb); \ - SALSA(state.s0,state.s1,state.s2,state.s3); \ - SALSA(state.s5,state.s6,state.s7,state.s4); \ - SALSA(state.sa,state.sb,state.s8,state.s9); \ - SALSA(state.sf,state.sc,state.sd,state.se); \ -} - -#if __CUDA_ARCH__ >= 320 -static __device__ __forceinline__ uint16 xor_salsa8(const uint16 &Bx) -{ - uint32_t t; - uint16 state = Bx; - SALSA_CORE(state); - SALSA_CORE(state); - SALSA_CORE(state); - SALSA_CORE(state); - return(state+Bx); -} -#endif - -// sha256 - -static __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) -{ - uint32_t r1 = ROTR32(x, 2); - uint32_t r2 = ROTR32(x, 13); - uint32_t r3 = ROTR32(x, 22); - return xor3b(r1, r2, r3); -} - -static __device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) -{ - uint32_t r1 = ROTR32(x, 6); - uint32_t r2 = ROTR32(x, 11); - uint32_t r3 = ROTR32(x, 25); - return xor3b(r1, r2, r3); -} - -static __device__ __forceinline__ uint32_t ssg2_0(const uint32_t x) -{ - uint64_t r1 = ROTR32(x, 7); - uint64_t r2 = ROTR32(x, 18); - uint64_t r3 = shr_t32(x, 3); - return xor3b(r1, r2, r3); -} - -static __device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) -{ - uint64_t r1 = ROTR32(x, 17); - uint64_t r2 = ROTR32(x, 19); - uint64_t r3 = shr_t32(x, 10); - return xor3b(r1, r2, r3); -} - -static __device__ __forceinline__ void sha2_step1(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e, - const uint32_t f, const uint32_t g, uint32_t &h, const uint32_t in, const uint32_t Kshared) -{ - uint32_t t1, t2; - uint32_t vxandx = xandx(e, f, g); - uint32_t bsg21 = bsg2_1(e); - uint32_t bsg20 = bsg2_0(a); - uint32_t andorv = andor32(a, b, c); - - t1 = h + bsg21 + vxandx + Kshared + in; - t2 = bsg20 + andorv; - d = d + t1; - h = t1 + t2; -} - -static __device__ __forceinline__ void sha2_step2(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e, - const uint32_t f, const uint32_t g, uint32_t &h, uint32_t* in, const uint32_t pc, const uint32_t Kshared) -{ - uint32_t t1, t2; - - int pcidx1 = (pc - 2) & 0xF; - int pcidx2 = (pc - 7) & 0xF; - int pcidx3 = (pc - 15) & 0xF; - uint32_t inx0 = in[pc]; - uint32_t inx1 = in[pcidx1]; - uint32_t inx2 = in[pcidx2]; - uint32_t inx3 = in[pcidx3]; - - uint32_t ssg21 = ssg2_1(inx1); - uint32_t ssg20 = ssg2_0(inx3); - uint32_t vxandx = xandx(e, f, g); - uint32_t bsg21 = bsg2_1(e); - uint32_t bsg20 = bsg2_0(a); - uint32_t andorv = andor32(a, b, c); - - in[pc] = ssg21 + inx2 + ssg20 + inx0; - - t1 = h + bsg21 + vxandx + Kshared + in[pc]; - t2 = bsg20 + andorv; - d = d + t1; - h = t1 + t2; -} - -static __device__ __forceinline__ -void sha2_round_body(uint32_t* in, uint32_t* r) -{ - uint32_t a = r[0]; - uint32_t b = r[1]; - uint32_t c = r[2]; - uint32_t d = r[3]; - uint32_t e = r[4]; - uint32_t f = r[5]; - uint32_t g = r[6]; - uint32_t h = r[7]; - - sha2_step1(a, b, c, d, e, f, g, h, in[0], Ksha[0]); - sha2_step1(h, a, b, c, d, e, f, g, in[1], Ksha[1]); - sha2_step1(g, h, a, b, c, d, e, f, in[2], Ksha[2]); - sha2_step1(f, g, h, a, b, c, d, e, in[3], Ksha[3]); - sha2_step1(e, f, g, h, a, b, c, d, in[4], Ksha[4]); - sha2_step1(d, e, f, g, h, a, b, c, in[5], Ksha[5]); - sha2_step1(c, d, e, f, g, h, a, b, in[6], Ksha[6]); - sha2_step1(b, c, d, e, f, g, h, a, in[7], Ksha[7]); - sha2_step1(a, b, c, d, e, f, g, h, in[8], Ksha[8]); - sha2_step1(h, a, b, c, d, e, f, g, in[9], Ksha[9]); - sha2_step1(g, h, a, b, c, d, e, f, in[10], Ksha[10]); - sha2_step1(f, g, h, a, b, c, d, e, in[11], Ksha[11]); - sha2_step1(e, f, g, h, a, b, c, d, in[12], Ksha[12]); - sha2_step1(d, e, f, g, h, a, b, c, in[13], Ksha[13]); - sha2_step1(c, d, e, f, g, h, a, b, in[14], Ksha[14]); - sha2_step1(b, c, d, e, f, g, h, a, in[15], Ksha[15]); - - #pragma unroll 3 - for (int i = 0; i<3; i++) { - - sha2_step2(a, b, c, d, e, f, g, h, in, 0, Ksha[16 + 16 * i]); - sha2_step2(h, a, b, c, d, e, f, g, in, 1, Ksha[17 + 16 * i]); - sha2_step2(g, h, a, b, c, d, e, f, in, 2, Ksha[18 + 16 * i]); - sha2_step2(f, g, h, a, b, c, d, e, in, 3, Ksha[19 + 16 * i]); - sha2_step2(e, f, g, h, a, b, c, d, in, 4, Ksha[20 + 16 * i]); - sha2_step2(d, e, f, g, h, a, b, c, in, 5, Ksha[21 + 16 * i]); - sha2_step2(c, d, e, f, g, h, a, b, in, 6, Ksha[22 + 16 * i]); - sha2_step2(b, c, d, e, f, g, h, a, in, 7, Ksha[23 + 16 * i]); - sha2_step2(a, b, c, d, e, f, g, h, in, 8, Ksha[24 + 16 * i]); - sha2_step2(h, a, b, c, d, e, f, g, in, 9, Ksha[25 + 16 * i]); - sha2_step2(g, h, a, b, c, d, e, f, in, 10, Ksha[26 + 16 * i]); - sha2_step2(f, g, h, a, b, c, d, e, in, 11, Ksha[27 + 16 * i]); - sha2_step2(e, f, g, h, a, b, c, d, in, 12, Ksha[28 + 16 * i]); - sha2_step2(d, e, f, g, h, a, b, c, in, 13, Ksha[29 + 16 * i]); - sha2_step2(c, d, e, f, g, h, a, b, in, 14, Ksha[30 + 16 * i]); - sha2_step2(b, c, d, e, f, g, h, a, in, 15, Ksha[31 + 16 * i]); - - } - - r[0] += a; - r[1] += b; - r[2] += c; - r[3] += d; - r[4] += e; - r[5] += f; - r[6] += g; - r[7] += h; -} - -static __device__ __forceinline__ uint8 sha256_64(uint32_t *data) -{ - uint32_t __align__(64) in[16]; - uint32_t __align__(32) buf[8]; - - ((uint16 *)in)[0] = swapvec((uint16*)data); - - ((uint8*)buf)[0] = ((uint8*)H256)[0]; - - sha2_round_body(in, buf); - - #pragma unroll 14 - for (int i = 0; i<14; i++) { in[i + 1] = 0; } - - in[0] = 0x80000000; - in[15] = 0x200; - - sha2_round_body(in, buf); - return swapvec((uint8*)buf); -} - -static __device__ __forceinline__ uint8 sha256_80(uint32_t nonce) -{ - uint32_t __align__(64) in[16]; - uint32_t __align__(32) buf[8]; - - ((uint16 *)in)[0] = swapvec((uint16*)c_data); - ((uint8*)buf)[0] = ((uint8*)H256)[0]; - - sha2_round_body(in, buf); - - #pragma unroll 3 - for (int i = 0; i<3; i++) { in[i] = cuda_swab32(c_data[i + 16]); } - -// in[3] = cuda_swab32(nonce); - in[3] = nonce; - in[4] = 0x80000000; - in[15] = 0x280; - - #pragma unroll - for (int i = 5; i<15; i++) { in[i] = 0; } - - sha2_round_body(in, buf); - return swapvec((uint8*)buf); -} - -// Pluck Factor 128 -#define SHIFT (1024 * 128) - -__global__ __launch_bounds__(256, 1) -void pluck_gpu_hash0_v50(uint32_t threads, uint32_t startNonce) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { -#if __CUDA_ARCH__ >= 320 - const uint32_t nonce = startNonce + thread; - uint32_t shift = SHIFT * thread; - ((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce); - ((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0); - for (int i = 2; i < 5; i++) - { - uint32_t randmax = i * 32 - 4; - uint32_t randseed[16]; - uint32_t randbuffer[16]; - uint32_t joint[16]; - - ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]); - ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]); - - ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); - - ((uint8*)joint)[0] = ((uint8*)randseed)[1]; - - #pragma unroll - for (int j = 0; j < 8; j++) { - uint32_t rand = randbuffer[j] % (randmax - 32); - joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]); - } - - uint8 truc = sha256_64(joint); - ((uint8*)(hashbuffer + shift))[i] = truc; - ((uint8*)randseed)[0] = ((uint8*)joint)[0]; - ((uint8*)randseed)[1] = truc; - - ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); - - for (int j = 0; j < 32; j += 2) - { - uint32_t rand = randbuffer[j / 2] % randmax; - (hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]); - (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]); - (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]); - (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]); - } - } // main loop -#endif - } -} - -__global__ __launch_bounds__(256, 1) -void pluck_gpu_hash_v50(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { -#if __CUDA_ARCH__ >= 320 - const uint32_t nonce = startNonce + thread; - uint32_t shift = SHIFT * thread; - - for (int i = 5; i < HASH_MEMORY - 1; i++) - { - uint32_t randmax = i*32-4; - uint32_t randseed[16]; - uint32_t randbuffer[16]; - uint32_t joint[16]; - uint8 Buffbuffer[2]; - - ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32*i-64]); - ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32*i-32]); - - - Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32*i - 128]); - Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32*i - 96]); - - ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0]; - ((uint16*)randbuffer)[0]= xor_salsa8(((uint16*)randseed)[0]); - ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i-1)<<5]); - - #pragma unroll - for (int j = 0; j < 8; j++) { - uint32_t rand = randbuffer[j] % (randmax - 32); - joint[j+8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]); - } - - uint8 truc = sha256_64(joint); - ((uint8*)(hashbuffer + shift))[i] = truc; - ((uint8*)randseed)[0] = ((uint8*)joint)[0]; - ((uint8*)randseed)[1] = truc; - - ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0]; - ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); - - for (int j = 0; j < 32; j += 2) - { - uint32_t rand = randbuffer[j / 2] % randmax; - - (hashbuffer+shift)[rand] = __ldg(&(hashbuffer+shift)[randmax+j]); - (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]); - (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]); - (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]); - } - - } // main loop - - uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]); - - if (outbuf <= pTarget[7]) { - nonceVector[0] = nonce; - } -#endif - } -} - -__global__ __launch_bounds__(128, 3) -void pluck_gpu_hash0(uint32_t threads, uint32_t startNonce) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { -#if __CUDA_ARCH__ >= 320 - const uint32_t nonce = startNonce + thread; - - uint32_t shift = SHIFT * thread; - ((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce); - ((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0); - for (int i = 2; i < 5; i++) - { - uint32_t randmax = i * 32 - 4; - uint32_t randseed[16]; - uint32_t randbuffer[16]; - uint32_t joint[16]; - - ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]); - ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]); - - ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); - - ((uint8*)joint)[0] = ((uint8*)randseed)[1]; - - #pragma unroll - for (int j = 0; j < 8; j++) { - uint32_t rand = randbuffer[j] % (randmax - 32); - joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]); - } - - uint8 truc = sha256_64(joint); - ((uint8*)(hashbuffer + shift))[i] = truc; - ((uint8*)randseed)[0] = ((uint8*)joint)[0]; - ((uint8*)randseed)[1] = truc; - - ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); - - for (int j = 0; j < 32; j += 2) - { - uint32_t rand = randbuffer[j / 2] % randmax; - (hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]); - (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]); - (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]); - (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]); - } - } // main loop -#endif - } -} - -__global__ __launch_bounds__(128, 3) -void pluck_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { -#if __CUDA_ARCH__ >= 320 - const uint32_t nonce = startNonce + thread; - - uint32_t shift = SHIFT * thread; - - for (int i = 5; i < HASH_MEMORY - 1; i++) - { - uint32_t randmax = i * 32 - 4; - uint32_t randseed[16]; - uint32_t randbuffer[16]; - uint32_t joint[16]; - uint8 Buffbuffer[2]; - - ((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]); - ((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]); - - - Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32 * i - 128]); - Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32 * i - 96]); - ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0]; - - ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); - - ((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]); - - #pragma unroll - for (int j = 0; j < 8; j++) - { - uint32_t rand = randbuffer[j] % (randmax - 32); - joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]); - } - - uint8 truc = sha256_64(joint); - ((uint8*)(hashbuffer + shift))[i] = truc; - ((uint8*)randseed)[0] = ((uint8*)joint)[0]; - ((uint8*)randseed)[1] = truc; - - - ((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0]; - ((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); - for (int j = 0; j < 32; j += 2) - { - uint32_t rand = randbuffer[j / 2] % randmax; - - (hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]); - (hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]); - (hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]); - (hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]); - } - } // main loop - - uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]); - - if (outbuf <= pTarget[7]) { - nonceVector[0] = nonce; - } -#endif - } -} - -void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t* hash) -{ - cuda_get_arch(thr_id); - cudaMemcpyToSymbol(hashbuffer, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice); - cudaMalloc(&d_PlNonce[thr_id], sizeof(uint32_t)); -} - -__host__ -uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order) -{ - uint32_t result[8] = { 0xffffffff }; - cudaMemset(d_PlNonce[thr_id], 0xff, sizeof(uint32_t)); - - const uint32_t threadsperblock = 128; - - dim3 grid((threads + threadsperblock - 1) / threadsperblock); - dim3 block(threadsperblock); - dim3 grid50((threads + 256 - 1) / 256); - dim3 block50(256); - - if (device_sm[device_map[thr_id]] <= 300) { - applog(LOG_ERR,"Sorry pluck not supported on SM 3.0 devices"); - return 0; - } else if (device_sm[device_map[thr_id]] >= 500) { - pluck_gpu_hash0_v50 <<< grid50, block50 >>>(threads, startNounce); - pluck_gpu_hash_v50 <<< grid50, block50 >>>(threads, startNounce, d_PlNonce[thr_id]); - } else { - pluck_gpu_hash0 <<< grid, block >>>(threads, startNounce); - pluck_gpu_hash <<< grid, block >>>(threads, startNounce, d_PlNonce[thr_id]); - } - - //MyStreamSynchronize(NULL, order, thr_id); - CUDA_SAFE_CALL(cudaThreadSynchronize()); - cudaMemcpy(&result[thr_id], d_PlNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - - return result[thr_id]; -} - -__host__ -void pluck_setBlockTarget(const void *pdata, const void *ptarget) -{ - unsigned char PaddedMessage[80]; - memcpy(PaddedMessage, pdata, 80); - - cudaMemcpyToSymbol(c_data, PaddedMessage, 80, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice); -} diff --git a/pluck/pluck.cu b/pluck/pluck.cu deleted file mode 100644 index 0141b30..0000000 --- a/pluck/pluck.cu +++ /dev/null @@ -1,240 +0,0 @@ -/* Based on djm code */ - -#include - -#include "miner.h" -#include "cuda_helper.h" - -#include - -static uint32_t *d_hash[MAX_GPUS] ; - -extern void pluck_setBlockTarget(const void* data, const void *ptarget); -extern void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t *d_outputHash); -extern uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order); - -extern float tp_coef[MAX_GPUS]; - -#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) -//note, this is 64 bytes -static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) -{ -#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) - uint32_t x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11, x12, x13, x14, x15; - int i; - - x00 = (B[0] ^= Bx[0]); - x01 = (B[1] ^= Bx[1]); - x02 = (B[2] ^= Bx[2]); - x03 = (B[3] ^= Bx[3]); - x04 = (B[4] ^= Bx[4]); - x05 = (B[5] ^= Bx[5]); - x06 = (B[6] ^= Bx[6]); - x07 = (B[7] ^= Bx[7]); - x08 = (B[8] ^= Bx[8]); - x09 = (B[9] ^= Bx[9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); - x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - for (i = 0; i < 8; i += 2) { - /* Operate on columns. */ - x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7); - x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7); - - x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9); - x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9); - - x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13); - x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13); - - x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18); - x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18); - - /* Operate on rows. */ - x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7); - x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7); - - x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9); - x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9); - - x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13); - x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13); - - x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18); - x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18); - } - B[0] += x00; - B[1] += x01; - B[2] += x02; - B[3] += x03; - B[4] += x04; - B[5] += x05; - B[6] += x06; - B[7] += x07; - B[8] += x08; - B[9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; -#undef ROTL -} - -static void sha256_hash(uchar *hash, const uchar *data, int len) -{ - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, data, len); - SHA256_Final(hash, &ctx); -} - -// hash exactly 64 bytes (ie, sha256 block size) -static void sha256_hash512(uint32_t *hash, const uint32_t *data) -{ - uint32_t _ALIGN(64) S[16]; - uint32_t _ALIGN(64) T[16]; - uchar _ALIGN(64) E[64] = { 0 }; - int i; - - sha256_init(S); - - for (i = 0; i < 16; i++) - T[i] = be32dec(&data[i]); - sha256_transform(S, T, 0); - - E[3] = 0x80; - E[61] = 0x02; // T[15] = 8 * 64 => 0x200; - sha256_transform(S, (uint32_t*)E, 0); - - for (i = 0; i < 8; i++) - be32enc(&hash[i], S[i]); -} - -#define BLOCK_HEADER_SIZE 80 -void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N) -{ - int size = N * 1024; - sha256_hash(hashbuffer, (uchar*)data, BLOCK_HEADER_SIZE); - memset(&hashbuffer[32], 0, 32); - - for (int i = 64; i < size - 32; i += 32) - { - uint32_t _ALIGN(64) randseed[16]; - uint32_t _ALIGN(64) randbuffer[16]; - uint32_t _ALIGN(64) joint[16]; - //i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area - //we could use size here, but then it's probable to use 0 as the value in most cases - int randmax = i - 4; - - //setup randbuffer to be an array of random indexes - memcpy(randseed, &hashbuffer[i - 64], 64); - - if (i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64); - else memset(randbuffer, 0, 64); - - xor_salsa8((uint32_t*)randbuffer, (uint32_t*)randseed); - memcpy(joint, &hashbuffer[i - 32], 32); - - //use the last hash value as the seed - for (int j = 32; j < 64; j += 4) - { - //every other time, change to next random index - //randmax - 32 as otherwise we go beyond memory that's already been written to - uint32_t rand = randbuffer[(j - 32) >> 2] % (randmax - 32); - joint[j >> 2] = *((uint32_t *)&hashbuffer[rand]); - } - - sha256_hash512((uint32_t*)&hashbuffer[i], joint); - - //setup randbuffer to be an array of random indexes - //use last hash value and previous hash value(post-mixing) - memcpy(randseed, &hashbuffer[i - 32], 64); - - if (i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64); - else memset(randbuffer, 0, 64); - - xor_salsa8((uint32_t*)randbuffer, (uint32_t*)randseed); - - //use the last hash value as the seed - for (int j = 0; j < 32; j += 2) - { - uint32_t rand = randbuffer[j >> 1] % randmax; - *((uint32_t *)(hashbuffer + rand)) = *((uint32_t *)(hashbuffer + j + randmax)); - } - } - - memcpy(hash, hashbuffer, 32); -} - -static bool init[MAX_GPUS] = { 0 }; - -static __thread uchar* scratchbuf = NULL; - -extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - const uint32_t first_nonce = pdata[19]; - uint32_t endiandata[20]; - int opt_pluck_n = 128; - - int intensity = is_windows() ? 17 : 19; /* beware > 20 could work and create diff problems later */ - uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); - // divide by 128 for this algo which require a lot of memory - throughput = throughput / 128 - 256; - throughput = min(throughput, max_nonce - first_nonce + 1); - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - if (!init[thr_id]) - { - cudaSetDevice(device_map[thr_id]); - //cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); - //cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - cudaMalloc(&d_hash[thr_id], opt_pluck_n * 1024 * throughput); - - if (!scratchbuf) - scratchbuf = (uchar*) calloc(opt_pluck_n, 1024); - - pluck_cpu_init(thr_id, throughput, d_hash[thr_id]); - - CUDA_SAFE_CALL(cudaGetLastError()); - applog(LOG_INFO, "Using %d cuda threads", throughput); - - init[thr_id] = true; - } - - for (int k = 0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - - pluck_setBlockTarget(endiandata,ptarget); - - do { - uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0); - if (foundNonce != UINT32_MAX) - { - const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - pluckhash(vhash64, endiandata, scratchbuf, opt_pluck_n); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { - *hashes_done = pdata[19] - first_nonce + throughput; - pdata[19] = foundNonce; - return 1; - } else { - applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); - } - } - - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = pdata[19] - first_nonce; - return 0; -} diff --git a/util.cpp b/util.cpp index 7f9a21f..96a3464 100644 --- a/util.cpp +++ b/util.cpp @@ -1853,9 +1853,6 @@ void print_hash_tests(void) pentablakehash(&hash[0], &buf[0]); printpfx("pentablake", hash); - pluckhash((uint32_t*)&hash[0], (uint32_t*)&buf[0], scratchbuf, 128); - printpfx("pluck", hash); - quarkhash(&hash[0], &buf[0]); printpfx("quark", hash);