Browse Source

remove pluck algo

Supcoin seems.... dead and the algo was not supported on all devices
master
Tanguy Pruvot 10 years ago
parent
commit
15293d063f
  1. 1
      Makefile.am
  2. 5
      README.txt
  3. 13
      ccminer.cpp
  4. 2
      ccminer.vcxproj
  5. 6
      ccminer.vcxproj.filters
  6. 35
      cuda_helper.h
  7. 5
      miner.h
  8. 573
      pluck/cuda_pluck.cu
  9. 240
      pluck/pluck.cu
  10. 3
      util.cpp

1
Makefile.am

@ -47,7 +47,6 @@ ccminer_SOURCES = elist.h miner.h compat.h \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \ sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
pluck/pluck.cu pluck/cuda_pluck.cu \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \

5
README.txt

@ -81,7 +81,6 @@ its command line interface and options.
neoscrypt use to mine FeatherCoin neoscrypt use to mine FeatherCoin
nist5 use to mine TalkCoin nist5 use to mine TalkCoin
penta use to mine Joincoin / Pentablake penta use to mine Joincoin / Pentablake
pluck use to mine Supcoin
quark use to mine Quarkcoin quark use to mine Quarkcoin
qubit use to mine Qubit qubit use to mine Qubit
scrypt use to mine Scrypt coins scrypt use to mine Scrypt coins
@ -221,6 +220,10 @@ features.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
July 2015...
Nvml api power limits
Remove pluck algo
June 23th 2015 v1.6.5 June 23th 2015 v1.6.5
Handle Ziftrcoin PoK solo mining Handle Ziftrcoin PoK solo mining
Basic compatibility with CUDA 7.0 (generally slower hashrate) Basic compatibility with CUDA 7.0 (generally slower hashrate)

13
ccminer.cpp

@ -102,7 +102,6 @@ enum sha_algos {
ALGO_NEOSCRYPT, ALGO_NEOSCRYPT,
ALGO_NIST5, ALGO_NIST5,
ALGO_PENTABLAKE, ALGO_PENTABLAKE,
ALGO_PLUCK,
ALGO_QUARK, ALGO_QUARK,
ALGO_QUBIT, ALGO_QUBIT,
ALGO_SCRYPT, ALGO_SCRYPT,
@ -137,7 +136,6 @@ static const char *algo_names[] = {
"neoscrypt", "neoscrypt",
"nist5", "nist5",
"penta", "penta",
"pluck",
"quark", "quark",
"qubit", "qubit",
"scrypt", "scrypt",
@ -292,7 +290,6 @@ Options:\n\
neoscrypt FeatherCoin, Phoenix, UFO...\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\
nist5 NIST5 (TalkCoin)\n\ nist5 NIST5 (TalkCoin)\n\
penta Pentablake hash (5x Blake 512)\n\ penta Pentablake hash (5x Blake 512)\n\
pluck SupCoin\n\
quark Quark\n\ quark Quark\n\
qubit Qubit\n\ qubit Qubit\n\
scrypt Scrypt\n\ scrypt Scrypt\n\
@ -595,7 +592,6 @@ static void calc_network_diff(struct work *work)
case ALGO_QUARK: case ALGO_QUARK:
diffone = 0xFFFFFF0000000000ull; diffone = 0xFFFFFF0000000000ull;
break; break;
case ALGO_PLUCK:
case ALGO_SCRYPT: case ALGO_SCRYPT:
case ALGO_SCRYPT_JANE: case ALGO_SCRYPT_JANE:
// cant get the right value on these 3 algos... // cant get the right value on these 3 algos...
@ -1429,7 +1425,6 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
switch (opt_algo) { switch (opt_algo) {
case ALGO_JACKPOT: case ALGO_JACKPOT:
case ALGO_NEOSCRYPT: case ALGO_NEOSCRYPT:
case ALGO_PLUCK:
case ALGO_SCRYPT: case ALGO_SCRYPT:
case ALGO_SCRYPT_JANE: case ALGO_SCRYPT_JANE:
diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
@ -1760,9 +1755,6 @@ static void *miner_thread(void *userdata)
case ALGO_SCRYPT_JANE: case ALGO_SCRYPT_JANE:
minmax = 0x100000; minmax = 0x100000;
break; break;
case ALGO_PLUCK:
minmax = 0x2000;
break;
} }
max64 = max(minmax-1, max64); max64 = max(minmax-1, max64);
} }
@ -1897,11 +1889,6 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done); max_nonce, &hashes_done);
break; break;
case ALGO_PLUCK:
rc = scanhash_pluck(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_SCRYPT: case ALGO_SCRYPT:
rc = scanhash_scrypt(thr_id, work.data, work.target, NULL, rc = scanhash_scrypt(thr_id, work.data, work.target, NULL,
max_nonce, &hashes_done, &tv_start, &tv_end); max_nonce, &hashes_done, &tv_start, &tv_end);

2
ccminer.vcxproj

@ -421,8 +421,6 @@
<AdditionalOptions Condition="'$(Configuration)'=='Release'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Release'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Debug'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile> </CudaCompile>
<CudaCompile Include="pluck\pluck.cu" />
<CudaCompile Include="pluck\cuda_pluck.cu" />
<CudaCompile Include="quark\cuda_quark_groestl512.cu"> <CudaCompile Include="quark\cuda_quark_groestl512.cu">
</CudaCompile> </CudaCompile>
<CudaCompile Include="quark\cuda_quark_keccak512.cu"> <CudaCompile Include="quark\cuda_quark_keccak512.cu">

6
ccminer.vcxproj.filters

@ -433,12 +433,6 @@
<CudaCompile Include="JHA\cuda_jha_compactionTest.cu"> <CudaCompile Include="JHA\cuda_jha_compactionTest.cu">
<Filter>Source Files\CUDA\JHA</Filter> <Filter>Source Files\CUDA\JHA</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="pluck\pluck.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="pluck\cuda_pluck.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="quark\cuda_jh512.cu"> <CudaCompile Include="quark\cuda_jh512.cu">
<Filter>Source Files\CUDA\quark</Filter> <Filter>Source Files\CUDA\quark</Filter>
</CudaCompile> </CudaCompile>

35
cuda_helper.h

@ -309,7 +309,6 @@ uint64_t shr_t64(uint64_t x, uint32_t n)
#endif #endif
} }
// device asm for ?
__device__ __forceinline__ __device__ __forceinline__
uint64_t shl_t64(uint64_t x, uint32_t n) uint64_t shl_t64(uint64_t x, uint32_t n)
{ {
@ -324,40 +323,6 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
#endif #endif
} }
// device asm 32 for pluck
__device__ __forceinline__
uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) {
#ifdef __CUDA_ARCH__
uint32_t result;
asm("{ .reg .u32 m,n,o;\n\t"
"and.b32 m, %1, %2;\n\t"
" or.b32 n, %1, %2;\n\t"
"and.b32 o, n, %3;\n\t"
" or.b32 %0, m, o ;\n\t"
"}\n\t"
: "=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
#else
// unused on host...
return 0;
#endif
}
__device__ __forceinline__
uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) {
#ifdef __CUDA_ARCH__
uint32_t result;
asm("{ .reg .u32 t1;\n\t"
"xor.b32 t1, %2, %3;\n\t"
"xor.b32 %0, %1, t1;\n\t"
"}"
: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
return result;
#else
return a^b^c;
#endif
}
__device__ __forceinline__ __device__ __forceinline__
uint32_t shr_t32(uint32_t x,uint32_t n) { uint32_t shr_t32(uint32_t x,uint32_t n) {
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__

5
miner.h

@ -326,10 +326,6 @@ extern int scanhash_pentablake(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
extern int scanhash_pluck(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_qubit(int thr_id, uint32_t *pdata, extern int scanhash_qubit(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
@ -781,7 +777,6 @@ void myriadhash(void *state, const void *input);
void neoscrypt(uchar *output, const uchar *input, uint32_t profile); void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
void nist5hash(void *state, const void *input); void nist5hash(void *state, const void *input);
void pentablakehash(void *output, const void *input); void pentablakehash(void *output, const void *input);
void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N);
void quarkhash(void *state, const void *input); void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input); void qubithash(void *state, const void *input);
void scrypthash(void* output, const void* input); void scrypthash(void* output, const void* input);

573
pluck/cuda_pluck.cu

@ -1,573 +0,0 @@
/*
* "pluck" kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2015 djm34
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
* @author tpruvot
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include "cuda_vector.h"
#include "miner.h"
uint32_t *d_PlNonce[MAX_GPUS];
__device__ uint8_t * hashbuffer;
__constant__ uint32_t pTarget[8];
__constant__ uint32_t c_data[20];
#define HASH_MEMORY_8bit 131072
#define HASH_MEMORY_32bit 32768
#define HASH_MEMORY 4096
static __constant__ uint32_t H256[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
static __constant__ uint32_t Ksha[64] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#define SALSA(a,b,c,d) { \
t = a+d; b^=rotateL(t, 7); \
t = b+a; c^=rotateL(t, 9); \
t = c+b; d^=rotateL(t, 13); \
t = d+c; a^=rotateL(t, 18); \
}
#define SALSA_CORE(state) { \
SALSA(state.s0,state.s4,state.s8,state.sc); \
SALSA(state.s5,state.s9,state.sd,state.s1); \
SALSA(state.sa,state.se,state.s2,state.s6); \
SALSA(state.sf,state.s3,state.s7,state.sb); \
SALSA(state.s0,state.s1,state.s2,state.s3); \
SALSA(state.s5,state.s6,state.s7,state.s4); \
SALSA(state.sa,state.sb,state.s8,state.s9); \
SALSA(state.sf,state.sc,state.sd,state.se); \
}
#if __CUDA_ARCH__ >= 320
static __device__ __forceinline__ uint16 xor_salsa8(const uint16 &Bx)
{
uint32_t t;
uint16 state = Bx;
SALSA_CORE(state);
SALSA_CORE(state);
SALSA_CORE(state);
SALSA_CORE(state);
return(state+Bx);
}
#endif
// sha256
static __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
uint32_t r1 = ROTR32(x, 2);
uint32_t r2 = ROTR32(x, 13);
uint32_t r3 = ROTR32(x, 22);
return xor3b(r1, r2, r3);
}
static __device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
{
uint32_t r1 = ROTR32(x, 6);
uint32_t r2 = ROTR32(x, 11);
uint32_t r3 = ROTR32(x, 25);
return xor3b(r1, r2, r3);
}
static __device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
{
uint64_t r1 = ROTR32(x, 7);
uint64_t r2 = ROTR32(x, 18);
uint64_t r3 = shr_t32(x, 3);
return xor3b(r1, r2, r3);
}
static __device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
{
uint64_t r1 = ROTR32(x, 17);
uint64_t r2 = ROTR32(x, 19);
uint64_t r3 = shr_t32(x, 10);
return xor3b(r1, r2, r3);
}
static __device__ __forceinline__ void sha2_step1(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
const uint32_t f, const uint32_t g, uint32_t &h, const uint32_t in, const uint32_t Kshared)
{
uint32_t t1, t2;
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a, b, c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
static __device__ __forceinline__ void sha2_step2(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
const uint32_t f, const uint32_t g, uint32_t &h, uint32_t* in, const uint32_t pc, const uint32_t Kshared)
{
uint32_t t1, t2;
int pcidx1 = (pc - 2) & 0xF;
int pcidx2 = (pc - 7) & 0xF;
int pcidx3 = (pc - 15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ssg2_1(inx1);
uint32_t ssg20 = ssg2_0(inx3);
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a, b, c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
static __device__ __forceinline__
void sha2_round_body(uint32_t* in, uint32_t* r)
{
uint32_t a = r[0];
uint32_t b = r[1];
uint32_t c = r[2];
uint32_t d = r[3];
uint32_t e = r[4];
uint32_t f = r[5];
uint32_t g = r[6];
uint32_t h = r[7];
sha2_step1(a, b, c, d, e, f, g, h, in[0], Ksha[0]);
sha2_step1(h, a, b, c, d, e, f, g, in[1], Ksha[1]);
sha2_step1(g, h, a, b, c, d, e, f, in[2], Ksha[2]);
sha2_step1(f, g, h, a, b, c, d, e, in[3], Ksha[3]);
sha2_step1(e, f, g, h, a, b, c, d, in[4], Ksha[4]);
sha2_step1(d, e, f, g, h, a, b, c, in[5], Ksha[5]);
sha2_step1(c, d, e, f, g, h, a, b, in[6], Ksha[6]);
sha2_step1(b, c, d, e, f, g, h, a, in[7], Ksha[7]);
sha2_step1(a, b, c, d, e, f, g, h, in[8], Ksha[8]);
sha2_step1(h, a, b, c, d, e, f, g, in[9], Ksha[9]);
sha2_step1(g, h, a, b, c, d, e, f, in[10], Ksha[10]);
sha2_step1(f, g, h, a, b, c, d, e, in[11], Ksha[11]);
sha2_step1(e, f, g, h, a, b, c, d, in[12], Ksha[12]);
sha2_step1(d, e, f, g, h, a, b, c, in[13], Ksha[13]);
sha2_step1(c, d, e, f, g, h, a, b, in[14], Ksha[14]);
sha2_step1(b, c, d, e, f, g, h, a, in[15], Ksha[15]);
#pragma unroll 3
for (int i = 0; i<3; i++) {
sha2_step2(a, b, c, d, e, f, g, h, in, 0, Ksha[16 + 16 * i]);
sha2_step2(h, a, b, c, d, e, f, g, in, 1, Ksha[17 + 16 * i]);
sha2_step2(g, h, a, b, c, d, e, f, in, 2, Ksha[18 + 16 * i]);
sha2_step2(f, g, h, a, b, c, d, e, in, 3, Ksha[19 + 16 * i]);
sha2_step2(e, f, g, h, a, b, c, d, in, 4, Ksha[20 + 16 * i]);
sha2_step2(d, e, f, g, h, a, b, c, in, 5, Ksha[21 + 16 * i]);
sha2_step2(c, d, e, f, g, h, a, b, in, 6, Ksha[22 + 16 * i]);
sha2_step2(b, c, d, e, f, g, h, a, in, 7, Ksha[23 + 16 * i]);
sha2_step2(a, b, c, d, e, f, g, h, in, 8, Ksha[24 + 16 * i]);
sha2_step2(h, a, b, c, d, e, f, g, in, 9, Ksha[25 + 16 * i]);
sha2_step2(g, h, a, b, c, d, e, f, in, 10, Ksha[26 + 16 * i]);
sha2_step2(f, g, h, a, b, c, d, e, in, 11, Ksha[27 + 16 * i]);
sha2_step2(e, f, g, h, a, b, c, d, in, 12, Ksha[28 + 16 * i]);
sha2_step2(d, e, f, g, h, a, b, c, in, 13, Ksha[29 + 16 * i]);
sha2_step2(c, d, e, f, g, h, a, b, in, 14, Ksha[30 + 16 * i]);
sha2_step2(b, c, d, e, f, g, h, a, in, 15, Ksha[31 + 16 * i]);
}
r[0] += a;
r[1] += b;
r[2] += c;
r[3] += d;
r[4] += e;
r[5] += f;
r[6] += g;
r[7] += h;
}
static __device__ __forceinline__ uint8 sha256_64(uint32_t *data)
{
uint32_t __align__(64) in[16];
uint32_t __align__(32) buf[8];
((uint16 *)in)[0] = swapvec((uint16*)data);
((uint8*)buf)[0] = ((uint8*)H256)[0];
sha2_round_body(in, buf);
#pragma unroll 14
for (int i = 0; i<14; i++) { in[i + 1] = 0; }
in[0] = 0x80000000;
in[15] = 0x200;
sha2_round_body(in, buf);
return swapvec((uint8*)buf);
}
static __device__ __forceinline__ uint8 sha256_80(uint32_t nonce)
{
uint32_t __align__(64) in[16];
uint32_t __align__(32) buf[8];
((uint16 *)in)[0] = swapvec((uint16*)c_data);
((uint8*)buf)[0] = ((uint8*)H256)[0];
sha2_round_body(in, buf);
#pragma unroll 3
for (int i = 0; i<3; i++) { in[i] = cuda_swab32(c_data[i + 16]); }
// in[3] = cuda_swab32(nonce);
in[3] = nonce;
in[4] = 0x80000000;
in[15] = 0x280;
#pragma unroll
for (int i = 5; i<15; i++) { in[i] = 0; }
sha2_round_body(in, buf);
return swapvec((uint8*)buf);
}
// Pluck Factor 128
#define SHIFT (1024 * 128)
__global__ __launch_bounds__(256, 1)
void pluck_gpu_hash0_v50(uint32_t threads, uint32_t startNonce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
#if __CUDA_ARCH__ >= 320
const uint32_t nonce = startNonce + thread;
uint32_t shift = SHIFT * thread;
((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
for (int i = 2; i < 5; i++)
{
uint32_t randmax = i * 32 - 4;
uint32_t randseed[16];
uint32_t randbuffer[16];
uint32_t joint[16];
((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
((uint8*)joint)[0] = ((uint8*)randseed)[1];
#pragma unroll
for (int j = 0; j < 8; j++) {
uint32_t rand = randbuffer[j] % (randmax - 32);
joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
}
uint8 truc = sha256_64(joint);
((uint8*)(hashbuffer + shift))[i] = truc;
((uint8*)randseed)[0] = ((uint8*)joint)[0];
((uint8*)randseed)[1] = truc;
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
for (int j = 0; j < 32; j += 2)
{
uint32_t rand = randbuffer[j / 2] % randmax;
(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
}
} // main loop
#endif
}
}
__global__ __launch_bounds__(256, 1)
void pluck_gpu_hash_v50(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
#if __CUDA_ARCH__ >= 320
const uint32_t nonce = startNonce + thread;
uint32_t shift = SHIFT * thread;
for (int i = 5; i < HASH_MEMORY - 1; i++)
{
uint32_t randmax = i*32-4;
uint32_t randseed[16];
uint32_t randbuffer[16];
uint32_t joint[16];
uint8 Buffbuffer[2];
((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32*i-64]);
((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32*i-32]);
Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32*i - 128]);
Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32*i - 96]);
((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
((uint16*)randbuffer)[0]= xor_salsa8(((uint16*)randseed)[0]);
((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i-1)<<5]);
#pragma unroll
for (int j = 0; j < 8; j++) {
uint32_t rand = randbuffer[j] % (randmax - 32);
joint[j+8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
}
uint8 truc = sha256_64(joint);
((uint8*)(hashbuffer + shift))[i] = truc;
((uint8*)randseed)[0] = ((uint8*)joint)[0];
((uint8*)randseed)[1] = truc;
((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
for (int j = 0; j < 32; j += 2)
{
uint32_t rand = randbuffer[j / 2] % randmax;
(hashbuffer+shift)[rand] = __ldg(&(hashbuffer+shift)[randmax+j]);
(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
}
} // main loop
uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]);
if (outbuf <= pTarget[7]) {
nonceVector[0] = nonce;
}
#endif
}
}
__global__ __launch_bounds__(128, 3)
void pluck_gpu_hash0(uint32_t threads, uint32_t startNonce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
#if __CUDA_ARCH__ >= 320
const uint32_t nonce = startNonce + thread;
uint32_t shift = SHIFT * thread;
((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
for (int i = 2; i < 5; i++)
{
uint32_t randmax = i * 32 - 4;
uint32_t randseed[16];
uint32_t randbuffer[16];
uint32_t joint[16];
((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
((uint8*)joint)[0] = ((uint8*)randseed)[1];
#pragma unroll
for (int j = 0; j < 8; j++) {
uint32_t rand = randbuffer[j] % (randmax - 32);
joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
}
uint8 truc = sha256_64(joint);
((uint8*)(hashbuffer + shift))[i] = truc;
((uint8*)randseed)[0] = ((uint8*)joint)[0];
((uint8*)randseed)[1] = truc;
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
for (int j = 0; j < 32; j += 2)
{
uint32_t rand = randbuffer[j / 2] % randmax;
(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
}
} // main loop
#endif
}
}
__global__ __launch_bounds__(128, 3)
void pluck_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
#if __CUDA_ARCH__ >= 320
const uint32_t nonce = startNonce + thread;
uint32_t shift = SHIFT * thread;
for (int i = 5; i < HASH_MEMORY - 1; i++)
{
uint32_t randmax = i * 32 - 4;
uint32_t randseed[16];
uint32_t randbuffer[16];
uint32_t joint[16];
uint8 Buffbuffer[2];
((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32 * i - 128]);
Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32 * i - 96]);
((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);
#pragma unroll
for (int j = 0; j < 8; j++)
{
uint32_t rand = randbuffer[j] % (randmax - 32);
joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
}
uint8 truc = sha256_64(joint);
((uint8*)(hashbuffer + shift))[i] = truc;
((uint8*)randseed)[0] = ((uint8*)joint)[0];
((uint8*)randseed)[1] = truc;
((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
for (int j = 0; j < 32; j += 2)
{
uint32_t rand = randbuffer[j / 2] % randmax;
(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
}
} // main loop
uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]);
if (outbuf <= pTarget[7]) {
nonceVector[0] = nonce;
}
#endif
}
}
void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t* hash)
{
cuda_get_arch(thr_id);
cudaMemcpyToSymbol(hashbuffer, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
cudaMalloc(&d_PlNonce[thr_id], sizeof(uint32_t));
}
__host__
uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order)
{
uint32_t result[8] = { 0xffffffff };
cudaMemset(d_PlNonce[thr_id], 0xff, sizeof(uint32_t));
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
dim3 grid50((threads + 256 - 1) / 256);
dim3 block50(256);
if (device_sm[device_map[thr_id]] <= 300) {
applog(LOG_ERR,"Sorry pluck not supported on SM 3.0 devices");
return 0;
} else if (device_sm[device_map[thr_id]] >= 500) {
pluck_gpu_hash0_v50 <<< grid50, block50 >>>(threads, startNounce);
pluck_gpu_hash_v50 <<< grid50, block50 >>>(threads, startNounce, d_PlNonce[thr_id]);
} else {
pluck_gpu_hash0 <<< grid, block >>>(threads, startNounce);
pluck_gpu_hash <<< grid, block >>>(threads, startNounce, d_PlNonce[thr_id]);
}
//MyStreamSynchronize(NULL, order, thr_id);
CUDA_SAFE_CALL(cudaThreadSynchronize());
cudaMemcpy(&result[thr_id], d_PlNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
return result[thr_id];
}
__host__
void pluck_setBlockTarget(const void *pdata, const void *ptarget)
{
unsigned char PaddedMessage[80];
memcpy(PaddedMessage, pdata, 80);
cudaMemcpyToSymbol(c_data, PaddedMessage, 80, 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
}

240
pluck/pluck.cu

@ -1,240 +0,0 @@
/* Based on djm code */
#include <stdint.h>
#include "miner.h"
#include "cuda_helper.h"
#include <openssl/sha.h>
static uint32_t *d_hash[MAX_GPUS] ;
extern void pluck_setBlockTarget(const void* data, const void *ptarget);
extern void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t *d_outputHash);
extern uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order);
extern float tp_coef[MAX_GPUS];
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
//note, this is 64 bytes
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
uint32_t x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11, x12, x13, x14, x15;
int i;
x00 = (B[0] ^= Bx[0]);
x01 = (B[1] ^= Bx[1]);
x02 = (B[2] ^= Bx[2]);
x03 = (B[3] ^= Bx[3]);
x04 = (B[4] ^= Bx[4]);
x05 = (B[5] ^= Bx[5]);
x06 = (B[6] ^= Bx[6]);
x07 = (B[7] ^= Bx[7]);
x08 = (B[8] ^= Bx[8]);
x09 = (B[9] ^= Bx[9]);
x10 = (B[10] ^= Bx[10]);
x11 = (B[11] ^= Bx[11]);
x12 = (B[12] ^= Bx[12]);
x13 = (B[13] ^= Bx[13]);
x14 = (B[14] ^= Bx[14]);
x15 = (B[15] ^= Bx[15]);
for (i = 0; i < 8; i += 2) {
/* Operate on columns. */
x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7);
x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7);
x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9);
x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9);
x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13);
x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13);
x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18);
x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18);
/* Operate on rows. */
x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7);
x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7);
x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9);
x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9);
x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13);
x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13);
x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18);
x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18);
}
B[0] += x00;
B[1] += x01;
B[2] += x02;
B[3] += x03;
B[4] += x04;
B[5] += x05;
B[6] += x06;
B[7] += x07;
B[8] += x08;
B[9] += x09;
B[10] += x10;
B[11] += x11;
B[12] += x12;
B[13] += x13;
B[14] += x14;
B[15] += x15;
#undef ROTL
}
static void sha256_hash(uchar *hash, const uchar *data, int len)
{
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, data, len);
SHA256_Final(hash, &ctx);
}
// hash exactly 64 bytes (ie, sha256 block size)
static void sha256_hash512(uint32_t *hash, const uint32_t *data)
{
uint32_t _ALIGN(64) S[16];
uint32_t _ALIGN(64) T[16];
uchar _ALIGN(64) E[64] = { 0 };
int i;
sha256_init(S);
for (i = 0; i < 16; i++)
T[i] = be32dec(&data[i]);
sha256_transform(S, T, 0);
E[3] = 0x80;
E[61] = 0x02; // T[15] = 8 * 64 => 0x200;
sha256_transform(S, (uint32_t*)E, 0);
for (i = 0; i < 8; i++)
be32enc(&hash[i], S[i]);
}
#define BLOCK_HEADER_SIZE 80
void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N)
{
int size = N * 1024;
sha256_hash(hashbuffer, (uchar*)data, BLOCK_HEADER_SIZE);
memset(&hashbuffer[32], 0, 32);
for (int i = 64; i < size - 32; i += 32)
{
uint32_t _ALIGN(64) randseed[16];
uint32_t _ALIGN(64) randbuffer[16];
uint32_t _ALIGN(64) joint[16];
//i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area
//we could use size here, but then it's probable to use 0 as the value in most cases
int randmax = i - 4;
//setup randbuffer to be an array of random indexes
memcpy(randseed, &hashbuffer[i - 64], 64);
if (i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64);
else memset(randbuffer, 0, 64);
xor_salsa8((uint32_t*)randbuffer, (uint32_t*)randseed);
memcpy(joint, &hashbuffer[i - 32], 32);
//use the last hash value as the seed
for (int j = 32; j < 64; j += 4)
{
//every other time, change to next random index
//randmax - 32 as otherwise we go beyond memory that's already been written to
uint32_t rand = randbuffer[(j - 32) >> 2] % (randmax - 32);
joint[j >> 2] = *((uint32_t *)&hashbuffer[rand]);
}
sha256_hash512((uint32_t*)&hashbuffer[i], joint);
//setup randbuffer to be an array of random indexes
//use last hash value and previous hash value(post-mixing)
memcpy(randseed, &hashbuffer[i - 32], 64);
if (i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64);
else memset(randbuffer, 0, 64);
xor_salsa8((uint32_t*)randbuffer, (uint32_t*)randseed);
//use the last hash value as the seed
for (int j = 0; j < 32; j += 2)
{
uint32_t rand = randbuffer[j >> 1] % randmax;
*((uint32_t *)(hashbuffer + rand)) = *((uint32_t *)(hashbuffer + j + randmax));
}
}
memcpy(hash, hashbuffer, 32);
}
static bool init[MAX_GPUS] = { 0 };
static __thread uchar* scratchbuf = NULL;
extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20];
int opt_pluck_n = 128;
int intensity = is_windows() ? 17 : 19; /* beware > 20 could work and create diff problems later */
uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
// divide by 128 for this algo which require a lot of memory
throughput = throughput / 128 - 256;
throughput = min(throughput, max_nonce - first_nonce + 1);
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
//cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
cudaMalloc(&d_hash[thr_id], opt_pluck_n * 1024 * throughput);
if (!scratchbuf)
scratchbuf = (uchar*) calloc(opt_pluck_n, 1024);
pluck_cpu_init(thr_id, throughput, d_hash[thr_id]);
CUDA_SAFE_CALL(cudaGetLastError());
applog(LOG_INFO, "Using %d cuda threads", throughput);
init[thr_id] = true;
}
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
pluck_setBlockTarget(endiandata,ptarget);
do {
uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0);
if (foundNonce != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
pluckhash(vhash64, endiandata, scratchbuf, opt_pluck_n);
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
*hashes_done = pdata[19] - first_nonce + throughput;
pdata[19] = foundNonce;
return 1;
} else {
applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}

3
util.cpp

@ -1853,9 +1853,6 @@ void print_hash_tests(void)
pentablakehash(&hash[0], &buf[0]); pentablakehash(&hash[0], &buf[0]);
printpfx("pentablake", hash); printpfx("pentablake", hash);
pluckhash((uint32_t*)&hash[0], (uint32_t*)&buf[0], scratchbuf, 128);
printpfx("pluck", hash);
quarkhash(&hash[0], &buf[0]); quarkhash(&hash[0], &buf[0]);
printpfx("quark", hash); printpfx("quark", hash);

Loading…
Cancel
Save