Browse Source

Add sha256d algo (bitcoin) btw...

even if it works on yiimp, the stratum diff is too high for gpus (for now)
2upstream
Tanguy Pruvot 8 years ago
parent
commit
80c755188d
  1. 2
      Makefile.am
  2. 6
      algos.h
  3. 2
      bench.cpp
  4. 6
      ccminer.cpp
  5. 2
      ccminer.vcxproj
  6. 6
      ccminer.vcxproj.filters
  7. 4
      miner.h
  8. 477
      sha256/cuda_sha256d.cu
  9. 2
      sha256/cuda_sha256t.cu
  10. 127
      sha256/sha256d.cu
  11. 3
      sph/sha2.c
  12. 6
      util.cpp

2
Makefile.am

@ -53,7 +53,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \ quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \
neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \ neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \
pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \ pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \
sha256/sha256t.cu sha256/cuda_sha256t.cu \ sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu \
sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \ sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \

6
algos.h

@ -37,6 +37,7 @@ enum sha_algos {
ALGO_QUBIT, ALGO_QUBIT,
ALGO_SCRYPT, ALGO_SCRYPT,
ALGO_SCRYPT_JANE, ALGO_SCRYPT_JANE,
ALGO_SHA256D,
ALGO_SHA256T, ALGO_SHA256T,
ALGO_SIA, ALGO_SIA,
ALGO_SIB, ALGO_SIB,
@ -96,6 +97,7 @@ static const char *algo_names[] = {
"qubit", "qubit",
"scrypt", "scrypt",
"scrypt-jane", "scrypt-jane",
"sha256d",
"sha256t", "sha256t",
"sia", "sia",
"sib", "sib",
@ -151,6 +153,10 @@ static inline int algo_to_int(char* arg)
i = ALGO_LYRA2; i = ALGO_LYRA2;
else if (!strcasecmp("lyra2rev2", arg)) else if (!strcasecmp("lyra2rev2", arg))
i = ALGO_LYRA2v2; i = ALGO_LYRA2v2;
else if (!strcasecmp("bitcoin", arg))
i = ALGO_SHA256D;
else if (!strcasecmp("sha256", arg))
i = ALGO_SHA256D;
else if (!strcasecmp("thorsriddle", arg)) else if (!strcasecmp("thorsriddle", arg))
i = ALGO_VELTOR; i = ALGO_VELTOR;
else if (!strcasecmp("whirl", arg)) else if (!strcasecmp("whirl", arg))

2
bench.cpp

@ -74,6 +74,7 @@ void algo_free_all(int thr_id)
free_qubit(thr_id); free_qubit(thr_id);
free_skeincoin(thr_id); free_skeincoin(thr_id);
free_skein2(thr_id); free_skein2(thr_id);
free_sha256d(thr_id);
free_sha256t(thr_id); free_sha256t(thr_id);
free_sia(thr_id); free_sia(thr_id);
free_sib(thr_id); free_sib(thr_id);
@ -90,7 +91,6 @@ void algo_free_all(int thr_id)
free_x15(thr_id); free_x15(thr_id);
free_x17(thr_id); free_x17(thr_id);
free_zr5(thr_id); free_zr5(thr_id);
//free_sha256d(thr_id);
free_scrypt(thr_id); free_scrypt(thr_id);
free_scrypt_jane(thr_id); free_scrypt_jane(thr_id);
free_timetravel(thr_id); free_timetravel(thr_id);

6
ccminer.cpp

@ -253,6 +253,7 @@ Options:\n\
penta Pentablake hash (5x Blake 512)\n\ penta Pentablake hash (5x Blake 512)\n\
quark Quark\n\ quark Quark\n\
qubit Qubit\n\ qubit Qubit\n\
sha256d SHA256d (bitcoin)\n\
sha256t SHA256 x3\n\ sha256t SHA256 x3\n\
sia SIA (Blake2B)\n\ sia SIA (Blake2B)\n\
sib Sibcoin (X11+Streebog)\n\ sib Sibcoin (X11+Streebog)\n\
@ -882,6 +883,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
case ALGO_BLAKECOIN: case ALGO_BLAKECOIN:
case ALGO_BLAKE2S: case ALGO_BLAKE2S:
case ALGO_BMW: case ALGO_BMW:
case ALGO_SHA256D:
case ALGO_SHA256T: case ALGO_SHA256T:
case ALGO_VANILLA: case ALGO_VANILLA:
// fast algos require that... (todo: regen hash) // fast algos require that... (todo: regen hash)
@ -2106,6 +2108,7 @@ static void *miner_thread(void *userdata)
case ALGO_BLAKE: case ALGO_BLAKE:
case ALGO_BMW: case ALGO_BMW:
case ALGO_DECRED: case ALGO_DECRED:
case ALGO_SHA256D:
case ALGO_SHA256T: case ALGO_SHA256T:
//case ALGO_WHIRLPOOLX: //case ALGO_WHIRLPOOLX:
minmax = 0x40000000U; minmax = 0x40000000U;
@ -2308,6 +2311,9 @@ static void *miner_thread(void *userdata)
case ALGO_SKEIN2: case ALGO_SKEIN2:
rc = scanhash_skein2(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_skein2(thr_id, &work, max_nonce, &hashes_done);
break; break;
case ALGO_SHA256D:
rc = scanhash_sha256d(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_SHA256T: case ALGO_SHA256T:
rc = scanhash_sha256t(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_sha256t(thr_id, &work, max_nonce, &hashes_done);
break; break;

2
ccminer.vcxproj

@ -414,6 +414,8 @@
<CudaCompile Include="scrypt\titan_kernel.cu"> <CudaCompile Include="scrypt\titan_kernel.cu">
<CodeGeneration>compute_35,sm_35;compute_50,sm_50</CodeGeneration> <CodeGeneration>compute_35,sm_35;compute_50,sm_50</CodeGeneration>
</CudaCompile> </CudaCompile>
<CudaCompile Include="sha256\cuda_sha256d.cu" />
<CudaCompile Include="sha256\sha256d.cu" />
<CudaCompile Include="sha256\cuda_sha256t.cu" /> <CudaCompile Include="sha256\cuda_sha256t.cu" />
<CudaCompile Include="sha256\sha256t.cu" /> <CudaCompile Include="sha256\sha256t.cu" />
<CudaCompile Include="zr5.cu" /> <CudaCompile Include="zr5.cu" />

6
ccminer.vcxproj.filters

@ -859,6 +859,12 @@
<CudaCompile Include="lbry\lbry.cu"> <CudaCompile Include="lbry\lbry.cu">
<Filter>Source Files\CUDA\lbry</Filter> <Filter>Source Files\CUDA\lbry</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="sha256\cuda_sha256d.cu">
<Filter>Source Files\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sha256\sha256d.cu">
<Filter>Source Files\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sha256\cuda_sha256t.cu"> <CudaCompile Include="sha256\cuda_sha256t.cu">
<Filter>Source Files\sha256</Filter> <Filter>Source Files\sha256</Filter>
</CudaCompile> </CudaCompile>

4
miner.h

@ -300,6 +300,7 @@ extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, uns
extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256t(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sha256t(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -319,7 +320,6 @@ extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsig
extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, extern int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done,
unsigned char *scratchbuf, struct timeval *tv_start, struct timeval *tv_end); unsigned char *scratchbuf, struct timeval *tv_start, struct timeval *tv_end);
extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done,
@ -355,6 +355,7 @@ extern void free_nist5(int thr_id);
extern void free_pentablake(int thr_id); extern void free_pentablake(int thr_id);
extern void free_quark(int thr_id); extern void free_quark(int thr_id);
extern void free_qubit(int thr_id); extern void free_qubit(int thr_id);
extern void free_sha256d(int thr_id);
extern void free_sha256t(int thr_id); extern void free_sha256t(int thr_id);
extern void free_sia(int thr_id); extern void free_sia(int thr_id);
extern void free_sib(int thr_id); extern void free_sib(int thr_id);
@ -879,6 +880,7 @@ void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input); void qubithash(void *state, const void *input);
void scrypthash(void* output, const void* input); void scrypthash(void* output, const void* input);
void scryptjane_hash(void* output, const void* input); void scryptjane_hash(void* output, const void* input);
void sha256d_hash(void *output, const void *input);
void sha256t_hash(void *output, const void *input); void sha256t_hash(void *output, const void *input);
void sibhash(void *output, const void *input); void sibhash(void *output, const void *input);
void skeincoinhash(void *output, const void *input); void skeincoinhash(void *output, const void *input);

477
sha256/cuda_sha256d.cu

@ -0,0 +1,477 @@
/*
* sha256d CUDA implementation.
* tpruvot 2017
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include <cuda_helper.h>
#include <miner.h>
__constant__ static uint32_t __align__(8) c_midstate76[8];
__constant__ static uint32_t __align__(8) c_dataEnd80[4];
const __constant__ uint32_t __align__(8) c_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
__constant__ static uint32_t __align__(8) c_K[64];
__constant__ static uint32_t __align__(8) c_target[2];
__device__ uint64_t d_target[1];
static uint32_t* d_resNonces[MAX_GPUS] = { 0 };
// ------------------------------------------------------------------------------------------------
static const uint32_t cpu_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
static const uint32_t cpu_K[64] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#define ROTR ROTR32
__host__
static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__host__
static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__host__
static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]);
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]);
sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]);
sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]);
sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]);
sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]);
sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]);
sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]);
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]);
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]);
sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]);
sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]);
sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]);
sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]);
sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]);
sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
for (int i=0; i<3; i++)
{
sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
#define xor3b(a,b,c) (a ^ b ^ c)
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22));
}
__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
{
return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25));
}
__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
{
return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3));
}
__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
{
return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10));
}
__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c)
{
uint32_t result;
asm("{\n\t"
".reg .u32 m,n,o;\n\t"
"and.b32 m, %1, %2;\n\t"
" or.b32 n, %1, %2;\n\t"
"and.b32 o, n, %3;\n\t"
" or.b32 %0, m, o ;\n\t"
"}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c)
);
return result;
}
__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) {
uint2 result;
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(result.y), "=r"(result.x) : "l"(v));
return result;
}
__device__
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__device__
static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ssg2_1(inx1);
uint32_t ssg20 = ssg2_0(inx3);
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a,b,c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__device__
static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]);
sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]);
sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]);
sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]);
sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]);
sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]);
sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]);
sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]);
sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]);
sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]);
sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]);
sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]);
sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]);
sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]);
sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]);
sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
#pragma unroll
for (int i=0; i<3; i++)
{
sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
__device__
static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]);
sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]);
sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]);
sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]);
sha2_step1(e,f,g,h, a,b,c,d, in[ 4], Kshared[ 4]);
sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]);
sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]);
sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]);
sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]);
sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]);
sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]);
sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]);
sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]);
sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]);
sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]);
sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]);
#pragma unroll
for (int i=0; i<2; i++)
{
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]);
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]);
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]);
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]);
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]);
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]);
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]);
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]);
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]);
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]);
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]);
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]);
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]);
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]);
sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]);
sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]);
}
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]);
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]);
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]);
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]);
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]);
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]);
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]);
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]);
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]);
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]);
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]);
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]);
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]);
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]);
state[6] += g;
state[7] += h;
}
__device__ __forceinline__
uint64_t cuda_swab32ll(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
}
__global__
/*__launch_bounds__(256,3)*/
void sha256d_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
__shared__ uint32_t s_K[64*4];
//s_K[thread & 63] = c_K[thread & 63];
if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t dat[16];
AS_UINT2(dat) = AS_UINT2(c_dataEnd80);
dat[ 2] = c_dataEnd80[2];
dat[ 3] = nonce;
dat[ 4] = 0x80000000;
dat[15] = 0x280;
#pragma unroll
for (int i=5; i<15; i++) dat[i] = 0;
uint32_t buf[8];
#pragma unroll
for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]);
//for (int i=0; i<8; i++) buf[i] = c_midstate76[i];
sha256_round_body(dat, buf, s_K);
// second sha256
#pragma unroll
for (int i=0; i<8; i++) dat[i] = buf[i];
dat[8] = 0x80000000;
#pragma unroll
for (int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll
for (int i=0; i<8; i++) buf[i] = c_H256[i];
sha256_round_last(dat, buf, s_K);
// valid nonces
uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
if (high <= c_target[0]) {
//printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
resNonces[1] = atomicExch(resNonces, nonce);
//d_target[0] = high;
}
}
}
__host__
void sha256d_init(int thr_id)
{
cuda_get_arch(thr_id);
cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t)));
}
__host__
void sha256d_free(int thr_id)
{
if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]);
d_resNonces[thr_id] = NULL;
}
__host__
void sha256d_setBlock_80(uint32_t *pdata, uint32_t *ptarget)
{
uint32_t _ALIGN(64) in[16], buf[8], end[4];
for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]);
for (int i=0;i<8;i++) buf[i] = cpu_H256[i];
for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]);
sha256_round_body_host(in, buf, cpu_K);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
}
__host__
void sha256d_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces)
{
const uint32_t threadsperblock = 256;
dim3 grid(threads/threadsperblock);
dim3 block(threadsperblock);
CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t)));
cudaThreadSynchronize();
sha256d_gpu_hash_shared <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id]);
cudaThreadSynchronize();
CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
if (resNonces[0] == resNonces[1]) {
resNonces[1] = UINT32_MAX;
}
}

2
sha256/cuda_sha256t.cu

@ -368,7 +368,7 @@ static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Ksh
state[7] += h; state[7] += h;
} }
__device__ __device__ __forceinline__
uint64_t cuda_swab32ll(uint64_t x) { uint64_t cuda_swab32ll(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
} }

127
sha256/sha256d.cu

@ -0,0 +1,127 @@
/**
* SHA256d
* by tpruvot@github - 2017
*/
#include <miner.h>
#include <cuda_helper.h>
#include <openssl/sha.h>
// CPU Check
extern "C" void sha256d_hash(void *output, const void *input)
{
unsigned char _ALIGN(64) hash[64];
SHA256_CTX sha256;
SHA256_Init(&sha256);
SHA256_Update(&sha256, (unsigned char *)input, 80);
SHA256_Final(hash, &sha256);
SHA256_Init(&sha256);
SHA256_Update(&sha256, hash, 32);
SHA256_Final((unsigned char *)output, &sha256);
}
static bool init[MAX_GPUS] = { 0 };
extern void sha256d_init(int thr_id);
extern void sha256d_free(int thr_id);
extern void sha256d_setBlock_80(uint32_t *pdata, uint32_t *ptarget);
extern void sha256d_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces);
extern "C" int scanhash_sha256d(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 25);
if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce));
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x03;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
sha256d_init(thr_id);
init[thr_id] = true;
}
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
sha256d_setBlock_80(endiandata, ptarget);
do {
// Hash with CUDA
*hashes_done = pdata[19] - first_nonce + throughput;
sha256d_hash_80(thr_id, throughput, pdata[19], work->nonces);
if (work->nonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash[8];
endiandata[19] = swab32(work->nonces[0]);
sha256d_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
if (work->nonces[1] != UINT32_MAX) {
endiandata[19] = swab32(work->nonces[1]);
sha256d_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces++;
bn_set_target_ratio(work, vhash, 1);
}
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1;
}
return work->valid_nonces;
}
else if (vhash[7] > ptarget[7]) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_sha256d(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
sha256d_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}

3
sph/sha2.c

@ -581,6 +581,7 @@ static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata,
#endif /* HAVE_SHA256_8WAY */ #endif /* HAVE_SHA256_8WAY */
#if 0
int scanhash_sha256d(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) int scanhash_sha256d(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{ {
uint32_t _ALIGN(128) data[64]; uint32_t _ALIGN(128) data[64];
@ -629,3 +630,5 @@ int scanhash_sha256d(int thr_id, struct work* work, uint32_t max_nonce, unsigned
pdata[19] = n; pdata[19] = n;
return 0; return 0;
} }
#endif

6
util.cpp

@ -2231,6 +2231,12 @@ void print_hash_tests(void)
scryptjane_hash(&hash[0], &buf[0]); scryptjane_hash(&hash[0], &buf[0]);
printpfx("scrypt-jane", hash); printpfx("scrypt-jane", hash);
sha256d_hash(&hash[0], &buf[0]);
printpfx("sha256d", hash);
sha256t_hash(&hash[0], &buf[0]);
printpfx("sha256t", hash);
sibhash(&hash[0], &buf[0]); sibhash(&hash[0], &buf[0]);
printpfx("sib", hash); printpfx("sib", hash);

Loading…
Cancel
Save