Browse Source

sha256t algo (3x sha256, not sha256d)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
master
Tanguy Pruvot 7 years ago
parent
commit
ebef3c8656
  1. 1
      Makefile.am
  2. 2
      algos.h
  3. 1
      bench.cpp
  4. 6
      ccminer.cpp
  5. 2
      ccminer.vcxproj
  6. 9
      ccminer.vcxproj.filters
  7. 3
      miner.h
  8. 490
      sha256/cuda_sha256t.cu
  9. 133
      sha256/sha256t.cu

1
Makefile.am

@ -51,6 +51,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \ quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \
neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \ neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \
pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \ pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \
sha256/sha256t.cu sha256/cuda_sha256t.cu \
sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \ sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \

2
algos.h

@ -35,6 +35,7 @@ enum sha_algos {
ALGO_QUBIT, ALGO_QUBIT,
ALGO_SCRYPT, ALGO_SCRYPT,
ALGO_SCRYPT_JANE, ALGO_SCRYPT_JANE,
ALGO_SHA256T,
ALGO_SIA, ALGO_SIA,
ALGO_SIB, ALGO_SIB,
ALGO_SKEIN, ALGO_SKEIN,
@ -90,6 +91,7 @@ static const char *algo_names[] = {
"qubit", "qubit",
"scrypt", "scrypt",
"scrypt-jane", "scrypt-jane",
"sha256t",
"sia", "sia",
"sib", "sib",
"skein", "skein",

1
bench.cpp

@ -71,6 +71,7 @@ void algo_free_all(int thr_id)
free_qubit(thr_id); free_qubit(thr_id);
free_skeincoin(thr_id); free_skeincoin(thr_id);
free_skein2(thr_id); free_skein2(thr_id);
free_sha256t(thr_id);
free_sia(thr_id); free_sia(thr_id);
free_sib(thr_id); free_sib(thr_id);
free_s3(thr_id); free_s3(thr_id);

6
ccminer.cpp

@ -251,6 +251,7 @@ Options:\n\
penta Pentablake hash (5x Blake 512)\n\ penta Pentablake hash (5x Blake 512)\n\
quark Quark\n\ quark Quark\n\
qubit Qubit\n\ qubit Qubit\n\
sha256t SHA256 x3\n\
sia SIA (Blake2B)\n\ sia SIA (Blake2B)\n\
sib Sibcoin (X11+Streebog)\n\ sib Sibcoin (X11+Streebog)\n\
scrypt Scrypt\n\ scrypt Scrypt\n\
@ -878,6 +879,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
case ALGO_BLAKECOIN: case ALGO_BLAKECOIN:
case ALGO_BLAKE2S: case ALGO_BLAKE2S:
case ALGO_BMW: case ALGO_BMW:
case ALGO_SHA256T:
case ALGO_VANILLA: case ALGO_VANILLA:
// fast algos require that... (todo: regen hash) // fast algos require that... (todo: regen hash)
check_dups = true; check_dups = true;
@ -2099,6 +2101,7 @@ static void *miner_thread(void *userdata)
case ALGO_BLAKE: case ALGO_BLAKE:
case ALGO_BMW: case ALGO_BMW:
case ALGO_DECRED: case ALGO_DECRED:
case ALGO_SHA256T:
//case ALGO_WHIRLPOOLX: //case ALGO_WHIRLPOOLX:
minmax = 0x40000000U; minmax = 0x40000000U;
break; break;
@ -2292,6 +2295,9 @@ static void *miner_thread(void *userdata)
case ALGO_SKEIN2: case ALGO_SKEIN2:
rc = scanhash_skein2(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_skein2(thr_id, &work, max_nonce, &hashes_done);
break; break;
case ALGO_SHA256T:
rc = scanhash_sha256t(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_SIA: case ALGO_SIA:
rc = scanhash_sia(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_sia(thr_id, &work, max_nonce, &hashes_done);
break; break;

2
ccminer.vcxproj

@ -417,6 +417,8 @@
<CudaCompile Include="scrypt\titan_kernel.cu"> <CudaCompile Include="scrypt\titan_kernel.cu">
<CodeGeneration>compute_35,sm_35;compute_50,sm_50</CodeGeneration> <CodeGeneration>compute_35,sm_35;compute_50,sm_50</CodeGeneration>
</CudaCompile> </CudaCompile>
<CudaCompile Include="sha256\cuda_sha256t.cu" />
<CudaCompile Include="sha256\sha256t.cu" />
<CudaCompile Include="zr5.cu" /> <CudaCompile Include="zr5.cu" />
<CudaCompile Include="heavy\cuda_blake512.cu"> <CudaCompile Include="heavy\cuda_blake512.cu">
</CudaCompile> </CudaCompile>

9
ccminer.vcxproj.filters

@ -85,6 +85,9 @@
<Filter Include="Source Files\CUDA\lbry"> <Filter Include="Source Files\CUDA\lbry">
<UniqueIdentifier>{3079ea1f-f768-455a-acd6-f517fac535b4}</UniqueIdentifier> <UniqueIdentifier>{3079ea1f-f768-455a-acd6-f517fac535b4}</UniqueIdentifier>
</Filter> </Filter>
<Filter Include="Source Files\sha256">
<UniqueIdentifier>{86a896c0-1688-4854-98e3-285d166069a3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\sia"> <Filter Include="Source Files\sia">
<UniqueIdentifier>{86a896c0-1688-4854-98e0-285d166069a3}</UniqueIdentifier> <UniqueIdentifier>{86a896c0-1688-4854-98e0-285d166069a3}</UniqueIdentifier>
</Filter> </Filter>
@ -832,6 +835,12 @@
<CudaCompile Include="lbry\lbry.cu"> <CudaCompile Include="lbry\lbry.cu">
<Filter>Source Files\CUDA\lbry</Filter> <Filter>Source Files\CUDA\lbry</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="sha256\cuda_sha256t.cu">
<Filter>Source Files\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sha256\sha256t.cu">
<Filter>Source Files\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sia\sia.cu"> <CudaCompile Include="sia\sia.cu">
<Filter>Source Files\sia</Filter> <Filter>Source Files\sia</Filter>
</CudaCompile> </CudaCompile>

3
miner.h

@ -298,6 +298,7 @@ extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, uns
extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256t(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -349,6 +350,7 @@ extern void free_nist5(int thr_id);
extern void free_pentablake(int thr_id); extern void free_pentablake(int thr_id);
extern void free_quark(int thr_id); extern void free_quark(int thr_id);
extern void free_qubit(int thr_id); extern void free_qubit(int thr_id);
extern void free_sha256t(int thr_id);
extern void free_sia(int thr_id); extern void free_sia(int thr_id);
extern void free_sib(int thr_id); extern void free_sib(int thr_id);
extern void free_skeincoin(int thr_id); extern void free_skeincoin(int thr_id);
@ -869,6 +871,7 @@ void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input); void qubithash(void *state, const void *input);
void scrypthash(void* output, const void* input); void scrypthash(void* output, const void* input);
void scryptjane_hash(void* output, const void* input); void scryptjane_hash(void* output, const void* input);
void sha256t_hash(void *output, const void *input);
void sibhash(void *output, const void *input); void sibhash(void *output, const void *input);
void skeincoinhash(void *output, const void *input); void skeincoinhash(void *output, const void *input);
void skein2hash(void *output, const void *input); void skein2hash(void *output, const void *input);

490
sha256/cuda_sha256t.cu

@ -0,0 +1,490 @@
/*
* sha256(-t) CUDA implementation.
* tpruvot 2017
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include <cuda_helper.h>
#include <miner.h>
__constant__ static uint32_t __align__(8) c_midstate76[8];
__constant__ static uint32_t __align__(8) c_dataEnd80[4];
const __constant__ uint32_t __align__(8) c_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
__constant__ static uint32_t __align__(8) c_K[64];
__constant__ static uint32_t __align__(8) c_target[2];
__device__ uint64_t d_target[1];
static uint32_t* d_resNonces[MAX_GPUS] = { 0 };
// ------------------------------------------------------------------------------------------------
static const uint32_t cpu_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
static const uint32_t cpu_K[64] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#define ROTR ROTR32
__host__
static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__host__
static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__host__
static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]);
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]);
sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]);
sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]);
sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]);
sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]);
sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]);
sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]);
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]);
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]);
sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]);
sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]);
sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]);
sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]);
sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]);
sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
for (int i=0; i<3; i++)
{
sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
#define xor3b(a,b,c) (a ^ b ^ c)
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22));
}
__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
{
return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25));
}
__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
{
return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3));
}
__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
{
return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10));
}
__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c)
{
uint32_t result;
asm("{\n\t"
".reg .u32 m,n,o;\n\t"
"and.b32 m, %1, %2;\n\t"
" or.b32 n, %1, %2;\n\t"
"and.b32 o, n, %3;\n\t"
" or.b32 %0, m, o ;\n\t"
"}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c)
);
return result;
}
__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) {
uint2 result;
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(result.y), "=r"(result.x) : "l"(v));
return result;
}
__device__
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__device__
static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ssg2_1(inx1);
uint32_t ssg20 = ssg2_0(inx3);
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a,b,c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__device__
static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]);
sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]);
sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]);
sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]);
sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]);
sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]);
sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]);
sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]);
sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]);
sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]);
sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]);
sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]);
sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]);
sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]);
sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]);
sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
#pragma unroll
for (int i=0; i<3; i++)
{
sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
__device__
static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]);
sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]);
sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]);
sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]);
sha2_step1(e,f,g,h, a,b,c,d, in[ 4], Kshared[ 4]);
sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]);
sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]);
sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]);
sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]);
sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]);
sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]);
sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]);
sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]);
sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]);
sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]);
sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]);
#pragma unroll
for (int i=0; i<2; i++)
{
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]);
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]);
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]);
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]);
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]);
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]);
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]);
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]);
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]);
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]);
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]);
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]);
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]);
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]);
sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]);
sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]);
}
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]);
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]);
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]);
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]);
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]);
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]);
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]);
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]);
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]);
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]);
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]);
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]);
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]);
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]);
state[6] += g;
state[7] += h;
}
__device__
uint64_t cuda_swab32ll(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
}
__global__
/*__launch_bounds__(256,3)*/
void sha256t_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
__shared__ uint32_t s_K[64*4];
//s_K[thread & 63] = c_K[thread & 63];
if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t dat[16];
AS_UINT2(dat) = AS_UINT2(c_dataEnd80);
dat[ 2] = c_dataEnd80[2];
dat[ 3] = nonce;
dat[ 4] = 0x80000000;
dat[15] = 0x280;
#pragma unroll
for (int i=5; i<15; i++) dat[i] = 0;
uint32_t buf[8];
#pragma unroll
for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]);
//for (int i=0; i<8; i++) buf[i] = c_midstate76[i];
sha256_round_body(dat, buf, s_K);
// second sha256
#pragma unroll
for (int i=0; i<8; i++) dat[i] = buf[i];
dat[8] = 0x80000000;
#pragma unroll
for (int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll
for (int i=0; i<8; i++) buf[i] = c_H256[i];
sha256_round_body(dat, buf, s_K);
// last sha256
#pragma unroll
for (int i=0; i<8; i++) dat[i] = buf[i];
dat[8] = 0x80000000;
#pragma unroll
for (int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll
for (int i=0; i<8; i++) buf[i] = c_H256[i];
sha256_round_last(dat, buf, s_K);
// valid nonces
uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
if (high <= c_target[0]) {
//printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
resNonces[1] = atomicExch(resNonces, nonce);
//d_target[0] = high;
}
}
}
__host__
void sha256t_init(int thr_id)
{
cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t)));
}
__host__
void sha256t_free(int thr_id)
{
if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]);
d_resNonces[thr_id] = NULL;
}
__host__
void sha256t_setBlock_80(uint32_t *pdata, uint32_t *ptarget)
{
uint32_t _ALIGN(64) in[16], buf[8], end[4];
for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]);
for (int i=0;i<8;i++) buf[i] = cpu_H256[i];
for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]);
sha256_round_body_host(in, buf, cpu_K);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
}
__host__
void sha256t_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces)
{
const uint32_t threadsperblock = 128;
dim3 grid(threads/threadsperblock);
dim3 block(threadsperblock);
CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t)));
cudaThreadSynchronize();
sha256t_gpu_hash_shared <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id]);
cudaThreadSynchronize();
CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
if (resNonces[0] == resNonces[1]) {
resNonces[1] = UINT32_MAX;
}
}

133
sha256/sha256t.cu

@ -0,0 +1,133 @@
/**
* SHA256 3x
* by tpruvot@github - 2017
*/
#include <miner.h>
#include <cuda_helper.h>
#include <openssl/sha.h>
// CPU Check
extern "C" void sha256t_hash(void *output, const void *input)
{
unsigned char _ALIGN(64) hash[64];
SHA256_CTX sha256;
SHA256_Init(&sha256);
SHA256_Update(&sha256, (unsigned char *)input, 80);
SHA256_Final(hash, &sha256);
SHA256_Init(&sha256);
SHA256_Update(&sha256, hash, 32);
SHA256_Final(hash, &sha256);
SHA256_Init(&sha256);
SHA256_Update(&sha256, hash, 32);
SHA256_Final((unsigned char *)output, &sha256);
}
static bool init[MAX_GPUS] = { 0 };
extern void sha256t_init(int thr_id);
extern void sha256t_free(int thr_id);
extern void sha256t_setBlock_80(uint32_t *pdata, uint32_t *ptarget);
extern void sha256t_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces);
extern "C" int scanhash_sha256t(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 23);
if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce));
bool checkSecnonce = true;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x03;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
sha256t_init(thr_id);
init[thr_id] = true;
}
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
sha256t_setBlock_80(endiandata, ptarget);
do {
// Hash with CUDA
*hashes_done = pdata[19] - first_nonce + throughput;
sha256t_hash_80(thr_id, throughput, pdata[19], work->nonces);
if (work->nonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash[8];
endiandata[19] = swab32(work->nonces[0]);
sha256t_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
if (work->nonces[1] != UINT32_MAX) {
endiandata[19] = swab32(work->nonces[1]);
sha256t_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces++;
bn_set_target_ratio(work, vhash, 1);
}
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1;
}
return work->valid_nonces;
}
else if (vhash[7] > ptarget[7]) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_sha256t(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
sha256t_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}
Loading…
Cancel
Save