Browse Source

timetravel algo

+ new kernels jh512-80 groestl-80 and cubehash-80

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
2upstream
Tanguy Pruvot 7 years ago
parent
commit
07ebcb544d
  1. 2
      Makefile.am
  2. 2
      algos.h
  3. 1
      bench.cpp
  4. 6
      ccminer.cpp
  5. 1
      ccminer.vcxproj
  6. 3
      ccminer.vcxproj.filters
  7. 3
      miner.h
  8. 83
      quark/cuda_jh512.cu
  9. 95
      quark/cuda_quark_groestl512.cu
  10. 92
      quark/cuda_quark_groestl512_sm2.cuh
  11. 35
      quark/cuda_quark_keccak512.cu
  12. 2
      qubit/qubit_luffa512.cu
  13. 3
      util.cpp
  14. 121
      x11/cuda_x11_cubehash512.cu
  15. 554
      x11/timetravel.cu

2
Makefile.am

@ -64,7 +64,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -64,7 +64,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu \
x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu \
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
x15/whirlpool.cu \

2
algos.h

@ -43,6 +43,7 @@ enum sha_algos { @@ -43,6 +43,7 @@ enum sha_algos {
ALGO_SKEIN,
ALGO_SKEIN2,
ALGO_S3,
ALGO_TIMETRAVEL,
ALGO_X11EVO,
ALGO_X11,
ALGO_X13,
@ -101,6 +102,7 @@ static const char *algo_names[] = { @@ -101,6 +102,7 @@ static const char *algo_names[] = {
"skein",
"skein2",
"s3",
"timetravel",
"x11evo",
"x11",
"x13",

1
bench.cpp

@ -92,6 +92,7 @@ void algo_free_all(int thr_id) @@ -92,6 +92,7 @@ void algo_free_all(int thr_id)
//free_sha256d(thr_id);
free_scrypt(thr_id);
free_scrypt_jane(thr_id);
free_timetravel(thr_id);
}
// benchmark all algos (called once per mining thread)

6
ccminer.cpp

@ -261,6 +261,7 @@ Options:\n\ @@ -261,6 +261,7 @@ Options:\n\
skein Skein SHA2 (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
s3 S3 (1Coin)\n\
timetravel Machinecoin permuted x8\n\
vanilla Blake256-8 (VNL)\n\
veltor Thorsriddle streebog\n\
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
@ -1619,6 +1620,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) @@ -1619,6 +1620,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_LBRY:
case ALGO_LYRA2v2:
case ALGO_LYRA2Z:
case ALGO_TIMETRAVEL:
work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
break;
case ALGO_KECCAK:
@ -2121,6 +2123,7 @@ static void *miner_thread(void *userdata) @@ -2121,6 +2123,7 @@ static void *miner_thread(void *userdata)
case ALGO_HEAVY:
case ALGO_LYRA2v2:
case ALGO_S3:
case ALGO_TIMETRAVEL:
case ALGO_X11EVO:
case ALGO_X11:
case ALGO_X13:
@ -2333,6 +2336,9 @@ static void *miner_thread(void *userdata) @@ -2333,6 +2336,9 @@ static void *miner_thread(void *userdata)
case ALGO_WILDKECCAK:
rc = scanhash_wildkeccak(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_TIMETRAVEL:
rc = scanhash_timetravel(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_X11EVO:
rc = scanhash_x11evo(thr_id, &work, max_nonce, &hashes_done);
break;

1
ccminer.vcxproj

@ -539,6 +539,7 @@ @@ -539,6 +539,7 @@
<CudaCompile Include="x11\fresh.cu" />
<CudaCompile Include="x11\sib.cu" />
<CudaCompile Include="x11\s3.cu" />
<CudaCompile Include="x11\timetravel.cu" />
<CudaCompile Include="x11\veltor.cu" />
<CudaCompile Include="x11\x11.cu" />
<CudaCompile Include="x11\x11evo.cu" />

3
ccminer.vcxproj.filters

@ -739,6 +739,9 @@ @@ -739,6 +739,9 @@
<CudaCompile Include="x11\s3.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\timetravel.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\veltor.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>

3
miner.h

@ -306,6 +306,7 @@ extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsig @@ -306,6 +306,7 @@ extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsig
extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blake_rounds);
extern int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -360,6 +361,7 @@ extern void free_sib(int thr_id); @@ -360,6 +361,7 @@ extern void free_sib(int thr_id);
extern void free_skeincoin(int thr_id);
extern void free_skein2(int thr_id);
extern void free_s3(int thr_id);
extern void free_timetravel(int thr_id);
extern void free_vanilla(int thr_id);
extern void free_veltor(int thr_id);
extern void free_whirl(int thr_id);
@ -882,6 +884,7 @@ void sibhash(void *output, const void *input); @@ -882,6 +884,7 @@ void sibhash(void *output, const void *input);
void skeincoinhash(void *output, const void *input);
void skein2hash(void *output, const void *input);
void s3hash(void *output, const void *input);
void timetravel_hash(void *output, const void *input);
void veltorhash(void *output, const void *input);
void wcoinhash(void *state, const void *input);
void whirlxHash(void *state, const void *input);

83
quark/cuda_jh512.cu

@ -334,3 +334,86 @@ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, @@ -334,3 +334,86 @@ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
// Setup function
__host__ void quark_jh512_cpu_init(int thr_id, uint32_t threads) {}
#define WANT_JH80
#ifdef WANT_JH80
__constant__
static uint32_t c_PaddedMessage80[20]; // padded message (80 bytes)
__host__
void jh512_setBlock_80(int thr_id, uint32_t *endiandata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
__global__
void jh512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t h[20];
AS_UINT4(&h[ 0]) = AS_UINT4(&c_PaddedMessage80[ 0]);
AS_UINT4(&h[ 4]) = AS_UINT4(&c_PaddedMessage80[ 4]);
AS_UINT4(&h[ 8]) = AS_UINT4(&c_PaddedMessage80[ 8]);
AS_UINT4(&h[12]) = AS_UINT4(&c_PaddedMessage80[12]);
AS_UINT2(&h[16]) = AS_UINT2(&c_PaddedMessage80[16]);
h[18] = c_PaddedMessage80[18];
h[19] = cuda_swab32(startNounce + thread);
uint32_t x[8][4] = { /* init */
{ 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a },
{ 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 },
{ 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea },
{ 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba },
{ 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e },
{ 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d },
{ 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 },
{ 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc }
};
// 1 (could be precomputed)
#pragma unroll
for (int i = 0; i < 16; i++)
x[i/4][i & 3] ^= h[i];
E8(x);
#pragma unroll
for (int i = 0; i < 16; i++)
x[(i+16)/4][(i+16) & 3] ^= h[i];
// 2 (16 bytes with nonce)
#pragma unroll
for (int i = 0; i < 4; i++)
x[0][i] ^= h[16+i];
x[1][0] ^= 0x80U;
E8(x);
#pragma unroll
for (int i = 0; i < 4; i++)
x[4][i] ^= h[16+i];
x[5][0] ^= 0x80U;
// 3 close
x[3][3] ^= 0x80020000U; // 80 bytes = 640bits (0x280)
E8(x);
x[7][3] ^= 0x80020000U;
uint32_t *Hash = &g_outhash[(size_t)16 * thread];
AS_UINT4(&Hash[ 0]) = AS_UINT4(&x[4][0]);
AS_UINT4(&Hash[ 4]) = AS_UINT4(&x[5][0]);
AS_UINT4(&Hash[ 8]) = AS_UINT4(&x[6][0]);
AS_UINT4(&Hash[12]) = AS_UINT4(&x[7][0]);
}
}
__host__
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
jh512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, d_hash);
}
#endif

95
quark/cuda_quark_groestl512.cu

@ -18,6 +18,11 @@ @@ -18,6 +18,11 @@
#include "groestl_transf_quad.h"
#endif
#define WANT_GROESTL80
#ifdef WANT_GROESTL80
__constant__ static uint32_t c_Message80[20];
#endif
#include "cuda_quark_groestl512_sm2.cuh"
__global__ __launch_bounds__(TPB, THF)
@ -114,3 +119,93 @@ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNo @@ -114,3 +119,93 @@ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNo
quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
}
// --------------------------------------------------------------------------------------------------------------------------------------------
#ifdef WANT_GROESTL80
__host__
void groestl512_setBlock_80(int thr_id, uint32_t *endiandata)
{
cudaMemcpyToSymbol(c_Message80, endiandata, sizeof(c_Message80), 0, cudaMemcpyHostToDevice);
}
__global__ __launch_bounds__(TPB, THF)
void groestl512_gpu_hash_80_quad(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash)
{
#if __CUDA_ARCH__ >= 300
// BEWARE : 4-WAY CODE (one hash need 4 threads)
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
if (thread < threads)
{
const uint32_t thr = threadIdx.x & 0x3; // % THF
/*| M0 M1 M2 M3 M4 | M5 M6 M7 | (input)
--|----------------|----------|
T0| 0 4 8 12 16 | 80 |
T1| 1 5 17 | |
T2| 2 6 18 | |
T3| 3 7 Nc | 01 |
--|----------------|----------| TPR */
uint32_t message[8];
#pragma unroll 5
for(int k=0; k<5; k++) message[k] = c_Message80[thr + (k * THF)];
#pragma unroll 3
for(int k=5; k<8; k++) message[k] = 0;
if (thr == 0) message[5] = 0x80U;
if (thr == 3) {
message[4] = cuda_swab32(startNounce + thread);
message[7] = 0x01000000U;
}
uint32_t msgBitsliced[8];
to_bitslice_quad(message, msgBitsliced);
uint32_t state[8];
groestl512_progressMessage_quad(state, msgBitsliced);
uint32_t hash[16];
from_bitslice_quad(state, hash);
if (thr == 0) { /* 4 threads were done */
const off_t hashPosition = thread;
//if (!thread) hash[15] = 0xFFFFFFFF;
uint4 *outpt = (uint4*) &g_outhash[hashPosition << 4];
uint4 *phash = (uint4*) hash;
outpt[0] = phash[0];
outpt[1] = phash[1];
outpt[2] = phash[2];
outpt[3] = phash[3];
}
}
#endif
}
__host__
void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
int dev_id = device_map[thr_id];
if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) {
const uint32_t threadsperblock = TPB;
const uint32_t factor = THF;
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock);
groestl512_gpu_hash_80_quad <<<grid, block>>> (threads, startNounce, d_hash);
} else {
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
groestl512_gpu_hash_80_sm2 <<<grid, block>>> (threads, startNounce, d_hash);
}
}
#endif

92
quark/cuda_quark_groestl512_sm2.cuh

@ -223,6 +223,7 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32 @@ -223,6 +223,7 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32
__host__
void quark_groestl512_sm20_init(int thr_id, uint32_t threads)
{
// Texturen mit obigem Makro initialisieren
texDef(0, t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
texDef(1, t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
texDef(2, t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
@ -265,3 +266,94 @@ void quark_doublegroestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t @@ -265,3 +266,94 @@ void quark_doublegroestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t
quark_groestl512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_hash, d_nonceVector);
}
// --------------------------------------------------------------------------------------------------------------------------------------------
#ifdef WANT_GROESTL80
// defined in groest512.cu
// __constant__ static uint32_t c_Message80[20];
__global__
//__launch_bounds__(256)
void groestl512_gpu_hash_80_sm2(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash)
{
#if __CUDA_ARCH__ < 300 || defined(_DEBUG)
#if USE_SHARED
__shared__ char mixtabs[8 * 1024];
if (threadIdx.x < 256) {
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
*((uint32_t*)mixtabs + ( 256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
*((uint32_t*)mixtabs + ( 512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
*((uint32_t*)mixtabs + ( 768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
}
__syncthreads();
#endif
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t message[32];
#pragma unroll 5
for (int i=0; i < 20; i += 4)
AS_UINT4(&message[i]) = AS_UINT4(&c_Message80[i]);
message[19] = cuda_swab32(startNounce + thread);
message[20] = 0x80U; // end tag
#pragma unroll
for(int i=21; i<31; i++) message[i] = 0U;
message[31] = 0x01000000U; // end block
uint32_t state[32];
#pragma unroll
for(int i=0; i<32; i++) state[i] = message[i];
state[31] ^= 0x00020000U; // "...00000201"
#if USE_SHARED
quark_groestl512_perm_P(state, mixtabs);
quark_groestl512_perm_Q(message, mixtabs);
state[31] ^= 0x00020000U;
#pragma unroll 32
for(int i=0; i<32; i++) state[i] ^= message[i];
#pragma unroll 16
for(int i=16; i<32; i++) message[i] = state[i];
quark_groestl512_perm_P(state, mixtabs);
#else
tex_groestl512_perm_P(state);
tex_groestl512_perm_Q(message);
state[31] ^= 0x00020000U;
#pragma unroll 32
for(int i=0; i<32; i++) state[i] ^= message[i];
#pragma unroll 16
for(int i=16; i<32; i++) message[i] = state[i];
tex_groestl512_perm_P(state);
#endif
#pragma unroll 16
for(int i=16; i<32; i++) state[i] ^= message[i];
// uint4 = 4 x uint32_t = 16 bytes, x 4 => 64 bytes
const off_t hashPosition = thread;
uint4 *outpt = (uint4*) (&g_outhash[hashPosition << 4]);
uint4 *phash = (uint4*) (&state[16]);
outpt[0] = phash[0];
outpt[1] = phash[1];
outpt[2] = phash[2];
outpt[3] = phash[3];
}
#endif
}
#endif // WANT_GROESTL80

35
quark/cuda_quark_keccak512.cu

@ -232,15 +232,6 @@ void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uin @@ -232,15 +232,6 @@ void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uin
}
}
__host__
void quark_keccak512_cpu_init(int thr_id, uint32_t threads)
{
cudaMemcpyToSymbol( d_keccak_round_constants,
host_keccak_round_constants,
sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice);
}
__host__
void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
@ -258,3 +249,29 @@ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou @@ -258,3 +249,29 @@ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou
MyStreamSynchronize(NULL, order, thr_id);
}
void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
__host__
void quark_keccak512_cpu_init(int thr_id, uint32_t threads)
{
// required for the 64 bytes one
cudaMemcpyToSymbol(d_keccak_round_constants, host_keccak_round_constants,
sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice);
jackpot_keccak512_cpu_init(thr_id, threads);
}
__host__
void keccak512_setBlock_80(int thr_id, uint32_t *endiandata)
{
jackpot_keccak512_cpu_setBlock((void*)endiandata, 80);
}
__host__
void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
jackpot_keccak512_cpu_hash(thr_id, threads, startNounce, d_hash, 0);
}

2
qubit/qubit_luffa512.cu

@ -469,7 +469,7 @@ void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun @@ -469,7 +469,7 @@ void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun
}
__host__
void qubit_cpu_precalc(uint32_t* message)
static void qubit_cpu_precalc(uint32_t* message)
{
uint32_t statebuffer[8];
uint32_t statechainv[40] =

3
util.cpp

@ -2243,6 +2243,9 @@ void print_hash_tests(void) @@ -2243,6 +2243,9 @@ void print_hash_tests(void)
s3hash(&hash[0], &buf[0]);
printpfx("S3", hash);
timetravel_hash(&hash[0], &buf[0]);
printpfx("timetravel", hash);
blake256hash(&hash[0], &buf[0], 8);
printpfx("vanilla", hash);

121
x11/cuda_x11_cubehash512.cu

@ -254,58 +254,113 @@ static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) @@ -254,58 +254,113 @@ static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
/***************************************************/
// GPU Hash Function
__global__
void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
uint32_t x[2][2][2][2][2];
Init(x);
uint32_t x[2][2][2][2][2];
Init(x);
// erste Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)Hash);
// erste Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)Hash);
// zweite Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)(Hash+8));
// zweite Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)(Hash+8));
// Padding Block
uint32_t last[8];
last[0] = 0x80;
#pragma unroll 7
for (int i=1; i < 8; i++) last[i] = 0;
Update32(x, (const BitSequence*)last);
// Padding Block
uint32_t last[8];
last[0] = 0x80;
#pragma unroll 7
for (int i=1; i < 8; i++) last[i] = 0;
Update32(x, (const BitSequence*)last);
Final(x, (BitSequence*)Hash);
}
Final(x, (BitSequence*)Hash);
}
}
// Setup-Funktionen
__host__
void x11_cubehash512_cpu_init(int thr_id, uint32_t threads)
void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
}
__host__
void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { }
/***************************************************/
#define WANT_CUBEHASH80
#ifdef WANT_CUBEHASH80
__constant__
static uint32_t c_PaddedMessage80[20];
__host__
void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata)
{
const uint32_t threadsperblock = 256;
cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
__global__
void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNounce + thread;
uint32_t x[2][2][2][2][2];
Init(x);
uint32_t message[8];
// first 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]);
Update32(x, (const BitSequence*)message);
// second 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[8]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]);
Update32(x, (const BitSequence*)message);
// last 16 bytes + Padding
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]);
message[3] = cuda_swab32(nonce);
message[4] = 0x80;
message[5] = 0;
message[6] = 0;
message[7] = 0;
Update32(x, (const BitSequence*)message);
BitSequence* output = (BitSequence*) (&g_outhash[(size_t)8 * thread]);
Final(x, output);
}
}
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
__host__
void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
//MyStreamSynchronize(NULL, order, thr_id);
cubehash512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash);
}
#endif

554
x11/timetravel.cu

@ -0,0 +1,554 @@ @@ -0,0 +1,554 @@
/**
* Timetravel CUDA implementation
* by tpruvot@github - March 2017
*/
#include <stdio.h>
#include <memory.h>
#include <unistd.h>
#define HASH_FUNC_BASE_TIMESTAMP 1389040865U // Machinecoin Genesis Timestamp
#define HASH_FUNC_COUNT 8
#define HASH_FUNC_COUNT_PERMUTATIONS 40320U
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#if HASH_FUNC_COUNT > 8
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#endif
}
#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x11.h"
static uint32_t *d_hash[MAX_GPUS];
enum Algo {
BLAKE = 0,
BMW,
GROESTL,
SKEIN,
JH,
KECCAK,
LUFFA,
CUBEHASH,
#if HASH_FUNC_COUNT > 8
SHAVITE,
SIMD,
ECHO,
#endif
MAX_ALGOS_COUNT
};
static const char* algo_strings[] = {
"blake",
"bmw512",
"groestl",
"skein",
"jh512",
"keccak",
"luffa",
"cube",
NULL
};
inline void swap8(uint8_t *a, uint8_t *b)
{
uint8_t t = *a;
*a = *b;
*b = t;
}
inline void initPerm(uint8_t n[], int count)
{
for (int i = 0; i < count; i++)
n[i] = i;
}
static int nextPerm(uint8_t n[], int count)
{
int tail, i, j;
if (count <= 1)
return 0;
for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--);
tail = i;
if (tail > 0) {
for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--);
swap8(&n[tail - 1], &n[j]);
}
for (i = tail, j = count - 1; i<j; i++, j--)
swap8(&n[i], &n[j]);
return (tail != 0);
}
static void getAlgoString(char *str, int seq)
{
uint8_t algoList[HASH_FUNC_COUNT];
char *sptr;
initPerm(algoList, HASH_FUNC_COUNT);
for (int k = 0; k < seq; k++) {
nextPerm(algoList, HASH_FUNC_COUNT);
}
sptr = str;
for (int j = 0; j < HASH_FUNC_COUNT; j++) {
if (algoList[j] >= 10)
sprintf(sptr, "%c", 'A' + (algoList[j] - 10));
else
sprintf(sptr, "%u", (uint32_t) algoList[j]);
sptr++;
}
*sptr = '\0';
}
static __thread uint32_t s_ntime = 0;
static uint32_t s_sequence = UINT32_MAX;
static uint8_t s_firstalgo = 0xFF;
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
#define INITIAL_DATE HASH_FUNC_BASE_TIMESTAMP
static inline uint32_t getCurrentAlgoSeq(uint32_t ntime)
{
// unlike x11evo, the permutation changes often (with ntime)
return (uint32_t) (ntime - INITIAL_DATE) % HASH_FUNC_COUNT_PERMUTATIONS;
}
// To finish...
static void get_travel_order(uint32_t ntime, char *permstr)
{
uint32_t seq = getCurrentAlgoSeq(ntime);
if (s_sequence != seq) {
getAlgoString(permstr, seq);
s_sequence = seq;
}
}
// CPU Hash
extern "C" void timetravel_hash(void *output, const void *input)
{
uint32_t _ALIGN(64) hash[64/4] = { 0 };
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_skein512_context ctx_skein;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_luffa512_context ctx_luffa1;
sph_cubehash512_context ctx_cubehash1;
#if HASH_FUNC_COUNT > 8
sph_shavite512_context ctx_shavite1;
sph_simd512_context ctx_simd1;
sph_echo512_context ctx_echo1;
#endif
if (s_sequence == UINT32_MAX) {
uint32_t *data = (uint32_t*) input;
const uint32_t ntime = (opt_benchmark || !data[17]) ? (uint32_t) time(NULL) : data[17];
get_travel_order(ntime, hashOrder);
}
void *in = (void*) input;
int size = 80;
const int hashes = (int) strlen(hashOrder);
for (int i = 0; i < hashes; i++)
{
const char elem = hashOrder[i];
uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
if (i > 0) {
in = (void*) hash;
size = 64;
}
switch (algo) {
case BLAKE:
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, in, size);
sph_blake512_close(&ctx_blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx_bmw);
sph_bmw512(&ctx_bmw, in, size);
sph_bmw512_close(&ctx_bmw, hash);
break;
case GROESTL:
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, in, size);
sph_groestl512_close(&ctx_groestl, hash);
//applog_hex((void*)hash, 32);
break;
case SKEIN:
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, in, size);
sph_skein512_close(&ctx_skein, hash);
break;
case JH:
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, in, size);
sph_jh512_close(&ctx_jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, in, size);
sph_keccak512_close(&ctx_keccak, hash);
break;
case LUFFA:
sph_luffa512_init(&ctx_luffa1);
sph_luffa512(&ctx_luffa1, in, size);
sph_luffa512_close(&ctx_luffa1, hash);
break;
case CUBEHASH:
sph_cubehash512_init(&ctx_cubehash1);
sph_cubehash512(&ctx_cubehash1, in, size);
sph_cubehash512_close(&ctx_cubehash1, hash);
break;
#if HASH_FUNC_COUNT > 8
case SHAVITE:
sph_shavite512_init(&ctx_shavite1);
sph_shavite512(&ctx_shavite1, in, size);
sph_shavite512_close(&ctx_shavite1, hash);
break;
case SIMD:
sph_simd512_init(&ctx_simd1);
sph_simd512(&ctx_simd1, in, size);
sph_simd512_close(&ctx_simd1, hash);
break;
case ECHO:
sph_echo512_init(&ctx_echo1);
sph_echo512(&ctx_echo1, in, size);
sph_echo512_close(&ctx_echo1, hash);
break;
#endif
}
}
memcpy(output, hash, 32);
}
static uint32_t get_next_time(uint32_t ntime, char* curOrder)
{
char nextOrder[HASH_FUNC_COUNT + 1] = { 0 };
uint32_t secs = 15;
do {
uint32_t nseq = getCurrentAlgoSeq(ntime+secs);
getAlgoString(nextOrder, nseq);
secs += 15;
} while (curOrder[0] == nextOrder[0]);
return secs;
}
//#define _DEBUG
#define _DEBUG_PREFIX "tt-"
#include "cuda_debug.cuh"
void quark_bmw512_cpu_setBlock_80(void *pdata);
void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
void groestl512_setBlock_80(int thr_id, uint32_t *endiandata);
void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void skein512_cpu_setBlock_80(void *pdata);
void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
void qubit_luffa512_cpu_setBlock_80(void *pdata);
void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void keccak512_setBlock_80(int thr_id, uint32_t *endiandata);
void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
// if (opt_benchmark) pdata[17] = swab32(0x5886a4be); // TO DEBUG GROESTL 80
if (opt_debug || s_ntime != pdata[17] || s_sequence == UINT32_MAX) {
uint32_t ntime = swab32(work->data[17]);
get_travel_order(ntime, hashOrder);
s_ntime = pdata[17];
if (opt_debug && !thr_id) {
applog(LOG_DEBUG, "timetravel hash order %s (%08x)", hashOrder, ntime);
}
}
if (opt_benchmark)
ptarget[7] = 0x5;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes)
x11_luffa512_cpu_init(thr_id, throughput);
x11_cubehash512_cpu_init(thr_id, throughput);
#if HASH_FUNC_COUNT > 8
x11_shavite512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0;
}
#endif
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
cuda_check_cpu_setTarget(ptarget);
const int hashes = (int) strlen(hashOrder);
const char first = hashOrder[0];
const uint8_t algo80 = first >= 'A' ? first - 'A' + 10 : first - '0';
if (algo80 != s_firstalgo) {
s_firstalgo = algo80;
applog(LOG_INFO, "Timetravel first algo is now %s", algo_strings[algo80 % HASH_FUNC_COUNT]);
}
switch (algo80) {
case BLAKE:
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
break;
case BMW:
quark_bmw512_cpu_setBlock_80(endiandata);
break;
case GROESTL:
groestl512_setBlock_80(thr_id, endiandata);
break;
case SKEIN:
skein512_cpu_setBlock_80((void*)endiandata);
break;
case JH:
jh512_setBlock_80(thr_id, endiandata);
break;
case KECCAK:
keccak512_setBlock_80(thr_id, endiandata);
break;
case LUFFA:
qubit_luffa512_cpu_setBlock_80((void*)endiandata);
break;
case CUBEHASH:
cubehash512_setBlock_80(thr_id, endiandata);
break;
default: {
uint32_t next = get_next_time(swab32(s_ntime), hashOrder);
if (!thr_id)
applog(LOG_WARNING, "kernel %c unimplemented, next in %u mn", first, next/60);
sleep(next > 30 ? 60 : 10);
return -1;
}
}
do {
int order = 0;
// Hash with CUDA
switch (algo80) {
case BLAKE:
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("blake80:");
break;
case BMW:
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("bmw80 :");
break;
case GROESTL:
groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("grstl80:");
break;
case SKEIN:
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
TRACE("skein80:");
break;
case JH:
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("jh51280:");
break;
case KECCAK:
keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("kecck80:");
break;
case LUFFA:
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("luffa80:");
break;
case CUBEHASH:
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("cube 80:");
break;
}
for (int i = 1; i < hashes; i++)
{
const char elem = hashOrder[i];
const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo64) {
case BLAKE:
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("blake :");
break;
case BMW:
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("bmw :");
break;
case GROESTL:
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("groestl:");
break;
case SKEIN:
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("skein :");
break;
case JH:
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("jh512 :");
break;
case KECCAK:
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :");
break;
case LUFFA:
x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("luffa :");
break;
case CUBEHASH:
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("cube :");
break;
#if HASH_FUNC_COUNT > 8
case SHAVITE:
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shavite:");
break;
case SIMD:
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("simd :");
break;
case ECHO:
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("echo :");
break;
#endif
}
}
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash[8];
const uint32_t Htarg = ptarget[7];
be32enc(&endiandata[19], work->nonces[0]);
timetravel_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
pdata[19] = work->nonces[0];
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
timetravel_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
}
pdata[19] = max(pdata[19], work->nonces[1]) + 1;
}
return work->valid_nonces;
} else if (vhash[7] > Htarg) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_timetravel(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
#if HASH_FUNC_COUNT > 8
x11_simd512_cpu_free(thr_id);
#endif
cuda_check_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}
Loading…
Cancel
Save