Browse Source

Add zr5 algo (for SM 3.5+)

uint4 copy + keccak cleanup, groestl: small uint4 opt

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
master
Tanguy Pruvot 10 years ago
parent
commit
a37e909db9
  1. 281
      JHA/cuda_jha_keccak512.cu
  2. 2
      Makefile.am
  3. 12
      README.txt
  4. 74
      ccminer.cpp
  5. 12
      ccminer.vcxproj
  6. 9
      ccminer.vcxproj.filters
  7. 2
      configure.ac
  8. 30
      cpuminer-config.h
  9. 5
      miner.h
  10. 15
      quark/cuda_quark_groestl512.cu
  11. 39
      util.cpp
  12. 342
      zr5.cu

281
JHA/cuda_jha_keccak512.cu

@ -3,8 +3,14 @@
#include "cuda_helper.h" #include "cuda_helper.h"
__constant__ uint64_t c_State[25]; // ZR5
__constant__ uint32_t d_OriginalData[20];
__constant__ uint32_t c_PaddedMessage[18]; __constant__ uint32_t c_PaddedMessage[18];
__constant__ uint64_t c_State[25];
#define POK_DATA_MASK 0xFFFF0000
#define POK_VERSION 0x1
#define U32TO64_LE(p) \ #define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
@ -35,7 +41,7 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
uint64_t t[5], u[5], v, w; uint64_t t[5], u[5], v, w;
/* absorb input */ /* absorb input */
#pragma unroll 9 #pragma unroll 9
for (i = 0; i < 72 / 8; i++, in += 2) for (i = 0; i < 72 / 8; i++, in += 2)
s[i] ^= U32TO64_LE(in); s[i] ^= U32TO64_LE(in);
@ -100,52 +106,9 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
} }
} }
// Keccak-512 kernel for the "jackpot" hash: one thread handles one nonce.
//   threads     - number of nonces to process; threads past this count do nothing
//   startNounce - nonce handled by thread 0; thread t handles startNounce + t
//   g_hash      - output buffer; each thread writes 16 32-bit words (64 bytes)
//                 starting at g_hash[8 * t]
// Reads the block template from c_PaddedMessage and the pre-absorbed state from c_State.
__global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
// output slot of this thread (equals the thread index)
int hashPosition = nounce - startNounce;
// copy the message
uint32_t message[18];
#pragma unroll 18
for(int i=0;i<18;i++)
message[i] = c_PaddedMessage[i];
// insert this thread's nonce (cuda_swab32 is presumably a 32-bit byte swap - confirm in cuda_helper.h)
message[1] = cuda_swab32(nounce);
// initialize the state
uint64_t keccak_gpu_state[25];
#pragma unroll 25
for (int i=0; i<25; i++)
keccak_gpu_state[i] = c_State[i];
// run the block through keccak_block once
keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
// produce the hash: first 8 state lanes split into 16 32-bit words
uint32_t hash[16];
#pragma unroll 8
for (size_t i = 0; i < 64; i += 8) {
U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
}
// done: store the 16 words to this thread's output slot
uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition];
#pragma unroll 16
for(int i=0;i<16;i++)
outpHash[i] = hash[i];
}
}
// Setup-Funktionen // Setup-Funktionen
__host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads) __host__
void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher // Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants, cudaMemcpyToSymbol( c_keccak_round_constants,
@ -174,10 +137,8 @@ __host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
#else #else
#define ROL_mult8(a, offset) ROL(a, offset) #define ROL_mult8(a, offset) ROL(a, offset)
#endif #endif
void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount );
const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = {
{
(tKeccakLane)0x0000000000000001ULL, (tKeccakLane)0x0000000000000001ULL,
(tKeccakLane)0x0000000000008082ULL, (tKeccakLane)0x0000000000008082ULL,
(tKeccakLane)0x800000000000808aULL, (tKeccakLane)0x800000000000808aULL,
@ -196,29 +157,25 @@ const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] =
(tKeccakLane)0x8000000000008003ULL, (tKeccakLane)0x8000000000008003ULL,
(tKeccakLane)0x8000000000008002ULL, (tKeccakLane)0x8000000000008002ULL,
(tKeccakLane)0x8000000000000080ULL (tKeccakLane)0x8000000000000080ULL
#if (cKeccakB >= 400) #if (cKeccakB >= 400)
, (tKeccakLane)0x000000000000800aULL, , (tKeccakLane)0x000000000000800aULL,
(tKeccakLane)0x800000008000000aULL (tKeccakLane)0x800000008000000aULL
#if (cKeccakB >= 800) #if (cKeccakB >= 800)
, (tKeccakLane)0x8000000080008081ULL, , (tKeccakLane)0x8000000080008081ULL,
(tKeccakLane)0x8000000000008080ULL (tKeccakLane)0x8000000000008080ULL
#if (cKeccakB == 1600) #if (cKeccakB == 1600)
, (tKeccakLane)0x0000000080000001ULL, , (tKeccakLane)0x0000000080000001ULL,
(tKeccakLane)0x8000000080008008ULL (tKeccakLane)0x8000000080008008ULL
#endif #endif
#endif #endif
#endif #endif
}; };
void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ) void KeccakF(tKeccakLane * state, const tKeccakLane *in, int laneCount)
{ {
while ( --laneCount >= 0 ) {
{
while ( --laneCount >= 0 )
{
state[laneCount] ^= in[laneCount]; state[laneCount] ^= in[laneCount];
} }
}
{ {
tKeccakLane Aba, Abe, Abi, Abo, Abu; tKeccakLane Aba, Abe, Abi, Abo, Abu;
@ -487,7 +444,8 @@ void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount )
} }
// inlen kann 72...143 betragen // inlen kann 72...143 betragen
__host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) __host__
void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
{ {
const unsigned char *in = (const unsigned char*)pdata; const unsigned char *in = (const unsigned char*)pdata;
@ -496,43 +454,218 @@ __host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
memset( state, 0, sizeof(state) ); memset( state, 0, sizeof(state) );
for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes ) for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes)
{ {
KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) ); KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) );
} }
// Kopiere den state nach der ersten Runde (nach Absorption von 72 Bytes Inputdaten) // Copy state of the first round (72 Bytes)
// ins Constant Memory // in Constant Memory
cudaMemcpyToSymbol( c_State, cudaMemcpyToSymbol( c_State,
state, state,
sizeof(state), sizeof(state),
0, cudaMemcpyHostToDevice); 0, cudaMemcpyHostToDevice);
// padding // second part
memcpy( temp, in, (size_t)inlen ); memcpy(temp, in, inlen);
temp[inlen++] = 1; temp[inlen++] = 1;
memset( temp+inlen, 0, cKeccakR_SizeInBytes - (size_t)inlen ); memset(temp + inlen, 0, cKeccakR_SizeInBytes - inlen);
temp[cKeccakR_SizeInBytes-1] |= 0x80; temp[cKeccakR_SizeInBytes-1] |= 0x80;
// Copy rest of the message in constant memory
// Kopiere den Rest der Message und das Padding ins Constant Memory
cudaMemcpyToSymbol( c_PaddedMessage, cudaMemcpyToSymbol( c_PaddedMessage,
temp, temp,
cKeccakR_SizeInBytes, cKeccakR_SizeInBytes,
0, cudaMemcpyHostToDevice); 0, cudaMemcpyHostToDevice);
} }
__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) __global__
void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
	// One thread per nonce; surplus threads of the last block leave immediately.
	const uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
	if (tid >= threads)
		return;

	const uint32_t nonce = startNounce + tid;
	const int slot = nonce - startNounce;

	// Start from the state stored in constant memory
	uint64_t lanes[25];
	#pragma unroll 25
	for (int k = 0; k < 25; k++)
		lanes[k] = c_State[k];

	// Block template from constant memory, with this thread's nonce in word 1
	uint32_t block[18];
	#pragma unroll 18
	for (int k = 0; k < 18; k++)
		block[k] = c_PaddedMessage[k];
	block[1] = cuda_swab32(nonce);

	// Absorb the block
	keccak_block(lanes, block, c_keccak_round_constants);

	// First 8 lanes -> 16 words of digest
	uint32_t digest[16];
	#pragma unroll 8
	for (int k = 0; k < 8; k++) {
		U64TO32_LE((&digest[2 * k]), lanes[k]);
	}

	// Store the 64-byte digest in this thread's output slot
	uint32_t *dst = (uint32_t*)&g_hash[8 * slot];
	#pragma unroll 16
	for (int k = 0; k < 16; k++)
		dst[k] = digest[k];
}
__host__
void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
{ {
const uint32_t threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash); jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }
/*
 * zr5 keccak, no nonce swab32
 *
 * One thread per nonce (thread t handles startNounce + t); threads with
 * t >= threads do nothing. The block template is read from c_PaddedMessage,
 * the starting state from c_State. Each thread stores a 64-byte hash at
 * g_hash[t * 8] using four 16-byte vector stores.
 *
 * NOTE(review): the uint4 stores assume g_hash itself is 16-byte aligned
 * (true for cudaMalloc'ed buffers) - confirm at the caller.
 */
__global__
void zr5_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t nounce = startNounce + thread;

		uint32_t message[18];
		#pragma unroll 18
		for (int i = 0; i < 18; i++)
			message[i] = c_PaddedMessage[i];

		// nonce is inserted as-is (no byte swap, unlike the jackpot kernel)
		message[1] = nounce;

		// Get mid-state
		uint64_t keccak_gpu_state[25];
		#pragma unroll 25
		for (int i = 0; i < 25; i++)
			keccak_gpu_state[i] = c_State[i];

		keccak_block(keccak_gpu_state, message, c_keccak_round_constants);

		// The array is read back through a uint4 pointer below, so it must be
		// 16-byte aligned; a plain uint32_t array only guarantees 4 bytes.
		__align__(16) uint32_t hash[16];
		#pragma unroll 8
		for (int i = 0; i < 8; i++) {
			U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]);
		}

		// Output (64 bytes hash required).
		// nounce - startNounce == thread; index in size_t so that the
		// offset cannot wrap in 32 bits for very large launches.
		size_t hashPosition = (size_t) thread;
		uint4 *outpHash = (uint4*) (&g_hash[hashPosition * 8]);
		const uint4 *psrc = (const uint4*) hash;
		outpHash[0] = psrc[0];
		outpHash[1] = psrc[1];
		outpHash[2] = psrc[2];
		outpHash[3] = psrc[3];
	}
}
// Host launcher for zr5_keccak512_gpu_hash: 256 threads per block, enough
// blocks to cover `threads`, then MyStreamSynchronize with order 0.
__host__
void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash)
{
	const uint32_t tpb = 256;

	const dim3 block(tpb);
	const dim3 grid((threads + tpb - 1) / tpb); // ceil(threads / tpb)

	uint64_t *d_hash64 = (uint64_t*) d_hash;

	zr5_keccak512_gpu_hash<<<grid, block>>>(threads, startNounce, d_hash64);

	MyStreamSynchronize(NULL, 0, thr_id);
}
/*
 * required for the second hash part of zr5
 *
 * One thread per nonce (thread t handles startNounce + t); threads with
 * t >= threads do nothing. For each thread:
 *  - word 0 of the message is built from `version` and the masked
 *    (POK_DATA_MASK) first word of the hash already stored at g_hash[t * 16];
 *    its upper 16 bits are saved to d_pokh[t];
 *  - words 1..17 come from d_OriginalData and are absorbed as a first block
 *    into an all-zero state;
 *  - a second block carries d_OriginalData[18], the nonce and the padding;
 *  - the resulting 64-byte hash overwrites g_hash[t * 16 .. t * 16 + 15].
 *
 * NOTE(review): the uint4 stores assume g_hash itself is 16-byte aligned
 * (true for cudaMalloc'ed buffers) - confirm at the caller.
 */
__global__
void zr5_keccak512_gpu_hash_pok(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint16_t *d_pokh, uint32_t version)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t nounce = startNounce + thread;

		// 16 words (64 bytes) per thread; offset computed in size_t so it
		// cannot wrap in 32 bits for very large launches.
		const size_t hashOffset = (size_t) thread * 16;
		uint32_t *prevHash = &g_hash[hashOffset];

		uint32_t message[18]; /* 72 bytes */

		// pok - hash[0] from prev hash
		message[0] = version | (prevHash[0] & POK_DATA_MASK);
		// save pok (upper 16 bits of word 0)
		d_pokh[thread] = (uint16_t) (message[0] / 0x10000);

		for (int i = 1; i < 18; i++) {
			message[i] = d_OriginalData[i];
		}

		// first bloc
		uint64_t keccak_gpu_state[25] = { 0 };
		keccak_block(keccak_gpu_state, message, c_keccak_round_constants);

		// second bloc: remaining data word, nonce (not byte-swapped), then padding
		message[0] = d_OriginalData[18];
		message[1] = nounce; //cuda_swab32(nounce);
		message[2] = 1;

		#pragma unroll
		for (int i = 3; i < 17; i++)
			message[i] = 0;

		message[17] = 0x80000000UL;
		keccak_block(keccak_gpu_state, message, c_keccak_round_constants);

		// The array is read back through a uint4 pointer below, so it must be
		// 16-byte aligned; a plain uint32_t array only guarantees 4 bytes.
		__align__(16) uint32_t hash[16];
		#pragma unroll 8
		for (size_t i = 0; i < 8; i++) {
			U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]);
		}

		// store the 64-byte result as four 16-byte vectors
		uint4 *outpHash = (uint4*) (&g_hash[hashOffset]);
		const uint4 *psrc = (const uint4*) hash;
		outpHash[0] = psrc[0];
		outpHash[1] = psrc[1];
		outpHash[2] = psrc[2];
		outpHash[3] = psrc[3];
	}
}
// Host launcher for zr5_keccak512_gpu_hash_pok.
// Uploads pdata into the d_OriginalData constant array, derives the version
// bits from pdata[0] (everything outside POK_DATA_MASK), launches the kernel
// with 256 threads per block and then calls MyStreamSynchronize with order 10.
__host__
void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks)
{
	const uint32_t tpb = 256;

	const dim3 block(tpb);
	const dim3 grid((threads + tpb - 1) / tpb); // ceil(threads / tpb)

	// keep only the bits that are not part of the pok field
	const uint32_t version = pdata[0] & (~POK_DATA_MASK);

	cudaMemcpyToSymbol(d_OriginalData, pdata, sizeof(d_OriginalData), 0, cudaMemcpyHostToDevice);

	zr5_keccak512_gpu_hash_pok<<<grid, block>>>(threads, startNounce, d_hash, d_poks, version);

	MyStreamSynchronize(NULL, 10, thr_id);
}

2
Makefile.am

@ -41,7 +41,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
quark/quarkcoin.cu quark/animecoin.cu \ quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \ quark/cuda_quark_compactionTest.cu \
cuda_nist5.cu pentablake.cu \ cuda_nist5.cu pentablake.cu zr5.cu \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \

12
README.txt

@ -1,5 +1,5 @@
ccMiner release 1.6.0-tpruvot (Mar 2015) - "Pluck & Whirlpoolx" ccMiner release 1.6.0-tpruvot (Mar 2015) - "ZR5, Pluck & WhirlX"
--------------------------------------------------------------- ---------------------------------------------------------------
*************************************************************** ***************************************************************
@ -37,8 +37,9 @@ BlakeCoin (256 8-rounds)
Keccak (Maxcoin) Keccak (Maxcoin)
Deep, Doom and Qubit Deep, Doom and Qubit
Pentablake (Blake 512 x5) Pentablake (Blake 512 x5)
S3 (OneCoin) 1Coin Triple S
Lyra2RE (new VertCoin algo) Vertcoin Lyra2RE
Ziftrcoin (ZR5)
where some of these coins have a VERY NOTABLE nVidia advantage where some of these coins have a VERY NOTABLE nVidia advantage
over competing AMD (OpenCL Only) implementations. over competing AMD (OpenCL Only) implementations.
@ -84,6 +85,7 @@ its command line interface and options.
x14 use to mine X14Coin x14 use to mine X14Coin
x15 use to mine Halcyon x15 use to mine Halcyon
x17 use to mine X17 x17 use to mine X17
zr5 use to mine ZiftrCoin
-d, --devices gives a comma separated list of CUDA device IDs -d, --devices gives a comma separated list of CUDA device IDs
to operate on. Device IDs start counting from 0! to operate on. Device IDs start counting from 0!
@ -183,12 +185,12 @@ features.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
Mar. 2015 v1.6.0 (Note for CryptoMiningBlog: NOT YET RELEASED/FINISHED!) Mar. 27th 2015 v1.6.0
Add the ZR5 Algo for Ziftrcoin
Import pluck (djm34) and whirlpoolx (alexis78) algos Import pluck (djm34) and whirlpoolx (alexis78) algos
Hashrate units based on hashing rate values (Hs/kHs/MHs/GHs) Hashrate units based on hashing rate values (Hs/kHs/MHs/GHs)
Default config file (also help to debug without command line) Default config file (also help to debug without command line)
Various small fixes Various small fixes
More to come soon...
Feb. 11th 2015 v1.5.3 Feb. 11th 2015 v1.5.3
Fix anime algo Fix anime algo

74
ccminer.cpp

@ -108,6 +108,7 @@ enum sha_algos {
ALGO_X14, ALGO_X14,
ALGO_X15, ALGO_X15,
ALGO_X17, ALGO_X17,
ALGO_ZR5,
}; };
static const char *algo_names[] = { static const char *algo_names[] = {
@ -140,6 +141,7 @@ static const char *algo_names[] = {
"x14", "x14",
"x15", "x15",
"x17", "x17",
"zr5",
}; };
bool opt_debug = false; bool opt_debug = false;
@ -166,7 +168,7 @@ static const bool opt_time = true;
static enum sha_algos opt_algo = ALGO_X11; static enum sha_algos opt_algo = ALGO_X11;
int opt_n_threads = 0; int opt_n_threads = 0;
int opt_affinity = -1; int opt_affinity = -1;
int opt_priority = 3; int opt_priority = 0;
static double opt_difficulty = 1; // CH static double opt_difficulty = 1; // CH
bool opt_trust_pool = false; bool opt_trust_pool = false;
uint16_t opt_vote = 9999; uint16_t opt_vote = 9999;
@ -193,6 +195,7 @@ int api_thr_id = -1;
bool stratum_need_reset = false; bool stratum_need_reset = false;
struct work_restart *work_restart = NULL; struct work_restart *work_restart = NULL;
struct stratum_ctx stratum = { 0 }; struct stratum_ctx stratum = { 0 };
uint32_t zr5_pok = 0;
pthread_mutex_t applog_lock; pthread_mutex_t applog_lock;
static pthread_mutex_t stats_lock; static pthread_mutex_t stats_lock;
@ -254,6 +257,7 @@ Options:\n\
x17 X17 (peoplecurrency)\n\ x17 X17 (peoplecurrency)\n\
whirl Whirlcoin (old whirlpool)\n\ whirl Whirlcoin (old whirlpool)\n\
whirlpoolx Vanilla coin\n\ whirlpoolx Vanilla coin\n\
zr5 ZR5 (ZiftrCoin)\n\
-d, --devices Comma separated list of CUDA devices to use.\n\ -d, --devices Comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\ Device IDs start counting from 0! Alternatively takes\n\
string names of your cards like gtx780ti or gt640#2\n\ string names of your cards like gtx780ti or gt640#2\n\
@ -472,6 +476,10 @@ static bool work_decode(const json_t *val, struct work *work)
int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target); int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target);
int i; int i;
if (opt_algo == ALGO_ZR5) {
data_size = 80; adata_sz = 20;
}
if (unlikely(!jobj_binary(val, "data", work->data, data_size))) { if (unlikely(!jobj_binary(val, "data", work->data, data_size))) {
applog(LOG_ERR, "JSON inval data"); applog(LOG_ERR, "JSON inval data");
return false; return false;
@ -564,12 +572,12 @@ static int share_result(int result, const char *reason)
if (reason) { if (reason) {
applog(LOG_WARNING, "reject reason: %s", reason); applog(LOG_WARNING, "reject reason: %s", reason);
if (strncmp(reason, "low difficulty share", 20) == 0) { if (strncasecmp(reason, "low difficulty", 14) == 0) {
opt_difficulty = (opt_difficulty * 2.0) / 3.0; opt_difficulty = (opt_difficulty * 2.0) / 3.0;
applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty); applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty);
return 0; return 0;
} }
if (strncmp(reason, "Duplicate share", 15) == 0 && !check_dups) { if (strncasecmp(reason, "duplicate", 9) == 0 && !check_dups) {
applog(LOG_WARNING, "enabling duplicates check feature"); applog(LOG_WARNING, "enabling duplicates check feature");
check_dups = true; check_dups = true;
} }
@ -603,7 +611,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
} }
} }
if (stale_work) { if (opt_algo == ALGO_ZR5 && !stale_work) {
stale_work = (memcmp(&work->data[1], &g_work.data[1], 68));
}
if (!submit_old && stale_work) {
if (opt_debug) if (opt_debug)
applog(LOG_WARNING, "stale work detected, discarding"); applog(LOG_WARNING, "stale work detected, discarding");
return true; return true;
@ -616,9 +628,16 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
uint16_t nvote; uint16_t nvote;
char *ntimestr, *noncestr, *xnonce2str, *nvotestr; char *ntimestr, *noncestr, *xnonce2str, *nvotestr;
switch (opt_algo) {
case ALGO_ZR5:
check_dups = true;
be32enc(&ntime, work->data[17]);
be32enc(&nonce, work->data[19]);
break;
default:
le32enc(&ntime, work->data[17]); le32enc(&ntime, work->data[17]);
le32enc(&nonce, work->data[19]); le32enc(&nonce, work->data[19]);
}
noncestr = bin2hex((const uchar*)(&nonce), 4); noncestr = bin2hex((const uchar*)(&nonce), 4);
if (check_dups) if (check_dups)
@ -666,14 +685,21 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
} else { } else {
int data_size = sizeof(work->data);
int adata_sz = ARRAY_SIZE(work->data);
/* build hex string */ /* build hex string */
char *str = NULL; char *str = NULL;
if (opt_algo == ALGO_ZR5) {
data_size = 80; adata_sz = 20;
}
if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) { if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) {
for (int i = 0; i < ARRAY_SIZE(work->data); i++) for (int i = 0; i < adata_sz; i++)
le32enc(work->data + i, work->data[i]); le32enc(work->data + i, work->data[i]);
} }
str = bin2hex((uchar*)work->data, sizeof(work->data)); str = bin2hex((uchar*)work->data, data_size);
if (unlikely(!str)) { if (unlikely(!str)) {
applog(LOG_ERR, "submit_upstream_work OOM"); applog(LOG_ERR, "submit_upstream_work OOM");
return false; return false;
@ -1098,10 +1124,18 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
work->data[17] = le32dec(sctx->job.ntime); work->data[17] = le32dec(sctx->job.ntime);
work->data[18] = le32dec(sctx->job.nbits); work->data[18] = le32dec(sctx->job.nbits);
if (opt_algo == ALGO_MJOLLNIR || opt_algo == ALGO_HEAVY)
{ switch (opt_algo) {
case ALGO_MJOLLNIR:
case ALGO_HEAVY:
// todo: check if 19 is enough
for (i = 0; i < 20; i++) for (i = 0; i < 20; i++)
work->data[i] = be32dec((uint32_t *)&work->data[i]); work->data[i] = be32dec((uint32_t *)&work->data[i]);
break;
case ALGO_ZR5:
for (i = 0; i < 19; i++)
work->data[i] = be32dec((uint32_t *)&work->data[i]);
break;
} }
work->data[20] = 0x80000000; work->data[20] = 0x80000000;
@ -1227,6 +1261,7 @@ static void *miner_thread(void *userdata)
// &work.data[19] // &work.data[19]
int wcmplen = 76; int wcmplen = 76;
int wcmpoft = 0;
uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
if (have_stratum) { if (have_stratum) {
@ -1284,7 +1319,14 @@ static void *miner_thread(void *userdata)
hashlog_purge_job(work.job_id); hashlog_purge_job(work.job_id);
} }
} }
if (memcmp(work.data, g_work.data, wcmplen)) {
if (opt_algo == ALGO_ZR5) {
// ignore pok/version header
wcmpoft = 1;
wcmplen -= 4;
}
if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) {
#if 0 #if 0
if (opt_debug) { if (opt_debug) {
for (int n=0; n <= (wcmplen-8); n+=8) { for (int n=0; n <= (wcmplen-8); n+=8) {
@ -1510,6 +1552,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done); max_nonce, &hashes_done);
break; break;
case ALGO_ZR5: {
rc = scanhash_zr5(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
}
default: default:
/* should never happen */ /* should never happen */
goto out; goto out;
@ -1606,6 +1653,11 @@ static void *miner_thread(void *userdata)
if (rc > 1 && work.data[21]) { if (rc > 1 && work.data[21]) {
work.data[19] = work.data[21]; work.data[19] = work.data[21];
work.data[21] = 0; work.data[21] = 0;
if (opt_algo == ALGO_ZR5) {
// todo: use + 4..6 index for pok to allow multiple nonces
work.data[0] = work.data[22]; // pok
work.data[22] = 0;
}
if (!submit_work(mythr, &work)) if (!submit_work(mythr, &work))
break; break;
} }
@ -1675,10 +1727,10 @@ start:
submit_old = soval ? json_is_true(soval) : false; submit_old = soval ? json_is_true(soval) : false;
pthread_mutex_lock(&g_work_lock); pthread_mutex_lock(&g_work_lock);
if (work_decode(json_object_get(val, "result"), &g_work)) { if (work_decode(json_object_get(val, "result"), &g_work)) {
restart_threads();
if (!opt_quiet) if (!opt_quiet)
applog(LOG_BLUE, "%s detected new block", short_url); applog(LOG_BLUE, "%s detected new block", short_url);
g_work_time = time(NULL); g_work_time = time(NULL);
restart_threads();
} }
pthread_mutex_unlock(&g_work_lock); pthread_mutex_unlock(&g_work_lock);
json_decref(val); json_decref(val);

12
ccminer.vcxproj

@ -105,8 +105,9 @@
<CInterleavedPTX>false</CInterleavedPTX> <CInterleavedPTX>false</CInterleavedPTX>
<MaxRegCount>80</MaxRegCount> <MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep> <Keep>true</Keep>
<CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration> <CodeGeneration>compute_50,sm_50</CodeGeneration>
<GenerateLineInfo>true</GenerateLineInfo>
</CudaCompile> </CudaCompile>
</ItemDefinitionGroup> </ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@ -176,8 +177,8 @@
<CInterleavedPTX>false</CInterleavedPTX> <CInterleavedPTX>false</CInterleavedPTX>
<MaxRegCount>80</MaxRegCount> <MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep> <Keep>true</Keep>
<CodeGeneration>compute_50,sm_50;</CodeGeneration> <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52</CodeGeneration>
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
<Defines> <Defines>
</Defines> </Defines>
@ -221,7 +222,7 @@
<MaxRegCount>80</MaxRegCount> <MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep> <Keep>false</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration> <CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
<Include> <Include>
</Include> </Include>
<TargetMachinePlatform>64</TargetMachinePlatform> <TargetMachinePlatform>64</TargetMachinePlatform>
@ -355,6 +356,7 @@
</CudaCompile> </CudaCompile>
<CudaCompile Include="cuda_nist5.cu"> <CudaCompile Include="cuda_nist5.cu">
</CudaCompile> </CudaCompile>
<CudaCompile Include="zr5.cu" />
<CudaCompile Include="groestl_functions_quad.cu"> <CudaCompile Include="groestl_functions_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild> <ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile> </CudaCompile>

9
ccminer.vcxproj.filters

@ -379,9 +379,6 @@
<CudaCompile Include="cuda_groestlcoin.cu"> <CudaCompile Include="cuda_groestlcoin.cu">
<Filter>Source Files\CUDA</Filter> <Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="bitslice_transformations_quad.cu"> <CudaCompile Include="bitslice_transformations_quad.cu">
<Filter>Source Files\CUDA</Filter> <Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
@ -571,6 +568,12 @@
<CudaCompile Include="lyra2\lyra2RE.cu"> <CudaCompile Include="lyra2\lyra2RE.cu">
<Filter>Source Files\CUDA</Filter> <Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="zr5.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA\quark</Filter>
</CudaCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Image Include="res\ccminer.ico"> <Image Include="res\ccminer.ico">

2
configure.ac

@ -1,4 +1,4 @@
AC_INIT([ccminer], [1.6-git]) AC_INIT([ccminer], [1.6])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

30
cpuminer-config.h

@ -63,7 +63,7 @@
#define HAVE_STRING_H 1 #define HAVE_STRING_H 1
/* Define to 1 if you have the <syslog.h> header file. */ /* Define to 1 if you have the <syslog.h> header file. */
#define HAVE_SYSLOG_H 1 /* #undef HAVE_SYSLOG_H */
/* Define to 1 if you have the <sys/endian.h> header file. */ /* Define to 1 if you have the <sys/endian.h> header file. */
/* #undef HAVE_SYS_ENDIAN_H */ /* #undef HAVE_SYS_ENDIAN_H */
@ -108,7 +108,7 @@
/* #undef LIBCURL_FEATURE_SSPI */ /* #undef LIBCURL_FEATURE_SSPI */
/* Defined if libcurl supports DICT */ /* Defined if libcurl supports DICT */
#define LIBCURL_PROTOCOL_DICT 1 /* #undef LIBCURL_PROTOCOL_DICT */
/* Defined if libcurl supports FILE */ /* Defined if libcurl supports FILE */
#define LIBCURL_PROTOCOL_FILE 1 #define LIBCURL_PROTOCOL_FILE 1
@ -123,28 +123,28 @@
#define LIBCURL_PROTOCOL_HTTP 1 #define LIBCURL_PROTOCOL_HTTP 1
/* Defined if libcurl supports HTTPS */ /* Defined if libcurl supports HTTPS */
#define LIBCURL_PROTOCOL_HTTPS 1 /* #undef LIBCURL_PROTOCOL_HTTPS */
/* Defined if libcurl supports IMAP */ /* Defined if libcurl supports IMAP */
#define LIBCURL_PROTOCOL_IMAP 1 /* #undef LIBCURL_PROTOCOL_IMAP */
/* Defined if libcurl supports LDAP */ /* Defined if libcurl supports LDAP */
#define LIBCURL_PROTOCOL_LDAP 1 /* #undef LIBCURL_PROTOCOL_LDAP */
/* Defined if libcurl supports POP3 */ /* Defined if libcurl supports POP3 */
#define LIBCURL_PROTOCOL_POP3 1 /* #undef LIBCURL_PROTOCOL_POP3 */
/* Defined if libcurl supports RTSP */ /* Defined if libcurl supports RTSP */
#define LIBCURL_PROTOCOL_RTSP 1 /* #undef LIBCURL_PROTOCOL_RTSP */
/* Defined if libcurl supports SMTP */ /* Defined if libcurl supports SMTP */
#define LIBCURL_PROTOCOL_SMTP 1 /* #undef LIBCURL_PROTOCOL_SMTP */
/* Defined if libcurl supports TELNET */ /* Defined if libcurl supports TELNET */
#define LIBCURL_PROTOCOL_TELNET 1 /* #undef LIBCURL_PROTOCOL_TELNET */
/* Defined if libcurl supports TFTP */ /* Defined if libcurl supports TFTP */
#define LIBCURL_PROTOCOL_TFTP 1 /* #undef LIBCURL_PROTOCOL_TFTP */
/* Define to 1 if your C compiler doesn't accept -c and -o together. */ /* Define to 1 if your C compiler doesn't accept -c and -o together. */
/* #undef NO_MINUS_C_MINUS_O */ /* #undef NO_MINUS_C_MINUS_O */
@ -159,16 +159,16 @@
#define PACKAGE_NAME "ccminer" #define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */ /* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 1.6-git" #define PACKAGE_STRING "ccminer 1.6"
/* Define to the one symbol short name of this package. */ /* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "ccminer" #define PACKAGE_TARNAME "ccminer"
/* Define to the home page for this package. */ /* Define to the home page for this package. */
#define PACKAGE_URL "" #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
/* Define to the version of this package. */ /* Define to the version of this package. */
#define PACKAGE_VERSION "1.6-git" #define PACKAGE_VERSION "1.6"
/* If using the C implementation of alloca, define if you know the /* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be direction of stack growth for your system; otherwise it will be
@ -191,10 +191,10 @@
#define USE_XOP 1 #define USE_XOP 1
/* Version number of package */ /* Version number of package */
#define VERSION "1.6-git" #define VERSION "1.6"
/* Define curl_free() as free() if our version of curl lacks curl_free. */ /* Define curl_free() as free() if our version of curl lacks curl_free. */
/* #undef curl_free */ /* #undef curl_free */
/* Define to `unsigned int' if <sys/types.h> does not define. */ /* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */ //#define size_t unsigned int

5
miner.h

@ -378,6 +378,10 @@ extern int scanhash_whirlpoolx(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
extern int scanhash_zr5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
/* api related */ /* api related */
void *api_thread(void *userdata); void *api_thread(void *userdata);
void api_set_throughput(int thr_id, uint32_t throughput); void api_set_throughput(int thr_id, uint32_t throughput);
@ -679,6 +683,7 @@ void x13hash(void *output, const void *input);
void x14hash(void *output, const void *input); void x14hash(void *output, const void *input);
void x15hash(void *output, const void *input); void x15hash(void *output, const void *input);
void x17hash(void *output, const void *input); void x17hash(void *output, const void *input);
void zr5hash(void *output, const void *input);
#ifdef __cplusplus #ifdef __cplusplus
} }

15
quark/cuda_quark_groestl512.cu

@ -52,11 +52,22 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u
uint32_t hash[16]; uint32_t hash[16];
from_bitslice_quad(state, hash); from_bitslice_quad(state, hash);
if (thr == 0)
{ // uint4 = 4x4 uint32_t = 16 bytes
if (thr == 0) {
uint4 *phash = (uint4*) hash;
uint4 *outpt = (uint4*) outpHash; /* var kept for hash align */
outpt[0] = phash[0];
outpt[1] = phash[1];
outpt[2] = phash[2];
outpt[3] = phash[3];
}
/*
if (thr == 0) {
#pragma unroll #pragma unroll
for(int k=0;k<16;k++) outpHash[k] = hash[k]; for(int k=0;k<16;k++) outpHash[k] = hash[k];
} }
*/
} }
#endif #endif
} }

39
util.cpp

@ -1660,12 +1660,35 @@ extern void applog_hash(uchar *hash)
#define printpfx(n,h) \ #define printpfx(n,h) \
printf("%s%11s%s: %s\n", CL_GRN, n, CL_N, format_hash(s, h)) printf("%s%11s%s: %s\n", CL_GRN, n, CL_N, format_hash(s, h))
static uint32_t zrtest[20] = {
swab32(0x01806486),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x2ab03251),
swab32(0x87d4f28b),
swab32(0x6e22f086),
swab32(0x4845ddd5),
swab32(0x0ac4e6aa),
swab32(0x22a1709f),
swab32(0xfb4275d9),
swab32(0x25f26636),
swab32(0x300eed54),
swab32(0xffff0f1e),
swab32(0x2a9e2300),
};
void do_gpu_tests(void) void do_gpu_tests(void)
{ {
#ifdef _DEBUG #ifdef _DEBUG
unsigned long done; unsigned long done;
char s[128] = { '\0' }; char s[128] = { '\0' };
uchar buf[128]; uchar buf[160];
uint32_t tgt[8] = { 0 }; uint32_t tgt[8] = { 0 };
opt_tracegpu = true; opt_tracegpu = true;
@ -1674,11 +1697,15 @@ void do_gpu_tests(void)
tgt[7] = 0xffff; tgt[7] = 0xffff;
memset(buf, 0, sizeof buf); memset(buf, 0, sizeof buf);
scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done); //memcpy(buf, zrtest, 80);
scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done);
//memset(buf, 0, sizeof buf);
//scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
memset(buf, 0, sizeof buf); memset(buf, 0, sizeof buf);
// buf[0] = 1; buf[64] = 2; // for endian tests // buf[0] = 1; buf[64] = 2; // for endian tests
scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14); //scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
//memset(buf, 0, sizeof buf); //memset(buf, 0, sizeof buf);
//scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84 //scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84
@ -1688,6 +1715,7 @@ void do_gpu_tests(void)
opt_tracegpu = false; opt_tracegpu = false;
#endif #endif
} }
extern "C" void zr5hash_pok(void *output, uint32_t *pdata);
void print_hash_tests(void) void print_hash_tests(void)
{ {
@ -1782,6 +1810,11 @@ void print_hash_tests(void)
x17hash(&hash[0], &buf[0]); x17hash(&hash[0], &buf[0]);
printpfx("X17", hash); printpfx("X17", hash);
//memcpy(buf, zrtest, 80);
zr5hash(&hash[0], &buf[0]);
//zr5hash_pok(&hash[0], (uint32_t*) &buf[0]);
printpfx("ZR5", hash);
printf("\n"); printf("\n");
do_gpu_tests(); do_gpu_tests();

342
zr5.cu

@ -0,0 +1,342 @@
/* Ziftrcoin ZR5 CUDA Implementation, (c) tpruvot 2015 */
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include <stdio.h>
#include <memory.h>
// Algorithm identifiers. They are the values stored in permut[][] and are
// also used as indexes into the { blake, groestl, jh512, skein } buffer
// arrays built inside the GPU copy kernels.
#define ZR_BLAKE 0
#define ZR_GROESTL 1
#define ZR_JH512 2
#define ZR_SKEIN 3
// Bit 15 of the first header word.
// NOTE(review): not referenced by the functions below - confirm where it is used.
#define POK_BOOL_MASK 0x00008000
// Upper 16 bits of the first header word: they carry the PoK data, the
// lower 16 bits being kept as the version (see zr5hash_pok).
#define POK_DATA_MASK 0xFFFF0000
// Per-GPU-thread buffers, indexed by thr_id and allocated once in scanhash_zr5.
// d_hash: device buffer, 64 bytes per nonce (keccak output, later the final hash words)
static uint32_t* d_hash[MAX_GPUS];
// d_pokh: device buffer, one 16-bit value per nonce
static uint16_t* d_pokh[MAX_GPUS];
// h_poks: host copy of d_pokh (allocated with cudaMallocHost)
static uint16_t* h_poks[MAX_GPUS];
// One 64-bytes-per-nonce device work buffer for each of the four algorithms
static uint32_t* d_blake[MAX_GPUS];
static uint32_t* d_groes[MAX_GPUS];
static uint32_t* d_jh512[MAX_GPUS];
static uint32_t* d_skein[MAX_GPUS];
// Device-side copy of permut[], uploaded with cudaMemcpyToSymbol in scanhash_zr5
__constant__ uint8_t d_permut[24][4];
// The 24 possible orderings of the four algorithms (4! permutations).
// The row is selected by (first 32-bit word of the keccak hash) % 24.
static const uint8_t permut[24][4] = {
{0, 1, 2, 3},
{0, 1, 3, 2},
{0, 2, 1, 3},
{0, 2, 3, 1},
{0, 3, 1, 2},
{0, 3, 2, 1},
{1, 0, 2, 3},
{1, 0, 3, 2},
{1, 2, 0, 3},
{1, 2, 3, 0},
{1, 3, 0, 2},
{1, 3, 2, 0},
{2, 0, 1, 3},
{2, 0, 3, 1},
{2, 1, 0, 3},
{2, 1, 3, 0},
{2, 3, 0, 1},
{2, 3, 1, 0},
{3, 0, 1, 2},
{3, 0, 2, 1},
{3, 1, 0, 2},
{3, 1, 2, 0},
{3, 2, 0, 1},
{3, 2, 1, 0}
};
// CPU HASH
/**
 * CPU implementation of the ZR5 hash.
 *
 * The 80-byte input is hashed with keccak512; the first 32-bit word of that
 * digest selects one of the 24 rows of permut[], which gives the order in
 * which blake512, groestl512, jh512 and skein512 are chained (each one
 * hashing the 64-byte output of the previous stage).
 *
 * @param output receives the first 32 bytes of the final 64-byte digest
 * @param input  80 bytes of data to hash
 */
extern "C" void zr5hash(void *output, const void *input)
{
	uchar _ALIGN(64) hash[64];
	uint32_t *words = (uint32_t *) hash;

	// stage 0: keccak512 over the 80-byte input
	{
		sph_keccak512_context keccak;
		sph_keccak512_init(&keccak);
		sph_keccak512(&keccak, (const void*) input, 80);
		sph_keccak512_close(&keccak, (void*) words);
	}

	// the order is fixed by the keccak digest, before any further hashing
	const uint8_t *order = permut[words[0] % ARRAY_SIZE(permut)]; /* % 24 */

	for (int step = 0; step < 4; step++)
	{
		const uint8_t algo = order[step];

		if (algo == ZR_BLAKE) {
			sph_blake512_context blake;
			sph_blake512_init(&blake);
			sph_blake512(&blake, (const void*) words, 64);
			sph_blake512_close(&blake, words);
		}
		else if (algo == ZR_GROESTL) {
			sph_groestl512_context groestl;
			sph_groestl512_init(&groestl);
			sph_groestl512(&groestl, (const void*) words, 64);
			sph_groestl512_close(&groestl, words);
		}
		else if (algo == ZR_JH512) {
			sph_jh512_context jh;
			sph_jh512_init(&jh);
			sph_jh512(&jh, (const void*) words, 64);
			sph_jh512_close(&jh, words);
		}
		else if (algo == ZR_SKEIN) {
			sph_skein512_context skein;
			sph_skein512_init(&skein);
			sph_skein512(&skein, (const void*) words, 64);
			sph_skein512_close(&skein, words);
		}
		// any other value: nothing to do (permut[] only holds 0..3)
	}

	// only the first 256 bits are returned
	memcpy(output, words, 32);
}
/**
 * ZR5 hash including the PoK step (CPU).
 *
 * The data is hashed a first time with the upper 16 bits of pdata[0]
 * cleared, then those bits are replaced by the upper 16 bits of the first
 * word of that hash, and the data is hashed again.
 *
 * @param output receives the 32-byte result of the second hash
 * @param pdata  block data to hash; pdata[0] is MODIFIED in place and holds
 *               version | PoK bits on return
 */
extern "C" void zr5hash_pok(void *output, uint32_t *pdata)
{
	uint32_t _ALIGN(64) digest[8];

	// keep only the version part of the first word
	const uint32_t version = pdata[0] & (~POK_DATA_MASK);

	// first pass, without any PoK data
	pdata[0] = version;
	zr5hash(digest, pdata);

	// fill PoK with the top 16 bits of the first pass, then hash again
	const uint32_t pok_bits = digest[0] & POK_DATA_MASK;
	pdata[0] = version | pok_bits;
	zr5hash(digest, pdata);

	memcpy(output, digest, 32);
}
/**
 * Copy each thread's 64-byte hash into the work buffer of the algorithm
 * which has to process it during round `rnd`.
 *
 * One thread per nonce; launched on a 1D grid by zr5_move_data_to_hash.
 * The algorithm order of a nonce is read from d_hash (first word % 24).
 * For round 0 the source is d_hash itself; for later rounds it is the
 * buffer of the algorithm which ran in the previous round.
 *
 * All buffers are accessed as uint4, so they must be 16-byte aligned and
 * hold 64 bytes per thread.
 *
 * @param threads number of nonces (threads above this index do nothing)
 * @param d_hash  keccak hashes, 16 words per thread
 * @param d_blake,d_groes,d_jh512,d_skein per-algorithm work buffers
 * @param rnd     round index, 0..3
 */
__global__
void zr5_copy_round_data_gpu(uint32_t threads, uint32_t *d_hash, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, int rnd)
{
	// copy 64 bytes hash in the right algo buffer
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// widen BEFORE multiplying: the product was previously computed in
		// 32 bits and only then stored in a 64-bit variable
		const uint64_t offset = (uint64_t) thread * 16; // 64 bytes / 4
		uint32_t *phash = &d_hash[offset];

		// algos hash order
		uint32_t norder = phash[0] % ARRAY_SIZE(permut);
		uint32_t algo = d_permut[norder][rnd];
		uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };

		if (rnd > 0) {
			// source is the output of the previous round's algorithm
			int algosrc = d_permut[norder][rnd - 1];
			phash = buffers[algosrc] + offset;
		}

		// uint4 = 4x4 uint32_t = 16 bytes
		uint4 *psrc = (uint4*) phash;
		uint4 *pdst = (uint4*) (buffers[algo] + offset);
		pdst[0] = psrc[0];
		pdst[1] = psrc[1];
		pdst[2] = psrc[2];
		pdst[3] = psrc[3];
	}
}
/**
 * Host launcher for zr5_copy_round_data_gpu.
 *
 * Launches one thread per nonce in blocks of 128 threads, on the default
 * stream, using the buffers owned by GPU thread `thr_id`.
 *
 * @param thr_id  index into the per-GPU buffer arrays
 * @param threads number of nonces to process
 * @param rnd     round index (0..3) forwarded to the kernel
 */
__host__
void zr5_move_data_to_hash(int thr_id, uint32_t threads, int rnd)
{
	const uint32_t tpb = 128;
	// ceil(threads / tpb) blocks
	const uint32_t nblocks = (threads + tpb - 1) / tpb;

	dim3 grid(nblocks);
	dim3 block(tpb);

	zr5_copy_round_data_gpu <<<grid, block>>> (
		threads,
		d_hash[thr_id],
		d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id],
		rnd
	);
}
/**
 * After the 4 algorithm rounds, copy the needed words of the final hash
 * back into d_hash.
 *
 * One thread per nonce; launched on a 1D grid by zr5_final_round.
 * The source buffer is the one of the algorithm which ran last
 * (d_permut[order][3]), the order being read from d_hash word 0 BEFORE
 * that word is overwritten. Only word 0 and words 6..7 are copied.
 *
 * @param threads number of nonces (threads above this index do nothing)
 * @param d_blake,d_groes,d_jh512,d_skein per-algorithm work buffers
 * @param d_hash  destination, 16 words per thread
 * @param d_pokh  unused by this kernel (kept for the existing launch signature)
 */
__global__
void zr5_final_round_data_gpu(uint32_t threads, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, uint32_t *d_hash, uint16_t *d_pokh)
{
	// after the 4 algos rounds, copy back hash to d_hash
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// widen BEFORE multiplying so the product is really 64-bit
		const uint64_t offset = (uint64_t) thread * 16; // 64 / 4;
		uint32_t *phash = &d_hash[offset];

		uint16_t norder = phash[0] % ARRAY_SIZE(permut);
		uint16_t algosrc = d_permut[norder][3];

		uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };

		// copy only hash[0] + hash[6..7]
		uint2 *psrc = (uint2*) (buffers[algosrc] + offset);
		uint2 *pdst = (uint2*) phash;
		pdst[0].x = psrc[0].x; // word 0
		pdst[3] = psrc[3];     // words 6 and 7
		//phash[7] = *(buffers[algosrc] + offset + 7);
	}
}
/**
 * Host launcher for zr5_final_round_data_gpu.
 *
 * Launches one thread per nonce in blocks of 128 threads, on the default
 * stream, using the buffers owned by GPU thread `thr_id`.
 *
 * @param thr_id  index into the per-GPU buffer arrays
 * @param threads number of nonces to process
 */
__host__
void zr5_final_round(int thr_id, uint32_t threads)
{
	const uint32_t tpb = 128;
	// ceil(threads / tpb) blocks
	const uint32_t nblocks = (threads + tpb - 1) / tpb;

	dim3 grid(nblocks);
	dim3 block(tpb);

	zr5_final_round_data_gpu <<<grid, block>>> (
		threads,
		d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id],
		d_hash[thr_id],
		d_pokh[thr_id]
	);
}
// Keccak512 GPU functions, defined in another compilation unit
// (presumably JHA/cuda_jha_keccak512.cu - TODO confirm).
extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
extern void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks);
// 64-byte input GPU hash functions, defined in other compilation units.
// scanhash_zr5 always passes NULL as d_nonceVector.
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
// Per-GPU-thread flag: set to true by scanhash_zr5 once its buffers are allocated
static bool init[MAX_GPUS] = { 0 };
/**
 * Scan a nonce range for ZR5 on the GPU.
 *
 * For each batch of `throughput` nonces starting at pdata[19]:
 *  1. keccak512 + the 4 permuted algorithms are run with the PoK bits cleared,
 *  2. keccak512 is run again through zr5_keccak512_cpu_hash_pok, followed
 *     by the 4 permuted algorithms,
 *  3. the resulting hashes are checked against the target, and candidates
 *     are re-validated on the CPU with zr5hash().
 *
 * @param thr_id      GPU thread index (selects device and buffers)
 * @param pdata       block header words; pdata[19] is the nonce and is updated.
 *                    On success pdata[0] holds version | pok, and a second
 *                    valid nonce (if any) is stored in pdata[21] with its
 *                    pok word in pdata[22]
 * @param ptarget     8-word target
 * @param max_nonce   scan stops once pdata[19] reaches this value
 * @param hashes_done receives the number of nonces processed
 * @return number of valid nonces found (0, 1 or 2)
 */
extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(64) tmpdata[20];
	const uint32_t version = pdata[0] & (~POK_DATA_MASK);
	const uint32_t first_nonce = pdata[19];

	// Largest batch this device will ever be asked to process. The buffers
	// are allocated only once (first call), so they must be sized with this
	// value and NOT with the per-call clamped one: a first call with a small
	// nonce range would otherwise leave them too small for later calls.
	uint32_t max_throughput = device_intensity(thr_id, __func__, 1U << 18);
	max_throughput = min(max_throughput, (1U << 20)-1024);

	// Batch size of this call, limited to the remaining nonce range
	const uint32_t throughput = min(max_throughput, max_nonce - first_nonce);

	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0x0000ff;

	memcpy(tmpdata, pdata, 80);

	if (!init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);

		// hash buffer = keccak hash 64 required
		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * max_throughput));
		CUDA_SAFE_CALL(cudaMalloc(&d_pokh[thr_id], 2 * max_throughput));

		CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_permut, permut, 24*4, 0, cudaMemcpyHostToDevice));
		CUDA_SAFE_CALL(cudaMallocHost(&h_poks[thr_id], 2 * max_throughput));

		// data buffers for the 4 rounds
		CUDA_SAFE_CALL(cudaMalloc(&d_blake[thr_id], 64 * max_throughput));
		CUDA_SAFE_CALL(cudaMalloc(&d_groes[thr_id], 64 * max_throughput));
		CUDA_SAFE_CALL(cudaMalloc(&d_jh512[thr_id], 64 * max_throughput));
		CUDA_SAFE_CALL(cudaMalloc(&d_skein[thr_id], 64 * max_throughput));

		jackpot_keccak512_cpu_init(thr_id, max_throughput);

		quark_blake512_cpu_init(thr_id, max_throughput);
		quark_groestl512_cpu_init(thr_id, max_throughput);
		quark_jh512_cpu_init(thr_id, max_throughput);
		quark_skein512_cpu_init(thr_id, max_throughput);

		cuda_check_cpu_init(thr_id, max_throughput);

		CUDA_SAFE_CALL(cudaDeviceSynchronize());

		init[thr_id] = true;
	}

	// the first keccak pass hashes the header without PoK bits
	tmpdata[0] = version;
	jackpot_keccak512_cpu_setBlock((void*)tmpdata, 80);
	cuda_check_cpu_setTarget(ptarget);

	do {
		int order = 0;

		// Keccak512 Hash with CUDA
		zr5_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

		// Each round, every nonce is copied into the buffer of its next
		// algorithm, then all four algorithms are run on their own buffer
		for (int rnd=0; rnd<4; rnd++) {
			zr5_move_data_to_hash(thr_id, throughput, rnd);
			quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
		}

		// This generates all pok prefixes
		zr5_final_round(thr_id, throughput);

		// Keccak512 pok
		zr5_keccak512_cpu_hash_pok(thr_id, throughput, pdata[19], pdata, d_hash[thr_id], d_pokh[thr_id]);

		for (int rnd=0; rnd<4; rnd++) {
			zr5_move_data_to_hash(thr_id, throughput, rnd);
			quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
		}
		zr5_final_round(thr_id, throughput);

		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
		if (foundNonce != UINT32_MAX)
		{
			uint32_t vhash64[8];
			uint32_t oldp0 = pdata[0];
			uint32_t oldp19 = pdata[19];
			uint32_t offset = foundNonce - pdata[19];
			uint32_t pok = 0;

			*hashes_done = pdata[19] - first_nonce + throughput;

			// fetch the pok values of this batch to rebuild the header word 0
			cudaMemcpy(h_poks[thr_id], d_pokh[thr_id], 2 * throughput, cudaMemcpyDeviceToHost);
			pok = version | (0x10000UL * h_poks[thr_id][offset]);
			pdata[0] = pok; pdata[19] = foundNonce;
			zr5hash(vhash64, pdata);
			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
				int res = 1;
				// look for a second valid nonce in the same batch
				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
				if (secNonce != 0) {
					offset = secNonce - oldp19;
					pok = version | (0x10000UL * h_poks[thr_id][offset]);
					memcpy(tmpdata, pdata, 80);
					tmpdata[0] = pok; tmpdata[19] = secNonce;
					zr5hash(vhash64, tmpdata);
					if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
						pdata[21] = secNonce;
						pdata[22] = pok;
						res++;
					}
				}
				return res;
			} else {
				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);

				// resume right after the rejected nonce, with the original word 0
				pdata[19]++;
				pdata[0] = oldp0;
			}
		} else
			pdata[19] += throughput;

	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce + 1;
	return 0;
}
Loading…
Cancel
Save