From 2907a7c6786fe576f6447789c0876bf09d741ce6 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 12 Jun 2015 05:13:14 +0200 Subject: [PATCH] zr5: add support for pok mining (getwork) I dont store txs on biggest transactions to reduce memory usage. In this case, the pok bool is not enabled for the bloc... Signed-off-by: Tanguy Pruvot --- JHA/cuda_jha_keccak512.cu | 6 ++-- ccminer.cpp | 40 +++++++++++++++++++--- miner.h | 21 ++++++++++-- zr5.cu | 70 ++++++++++++++++++++++++++++++++------- 4 files changed, 114 insertions(+), 23 deletions(-) diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index 83d5757..9f91568 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -2,6 +2,7 @@ #include #include "cuda_helper.h" +#include "miner.h" // ZR5 __constant__ uint32_t d_OriginalData[20]; @@ -9,9 +10,6 @@ __constant__ uint32_t d_OriginalData[20]; __constant__ uint32_t c_PaddedMessage[18]; __constant__ uint64_t c_State[25]; -#define POK_DATA_MASK 0xFFFF0000 -#define POK_VERSION 0x1 - #define U32TO64_LE(p) \ (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) @@ -657,7 +655,7 @@ __host__ void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks) { const uint32_t threadsperblock = 256; - const uint32_t version = pdata[0] & (~POK_DATA_MASK); + const uint32_t version = (pdata[0] & (~POK_DATA_MASK)) | (usepok ? POK_BOOL_MASK : 0); dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); diff --git a/ccminer.cpp b/ccminer.cpp index d4a85f1..97ebfb8 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -246,7 +246,7 @@ bool stratum_need_reset = false; volatile bool abort_flag = false; struct work_restart *work_restart = NULL; static int app_exit_code = EXIT_CODE_OK; -uint32_t zr5_pok = 0; +int usepok = 0; pthread_mutex_t applog_lock; static pthread_mutex_t stats_lock; @@ -684,6 +684,37 @@ static bool work_decode(const json_t *val, struct work *work) if (opt_max_diff > 0. && !allow_mininginfo) calc_network_diff(work); + + work->tx_count = usepok = 0; + if (work->data[0] & POK_BOOL_MASK) { + usepok = 1; + json_t *txs = json_object_get(val, "txs"); + if (txs && json_is_array(txs)) { + size_t idx, totlen = 0; + json_t *p; + + json_array_foreach(txs, idx, p) { + const int tx = work->tx_count % POK_MAX_TXS; + const char* hexstr = json_string_value(p); + size_t txlen = strlen(hexstr)/2; + work->tx_count++; + if (work->tx_count > POK_MAX_TXS || txlen >= POK_MAX_TX_SZ) { + // when tx is too big, just reset usepok for the bloc + usepok = 0; + applog(LOG_WARNING, "large bloc ignored, txs > %d, len: %u", + POK_MAX_TXS, txlen); + work->tx_count = 0; + break; + } + hex2bin((uchar*)work->txs[tx].data, hexstr, min(txlen, POK_MAX_TX_SZ)); + work->txs[tx].len = txlen; + totlen += txlen; + } + if (opt_debug) + applog(LOG_DEBUG, "bloc txs: %u, total len: %u", work->tx_count, totlen); + } + } + json_t *jr = json_object_get(val, "noncerange"); if (jr) { const char * hexstr = json_string_value(jr); @@ -1943,11 +1974,10 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; - case ALGO_ZR5: { - rc = scanhash_zr5(thr_id, work.data, work.target, - max_nonce, &hashes_done); + case ALGO_ZR5: + rc = scanhash_zr5(thr_id, &work, max_nonce, &hashes_done); break; - } + default: /* should never happen */ goto out; diff --git a/miner.h b/miner.h index b349669..b2e225e 100644 --- a/miner.h +++ b/miner.h @@ -258,6 +258,8 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len); #define HAVE_SHA256_4WAY 0 #define HAVE_SHA256_8WAY 0 +struct work; + extern int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -372,8 +374,7 @@ extern int scanhash_x17(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_zr5(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, +extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); /* api related */ @@ -485,6 +486,7 @@ extern char *opt_proxy; extern long opt_proxy_type; extern bool use_syslog; extern bool use_colors; +extern int usepok; extern pthread_mutex_t applog_lock; extern struct thr_info *thr_info; extern int longpoll_thr_id; @@ -597,6 +599,13 @@ struct stratum_ctx { int srvtime_diff; }; +#define POK_MAX_TXS 8 +#define POK_MAX_TX_SZ 8192U +struct tx { + uint8_t data[POK_MAX_TX_SZ]; + uint32_t len; +}; + struct work { uint32_t data[32]; uint32_t target[8]; @@ -617,8 +626,16 @@ struct work { uint32_t scanned_from; uint32_t scanned_to; + + /* pok getwork txs */ + uint32_t tx_count; + struct tx txs[POK_MAX_TXS]; }; +#define POK_BOOL_MASK 0x00008000 +#define POK_DATA_MASK 0xFFFF0000 + + #define MAX_POOLS 8 struct pool_infos { uint8_t id; diff --git a/zr5.cu b/zr5.cu index c3c3c9c..e472add 100644 --- a/zr5.cu +++ b/zr5.cu @@ -33,6 +33,9 @@ static uint32_t* d_groes[MAX_GPUS]; static uint32_t* d_jh512[MAX_GPUS]; static uint32_t* d_skein[MAX_GPUS]; +static uint8_t* d_txs[MAX_GPUS]; +__constant__ uint16_t c_txlens[POK_MAX_TXS]; + __constant__ uint8_t c_permut[24][4]; static const uint8_t permut[24][4] = { {0, 1, 2, 3}, @@ -112,8 +115,8 @@ extern "C" void zr5hash(void *output, const void *input) extern "C" void zr5hash_pok(void *output, uint32_t *pdata) { - const uint32_t version = pdata[0] & (~POK_DATA_MASK); uint32_t _ALIGN(64) hash[8]; + const uint32_t version = (pdata[0] & (~POK_DATA_MASK)) | (usepok ? POK_BOOL_MASK : 0); pdata[0] = version; zr5hash(hash, pdata); @@ -208,7 +211,7 @@ void zr5_move_data_to_hash(int thr_id, uint32_t threads, int rnd) __global__ __launch_bounds__(128, 8) -void zr5_get_poks_gpu(uint32_t threads, uint32_t** const d_buffers, uint8_t* const d_permut, uint32_t *d_hash, uint16_t *d_poks) +void zr5_get_poks_gpu(uint32_t threads, uint32_t** const d_buffers, uint8_t* const d_permut, uint16_t *d_poks) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -223,14 +226,54 @@ void zr5_get_poks_gpu(uint32_t threads, uint32_t** const d_buffers, uint8_t* con } } +__global__ __launch_bounds__(128, 6) +void zr5_get_poks_xor_gpu(uint32_t threads, uint32_t** const d_buffers, uint8_t* d_permut, uint16_t* d_poks, uint8_t* d_txs, uint8_t txs) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint8_t norder = d_permut[thread]; + const uint8_t algo = c_permut[norder][3]; + const uint8_t ntx = norder % txs; // generally 0 on testnet... + const uint32_t offset = thread * 16U; // 64 / 4; + uint32_t* hash = (uint32_t*) (d_buffers[algo] + offset); + uint32_t randNdx = hash[1] % c_txlens[ntx]; + uint8_t* ptx = &d_txs[POK_MAX_TX_SZ*ntx] + randNdx; + uint32_t x = 0x100UL * ptx[3] + ptx[2]; + + d_poks[thread] = x ^ (hash[2] >> 16); + } +} + __host__ -void zr5_get_poks(int thr_id, uint32_t threads, uint16_t* d_poks) +void zr5_get_poks(int thr_id, uint32_t threads, uint16_t* d_poks, struct work* work) { const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - zr5_get_poks_gpu <<>> (threads, d_buffers[thr_id], d_permut[thr_id], d_hash[thr_id], d_poks); + uint8_t txs = (uint8_t) work->tx_count; + + if (txs && usepok) + { + uint32_t txlens[POK_MAX_TXS]; + uint8_t* txdata = (uint8_t*) calloc(POK_MAX_TXS, POK_MAX_TX_SZ); + if (!txdata) { + applog(LOG_ERR, "%s: error, memory alloc failure", __func__); + return; + } + // create blocs to copy on device + for (uint8_t tx=0; tx < txs; tx++) { + txlens[tx] = (uint32_t) (work->txs[tx].len - 3U); + memcpy(&txdata[POK_MAX_TX_SZ*tx], work->txs[tx].data, min(POK_MAX_TX_SZ, txlens[tx]+3U)); + } + cudaMemcpy(d_txs[thr_id], txdata, txs * POK_MAX_TX_SZ, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_txlens, txlens, txs * sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + zr5_get_poks_xor_gpu <<>> (threads, d_buffers[thr_id], d_permut[thr_id], d_poks, d_txs[thr_id], txs); + free(txdata); + } else { + zr5_get_poks_gpu <<>> (threads, d_buffers[thr_id], d_permut[thr_id], d_poks); + } } @@ -285,18 +328,21 @@ extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget, +extern "C" int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t _ALIGN(64) tmpdata[20]; - const uint32_t version = pdata[0] & (~POK_DATA_MASK); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t oldp0 = pdata[0]; + const uint32_t version = (oldp0 & (~POK_DATA_MASK)) | (usepok ? POK_BOOL_MASK : 0); const uint32_t first_nonce = pdata[19]; uint32_t throughput = device_intensity(thr_id, __func__, 1U << 18); throughput = min(throughput, (1U << 20)-1024); throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x0000ff; memcpy(tmpdata, pdata, 80); @@ -319,6 +365,8 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget cudaMalloc(&d_jh512[thr_id], 64 * throughput); cudaMalloc(&d_skein[thr_id], 64 * throughput); + cudaMalloc(&d_txs[thr_id], POK_MAX_TXS * POK_MAX_TX_SZ); + jackpot_keccak512_cpu_init(thr_id, throughput); quark_blake512_cpu_init(thr_id, throughput); @@ -354,7 +402,7 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget } // store on device d_poks all hash[0] prefixes - zr5_get_poks(thr_id, throughput, d_poks[thr_id]); + zr5_get_poks(thr_id, throughput, d_poks[thr_id], work); // Keccak512 with pok zr5_keccak512_cpu_hash_pok(thr_id, throughput, pdata[19], pdata, d_hash[thr_id], d_poks[thr_id]); @@ -374,7 +422,6 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget if (foundNonce != UINT32_MAX) { uint32_t vhash64[8]; - uint32_t oldp0 = pdata[0]; uint32_t oldp19 = pdata[19]; uint32_t offset = foundNonce - pdata[19]; uint32_t pok = 0; @@ -406,9 +453,6 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget } else { applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); - // reinit the card.. segfault so no - // cuda_reset_device(thr_id, init); - pdata[19]++; pdata[0] = oldp0; } @@ -417,6 +461,8 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + pdata[0] = oldp0; + *hashes_done = pdata[19] - first_nonce + 1; return 0; }