From ebd23bcc664f03a806bacf0c316b6c5220e3f3f5 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 8 Mar 2015 22:09:09 +0100 Subject: [PATCH] whirlpoolx: real fix for multi gpus Main problem was the arrays allocations which should be made per cpu Signed-off-by: Tanguy Pruvot --- x11/x11.cu | 2 +- x15/cuda_whirlpoolx.cu | 40 +++++++++++++++++++++------------------- x15/whirlpoolx.cu | 25 ++++++++++++------------- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/x11/x11.cu b/x11/x11.cu index 61acb7d..3efc48b 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -133,7 +133,7 @@ extern "C" void x11hash(void *output, const void *input) cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \ printf("%s %08x %08x %08x %08x...\n", algo, htobe32(debugbuf[0]), htobe32(debugbuf[1]), \ htobe32(debugbuf[2]), htobe32(debugbuf[3])); \ - cudaFree(debugbuf); \ + cudaFreeHost(debugbuf); \ } \ } #else diff --git a/x15/cuda_whirlpoolx.cu b/x15/cuda_whirlpoolx.cu index efe1a57..f772a7f 100644 --- a/x15/cuda_whirlpoolx.cu +++ b/x15/cuda_whirlpoolx.cu @@ -14,8 +14,8 @@ __constant__ uint64_t c_xtra[8]; __constant__ uint64_t c_tmp[72]; __constant__ uint64_t pTarget[4]; -uint32_t *d_wxnounce[MAX_GPUS]; -uint32_t *d_WXNonce[MAX_GPUS]; +static uint32_t *h_wxnounce[MAX_GPUS] = { 0 }; +static uint32_t *d_WXNonce[MAX_GPUS] = { 0 }; /** * Whirlpool CUDA kernel implementation. @@ -151,8 +151,8 @@ static uint64_t ROUND_ELT(const uint64_t* sharedMemory, const uint64_t* __restri ROUND(table, in, out, key[0], key[1], key[2],key[3], key[4], key[5], key[6], key[7]) \ TRANSFER(in, out) -uint64_t* d_xtra; -uint64_t* d_tmp; +static uint64_t* d_xtra[MAX_GPUS] = { 0 }; +static uint64_t* d_tmp[MAX_GPUS] = { 0 }; __device__ __forceinline__ static void whirlpoolx_getShared(uint64_t* sharedMemory) @@ -172,12 +172,12 @@ static void whirlpoolx_getShared(uint64_t* sharedMemory) __global__ -void whirlpoolx_gpu_precompute(int threads, uint64_t* d_xtra, uint64_t* d_tmp) +void whirlpoolx_gpu_precompute(uint32_t threads, uint64_t* d_xtra, uint64_t* d_tmp) { __shared__ uint64_t sharedMemory[2048]; whirlpoolx_getShared(sharedMemory); - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint64_t n[8]; @@ -534,14 +534,14 @@ void whirlpoolx_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *resNo } __host__ -extern void whirlpoolx_cpu_init(int thr_id, int threads) +extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads) { cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); cudaMalloc(&d_WXNonce[thr_id], sizeof(uint32_t)); - cudaMallocHost(&d_wxnounce[thr_id], sizeof(uint32_t)); - cudaMalloc((void **)&d_xtra, 8 * sizeof(uint64_t)); - CUDA_SAFE_CALL(cudaMalloc((void **)&d_tmp, 8 * 9 * sizeof(uint64_t))); + cudaMallocHost(&h_wxnounce[thr_id], sizeof(uint32_t)); + cudaMalloc(&d_xtra[thr_id], 8 * sizeof(uint64_t)); + CUDA_SAFE_CALL(cudaMalloc(&d_tmp[thr_id], 8 * 9 * sizeof(uint64_t))); // d_tmp[threadIdx.x+64] (7+64) } __host__ @@ -550,21 +550,22 @@ void whirlpoolx_setBlock_80(void *pdata, const void *ptarget) uint64_t PaddedMessage[16]; memcpy(PaddedMessage, pdata, 80); memset((uint8_t*)&PaddedMessage+80, 0, 48); - *(uint8_t*)(&PaddedMessage+80) = 0x80; /* ending */ + ((uint8_t*)PaddedMessage)[80] = 0x80; /* ending */ cudaMemcpyToSymbol(pTarget, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); } __host__ -void whirlpoolx_precompute() +void whirlpoolx_precompute(int thr_id) { dim3 grid(1); dim3 block(256); - whirlpoolx_gpu_precompute <<>>(8, &d_xtra[0], &d_tmp[0]); + whirlpoolx_gpu_precompute <<>>(8, d_xtra[thr_id], d_tmp[thr_id]); cudaThreadSynchronize(); - cudaMemcpyToSymbol(c_xtra, d_xtra, 8 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice); - cudaMemcpyToSymbol(c_tmp, d_tmp, 8 * 9 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice); + + cudaMemcpyToSymbol(c_xtra, d_xtra[thr_id], 8 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_tmp, d_tmp[thr_id], 8 * 9 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice)); } __host__ @@ -574,10 +575,11 @@ uint32_t whirlpoolx_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce) dim3 block(threadsPerBlock); cudaMemset(d_WXNonce[thr_id], 0xff, sizeof(uint32_t)); - whirlpoolx_gpu_hash<<>>(threads, startNounce,d_WXNonce[thr_id]); + whirlpoolx_gpu_hash<<>>(threads, startNounce, d_WXNonce[thr_id]); cudaThreadSynchronize(); - cudaMemcpy(d_wxnounce[thr_id], d_WXNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - return *d_wxnounce[thr_id]; + cudaMemcpy(h_wxnounce[thr_id], d_WXNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + return *(h_wxnounce[thr_id]); } diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu index 5de1082..6a8322e 100644 --- a/x15/whirlpoolx.cu +++ b/x15/whirlpoolx.cu @@ -1,20 +1,20 @@ /* * whirlpool routine (djm) - * whirlpoolx routine (provos alexis) + * whirlpoolx routine (provos alexis, tpruvot) */ extern "C" { #include "sph/sph_whirlpool.h" -#include "miner.h" } +#include "miner.h" #include "cuda_helper.h" static uint32_t *d_hash[MAX_GPUS]; -extern void whirlpoolx_cpu_init(int thr_id, int threads); +extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads); extern void whirlpoolx_setBlock_80(void *pdata, const void *ptarget); extern uint32_t whirlpoolx_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); -extern void whirlpoolx_precompute(); +extern void whirlpoolx_precompute(int thr_id); // CPU Hash function extern "C" void whirlxHash(void *state, const void *input) @@ -24,28 +24,26 @@ extern "C" void whirlxHash(void *state, const void *input) unsigned char hash[64]; unsigned char hash_xored[32]; - memset(hash, 0, sizeof hash); - sph_whirlpool_init(&ctx_whirlpool); sph_whirlpool(&ctx_whirlpool, input, 80); sph_whirlpool_close(&ctx_whirlpool, hash); - - for (uint32_t i = 0; i < 32; i++){ - hash_xored[i] = hash[i] ^ hash[i + 16]; + // compress the 48 first bytes of the hash to 32 + for (int i = 0; i < 32; i++) { + hash_xored[i] = hash[i] ^ hash[i + 16]; } memcpy(state, hash_xored, 32); } static bool init[MAX_GPUS] = { 0 }; - extern "C" int scanhash_whirlpoolx(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 22); + int intensity = is_windows() ? 18 : 22; + uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) @@ -54,7 +52,8 @@ extern "C" int scanhash_whirlpoolx(int thr_id, uint32_t *pdata, const uint32_t * if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - cudaMalloc(&d_hash[thr_id], 64 * throughput); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); + whirlpoolx_cpu_init(thr_id, throughput); init[thr_id] = true; @@ -65,7 +64,7 @@ extern "C" int scanhash_whirlpoolx(int thr_id, uint32_t *pdata, const uint32_t * } whirlpoolx_setBlock_80((void*)endiandata, ptarget); - whirlpoolx_precompute(); + whirlpoolx_precompute(thr_id); do { uint32_t foundNonce = whirlpoolx_cpu_hash(thr_id, throughput, pdata[19]); if (foundNonce != UINT32_MAX)