From 374174c7c8db268e8264f4b5adde639d3e92c4bd Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 5 Jul 2016 11:06:58 +0200 Subject: [PATCH] groestl: same, remove useless host alloc --- cuda_groestlcoin.cu | 34 +++++++++++++++------------------- cuda_groestlcoin.h | 2 +- cuda_myriadgroestl.cu | 16 ++++++++++------ groestlcoin.cpp | 9 ++------- 4 files changed, 28 insertions(+), 33 deletions(-) diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index eb4f833..5c8fcf9 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -5,6 +5,11 @@ #include "cuda_helper.h" +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 500 +#define __byte_perm(x,y,n) x +#endif + #include "miner.h" __constant__ uint32_t pTarget[8]; // Single GPU @@ -85,9 +90,8 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t } } - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; + if(rc && resNounce[0] > nounce) + resNounce[0] = nounce; } } #endif @@ -111,9 +115,8 @@ void groestlcoin_cpu_free(int thr_id) __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) { - uint32_t msgBlock[32]; + uint32_t msgBlock[32] = { 0 }; - memset(msgBlock, 0, sizeof(uint32_t) * 32); memcpy(&msgBlock[0], data, 80); // Erweitere die Nachricht auf den Nachrichtenblock (padding) @@ -125,18 +128,14 @@ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) // auf der GPU ausgeführt) // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( groestlcoin_gpu_msg, - msgBlock, - 128); + cudaMemcpyToSymbol(groestlcoin_gpu_msg, msgBlock, 128); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemcpyToSymbol(pTarget, pTargetIn, 32); } __host__ -void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) +void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce) { uint32_t threadsperblock = 256; @@ -144,13 +143,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl int factor = 4; - // berechne wie viele Thread Blocks wir brauchen + // berechne wie viele Thread Blocks wir brauchen dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - int dev_id = device_map[thr_id]; if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)"); @@ -158,10 +154,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo } cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - groestlcoin_gpu_hash_quad<<>>(threads, startNounce, d_resultNonce[thr_id]); + groestlcoin_gpu_hash_quad <<>> (threads, startNounce, d_resultNonce[thr_id]); // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, 0, thr_id); + // MyStreamSynchronize(NULL, 0, thr_id); - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(resNonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); } diff --git a/cuda_groestlcoin.h b/cuda_groestlcoin.h index d4fa42e..e38f086 100644 --- a/cuda_groestlcoin.h +++ b/cuda_groestlcoin.h @@ -4,6 +4,6 @@ void groestlcoin_cpu_init(int thr_id, uint32_t threads); void groestlcoin_cpu_free(int thr_id); void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn); -void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); +void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce); #endif \ No newline at end of file diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index 7598228..c8f123f 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -18,7 +18,7 @@ // globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU -uint32_t *d_outputHashes[MAX_GPUS]; +static uint32_t *d_outputHashes[MAX_GPUS]; static uint32_t *d_resultNonce[MAX_GPUS]; __constant__ uint32_t myriadgroestl_gpu_msg[32]; @@ -225,8 +225,9 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message) message[k] = SWAB32(hash[k]); } -__global__ void __launch_bounds__(256, 4) - myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer) +__global__ +__launch_bounds__(256, 4) +void myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer) { #if __CUDA_ARCH__ >= 300 // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen @@ -263,8 +264,8 @@ __global__ void __launch_bounds__(256, 4) #endif } -__global__ void - myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer) +__global__ +void myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer) { #if __CUDA_ARCH__ >= 300 uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -322,6 +323,9 @@ void myriadgroestl_cpu_init(int thr_id, uint32_t threads) myr_sha256_cpu_constantTable, sizeof(uint32_t) * 64 ); + // to check if the binary supports SM3+ + cuda_get_arch(thr_id); + cudaMalloc(&d_outputHashes[thr_id], (size_t) 64 * threads); cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); } @@ -379,7 +383,7 @@ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, myriadgroestl_gpu_hash_quad2 <<< grid2, block >>> (threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]); // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, 0, thr_id); + //MyStreamSynchronize(NULL, 0, thr_id); cudaMemcpy(resNounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); } diff --git a/groestlcoin.cpp b/groestlcoin.cpp index 1b0ec4e..96a6d38 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -36,8 +36,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi uint32_t throughput = cuda_default_throughput(thr_id, 1 << 19); // 256*256*8 if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce); - uint32_t *outputHash = (uint32_t*)malloc((size_t) 64* throughput); - if (opt_benchmark) ptarget[7] = 0x001f; @@ -66,7 +64,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi *hashes_done = pdata[19] - start_nonce + throughput; // GPU hash - groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce); + groestlcoin_cpu_hash(thr_id, throughput, pdata[19], &foundNounce); if (foundNounce < UINT32_MAX && bench_algo < 0) { @@ -77,9 +75,8 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { work_set_target_ratio(work, vhash); pdata[19] = foundNounce; - free(outputHash); return true; - } else { + } else if (vhash[7] > ptarget[7]) { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce); } } @@ -93,8 +90,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi } while (!work_restart[thr_id].restart); *hashes_done = pdata[19] - start_nonce; - - free(outputHash); return 0; }