
groestl: same, remove useless host alloc

Tanguy Pruvot, 9 years ago · commit 374174c7c8
Changed files:
  1. cuda_groestlcoin.cu (34 lines changed)
  2. cuda_groestlcoin.h (2 lines changed)
  3. cuda_myriadgroestl.cu (16 lines changed)
  4. groestlcoin.cpp (9 lines changed)

cuda_groestlcoin.cu (34 lines changed)

@@ -5,6 +5,11 @@
 #include "cuda_helper.h"
+#ifdef __INTELLISENSE__
+#define __CUDA_ARCH__ 500
+#define __byte_perm(x,y,n) x
+#endif
+
 #include "miner.h"
 __constant__ uint32_t pTarget[8]; // Single GPU
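
The new __INTELLISENSE__ guard exists only for the editor: Visual Studio's parser defines __INTELLISENSE__ but none of the CUDA built-ins, so stubbing __CUDA_ARCH__ and __byte_perm lets device code be browsed without parse errors, while nvcc never sees these defines. The same pattern, generalized (the __shfl stub is a hypothetical extra, not in the commit):

    // Editor-only stubs; nvcc does not define __INTELLISENSE__, so this block
    // is invisible to the real compiler.
    #ifdef __INTELLISENSE__
    #define __CUDA_ARCH__ 500
    #define __byte_perm(x,y,n) (x)        // real intrinsic selects bytes from x:y via selector n
    #define __shfl(x, lane, width) (x)    // hypothetical extra stub, same idea
    #endif
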
@@ -85,9 +90,8 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t
 			}
 		}
-		if(rc == true)
-			if(resNounce[0] > nounce)
-				resNounce[0] = nounce;
+		if(rc && resNounce[0] > nounce)
+			resNounce[0] = nounce;
 	}
 }
 #endif
@@ -111,9 +115,8 @@ void groestlcoin_cpu_free(int thr_id)
 __host__
 void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
-	uint32_t msgBlock[32];
-	memset(msgBlock, 0, sizeof(uint32_t) * 32);
+	uint32_t msgBlock[32] = { 0 };
 	memcpy(&msgBlock[0], data, 80);
 	// expand the message to the full message block (padding)
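
The msgBlock change swaps an explicit memset for C++ aggregate initialization: = { 0 } value-initializes all remaining elements, so the whole 128-byte block is zeroed before the 80-byte header is copied in. A trivial standalone check (plain C++):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        uint32_t msgBlock[32] = { 0 };   // elements 1..31 are zero-initialized too
        for (int i = 0; i < 32; i++)
            assert(msgBlock[i] == 0);
        return 0;
    }
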
@@ -125,18 +128,14 @@ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 	// executed on the GPU)
 	// set the block header (the correct nonce and the Hefty hash are still missing in it)
-	cudaMemcpyToSymbol( groestlcoin_gpu_msg,
-		msgBlock,
-		128);
+	cudaMemcpyToSymbol(groestlcoin_gpu_msg, msgBlock, 128);
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	cudaMemcpyToSymbol( pTarget,
-		pTargetIn,
-		sizeof(uint32_t) * 8 );
+	cudaMemcpyToSymbol(pTarget, pTargetIn, 32);
 }
 __host__
-void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
+void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce)
 {
 	uint32_t threadsperblock = 256;
@@ -144,13 +143,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
 	// with the quad functions we now need 4 threads per hash, hence the factor of 4 in the block count
 	int factor = 4;
 	// compute how many thread blocks we need
 	dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
 	dim3 block(threadsperblock);
-	// size of the dynamic shared memory region
-	size_t shared_size = 0;
 	int dev_id = device_map[thr_id];
 	if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
 		gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
@@ -158,10 +154,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
 	}
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+	groestlcoin_gpu_hash_quad <<<grid, block>>> (threads, startNounce, d_resultNonce[thr_id]);
 	// strategic sleep call to reduce CPU load
-	MyStreamSynchronize(NULL, 0, thr_id);
-	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	// MyStreamSynchronize(NULL, 0, thr_id);
+	cudaMemcpy(resNonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
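
The factor-4 comment above is the whole story of the launch geometry: four CUDA threads cooperate on each Groestl hash, so the grid must cover four times as many threads as nonces. A standalone host-side sketch of that arithmetic, using the default throughput visible in groestlcoin.cpp below (plain C++, names copied from the diff for illustration):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint32_t threads = 1 << 19;        // default throughput in groestlcoin.cpp
        uint32_t threadsperblock = 256;
        int factor = 4;                    // 4 CUDA threads cooperate on one hash
        uint32_t grid = factor * ((threads + threadsperblock - 1) / threadsperblock);
        // 4 * (524288 / 256) = 8192 blocks, i.e. 2097152 CUDA threads for 524288 nonces
        printf("grid=%u block=%u\n", grid, threadsperblock);
        return 0;
    }
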

cuda_groestlcoin.h (2 lines changed)

@@ -4,6 +4,6 @@
 void groestlcoin_cpu_init(int thr_id, uint32_t threads);
 void groestlcoin_cpu_free(int thr_id);
 void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
-void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
+void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce);
 #endif
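
The exported API now returns only the winning nonce. Both sides of it rely on one convention, visible across the hunks above: d_resultNonce is cudaMemset to 0xFF so it reads back as UINT32_MAX when nothing was found, the kernel only lowers it toward the smallest matching nonce, and the caller treats any value below UINT32_MAX as a candidate share. A self-contained CUDA sketch of that protocol, with a toy predicate standing in for the real target comparison:

    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void scan(uint32_t startNounce, uint32_t *resNounce)
    {
        uint32_t nounce = startNounce + blockDim.x * blockIdx.x + threadIdx.x;
        bool rc = ((nounce & 0xFFFF) == 0xBEEF);   // toy stand-in for "hash meets target"
        if (rc && resNounce[0] > nounce)           // keep the lowest hit (benign race,
            resNounce[0] = nounce;                 //  mirroring the kernel in this commit)
    }

    int main()
    {
        uint32_t *d_res, found;
        cudaMalloc(&d_res, sizeof(uint32_t));
        cudaMemset(d_res, 0xFF, sizeof(uint32_t)); // sentinel reads back as UINT32_MAX
        scan<<<4096, 256>>>(0, d_res);
        cudaMemcpy(&found, d_res, sizeof(uint32_t), cudaMemcpyDeviceToHost);
        if (found < UINT32_MAX) printf("found nonce %08x\n", found);
        else                    printf("no nonce in range\n");
        cudaFree(d_res);
        return 0;
    }
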

cuda_myriadgroestl.cu (16 lines changed)

@@ -18,7 +18,7 @@
 // global memory for all threads' Hefty hashes
 __constant__ uint32_t pTarget[8]; // Single GPU
-uint32_t *d_outputHashes[MAX_GPUS];
+static uint32_t *d_outputHashes[MAX_GPUS];
 static uint32_t *d_resultNonce[MAX_GPUS];
 __constant__ uint32_t myriadgroestl_gpu_msg[32];
@@ -225,8 +225,9 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
 		message[k] = SWAB32(hash[k]);
 }
-__global__ void __launch_bounds__(256, 4)
-myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
+__global__
+__launch_bounds__(256, 4)
+void myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
 {
 #if __CUDA_ARCH__ >= 300
 	// divide by 4 because 4 threads together compute one hash
@@ -263,8 +264,8 @@ __global__ void __launch_bounds__(256, 4)
 #endif
 }
-__global__ void
-myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
+__global__
+void myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
 {
 #if __CUDA_ARCH__ >= 300
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -322,6 +323,9 @@ void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
 		myr_sha256_cpu_constantTable,
 		sizeof(uint32_t) * 64 );
+
+	// to check if the binary supports SM3+
+	cuda_get_arch(thr_id);
 	cudaMalloc(&d_outputHashes[thr_id], (size_t) 64 * threads);
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }
@@ -379,7 +383,7 @@ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce,
 	myriadgroestl_gpu_hash_quad2 <<< grid2, block >>> (threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
 	// strategic sleep call to reduce CPU load
-	MyStreamSynchronize(NULL, 0, thr_id);
+	//MyStreamSynchronize(NULL, 0, thr_id);
 	cudaMemcpy(resNounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
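
The kernel-qualifier reshuffle above is cosmetic, but __launch_bounds__(256, 4) itself matters: it promises the compiler the kernel is never launched with more than 256 threads per block and asks for at least 4 resident blocks per multiprocessor, which bounds per-thread register use. A minimal compilable sketch of the same declaration shape (placeholder body and names, not the Groestl round function):

    #include <cstdint>
    #include <cuda_runtime.h>

    __global__
    __launch_bounds__(256, 4)   // <= 256 threads/block, target >= 4 blocks/SM
    void dummy_quad(uint32_t threads, uint32_t *out)
    {
        uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
        if (thread < threads)
            out[thread] = thread >> 2;   // each group of 4 threads shares one hash index
    }

    int main()
    {
        uint32_t *d_out;
        cudaMalloc(&d_out, 1024 * sizeof(uint32_t));
        dummy_quad<<<4, 256>>>(1024, d_out);
        cudaDeviceSynchronize();
        cudaFree(d_out);
        return 0;
    }
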

groestlcoin.cpp (9 lines changed)

@@ -36,8 +36,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 	uint32_t throughput = cuda_default_throughput(thr_id, 1 << 19); // 256*256*8
 	if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
-	uint32_t *outputHash = (uint32_t*)malloc((size_t) 64* throughput);
 	if (opt_benchmark)
 		ptarget[7] = 0x001f;
@@ -66,7 +64,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 		*hashes_done = pdata[19] - start_nonce + throughput;
 		// GPU hash
-		groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
+		groestlcoin_cpu_hash(thr_id, throughput, pdata[19], &foundNounce);
 		if (foundNounce < UINT32_MAX && bench_algo < 0)
 		{
@@ -77,9 +75,8 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
 				work_set_target_ratio(work, vhash);
 				pdata[19] = foundNounce;
-				free(outputHash);
 				return true;
-			} else {
+			} else if (vhash[7] > ptarget[7]) {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
 			}
 		}
@@ -93,8 +90,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 	} while (!work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - start_nonce;
-	free(outputHash);
 	return 0;
 }
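
This file is what the commit title means by "useless host alloc": outputHash was malloc'd on every scanhash call, threaded through groestlcoin_cpu_hash, and freed on every exit path, yet the trimmed signature shows nothing consumed it. For scale, a quick check of what was being reserved, assuming the default throughput shown above:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        size_t throughput = 1u << 19;      // cuda_default_throughput default above
        size_t bytes = 64 * throughput;    // what the removed malloc reserved
        printf("%zu bytes = %zu MiB per scanhash call\n", bytes, bytes >> 20);
        return 0;                          // prints: 33554432 bytes = 32 MiB ...
    }
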
