Browse Source

groestl: same, remove useless host alloc

2upstream
Tanguy Pruvot 8 years ago
parent
commit
374174c7c8
  1. 34
      cuda_groestlcoin.cu
  2. 2
      cuda_groestlcoin.h
  3. 16
      cuda_myriadgroestl.cu
  4. 9
      groestlcoin.cpp

34
cuda_groestlcoin.cu

@@ -5,6 +5,11 @@
#include "cuda_helper.h"
#ifdef __INTELLISENSE__
#define __CUDA_ARCH__ 500
#define __byte_perm(x,y,n) x
#endif
#include "miner.h"
__constant__ uint32_t pTarget[8]; // Single GPU
@@ -85,9 +90,8 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t
}
}
if(rc == true)
if(resNounce[0] > nounce)
resNounce[0] = nounce;
if(rc && resNounce[0] > nounce)
resNounce[0] = nounce;
}
}
#endif
@@ -111,9 +115,8 @@ void groestlcoin_cpu_free(int thr_id)
__host__
void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
{
uint32_t msgBlock[32];
uint32_t msgBlock[32] = { 0 };
memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 80);
// Erweitere die Nachricht auf den Nachrichtenblock (padding)
@@ -125,18 +128,14 @@ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
// auf der GPU ausgeführt)
// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
cudaMemcpyToSymbol( groestlcoin_gpu_msg,
msgBlock,
128);
cudaMemcpyToSymbol(groestlcoin_gpu_msg, msgBlock, 128);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
cudaMemcpyToSymbol( pTarget,
pTargetIn,
sizeof(uint32_t) * 8 );
cudaMemcpyToSymbol(pTarget, pTargetIn, 32);
}
__host__
void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce)
{
uint32_t threadsperblock = 256;
@@ -144,13 +143,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
int factor = 4;
// berechne wie viele Thread Blocks wir brauchen
// berechne wie viele Thread Blocks wir brauchen
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
int dev_id = device_map[thr_id];
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
@@ -158,10 +154,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
}
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
groestlcoin_gpu_hash_quad <<<grid, block>>> (threads, startNounce, d_resultNonce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
// MyStreamSynchronize(NULL, 0, thr_id);
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(resNonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
}

2
cuda_groestlcoin.h

@@ -4,6 +4,6 @@
void groestlcoin_cpu_init(int thr_id, uint32_t threads);
void groestlcoin_cpu_free(int thr_id);
void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce);
#endif

16
cuda_myriadgroestl.cu

@@ -18,7 +18,7 @@
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU
uint32_t *d_outputHashes[MAX_GPUS];
static uint32_t *d_outputHashes[MAX_GPUS];
static uint32_t *d_resultNonce[MAX_GPUS];
__constant__ uint32_t myriadgroestl_gpu_msg[32];
@@ -225,8 +225,9 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
message[k] = SWAB32(hash[k]);
}
__global__ void __launch_bounds__(256, 4)
myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
__global__
__launch_bounds__(256, 4)
void myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
{
#if __CUDA_ARCH__ >= 300
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
@@ -263,8 +264,8 @@ __global__ void __launch_bounds__(256, 4)
#endif
}
__global__ void
myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
__global__
void myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
{
#if __CUDA_ARCH__ >= 300
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -322,6 +323,9 @@ void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
myr_sha256_cpu_constantTable,
sizeof(uint32_t) * 64 );
// to check if the binary supports SM3+
cuda_get_arch(thr_id);
cudaMalloc(&d_outputHashes[thr_id], (size_t) 64 * threads);
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
}
@@ -379,7 +383,7 @@ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce,
myriadgroestl_gpu_hash_quad2 <<< grid2, block >>> (threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
//MyStreamSynchronize(NULL, 0, thr_id);
cudaMemcpy(resNounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
}

9
groestlcoin.cpp

@@ -36,8 +36,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
uint32_t throughput = cuda_default_throughput(thr_id, 1 << 19); // 256*256*8
if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
uint32_t *outputHash = (uint32_t*)malloc((size_t) 64* throughput);
if (opt_benchmark)
ptarget[7] = 0x001f;
@@ -66,7 +64,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
*hashes_done = pdata[19] - start_nonce + throughput;
// GPU hash
groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
groestlcoin_cpu_hash(thr_id, throughput, pdata[19], &foundNounce);
if (foundNounce < UINT32_MAX && bench_algo < 0)
{
@@ -77,9 +75,8 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work_set_target_ratio(work, vhash);
pdata[19] = foundNounce;
free(outputHash);
return true;
} else {
} else if (vhash[7] > ptarget[7]) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
}
}
@@ -93,8 +90,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - start_nonce;
free(outputHash);
return 0;
}

Loading…
Cancel
Save