Browse Source

groestl: same, remove useless host alloc

2upstream
Tanguy Pruvot 8 years ago
parent
commit
374174c7c8
  1. 34
      cuda_groestlcoin.cu
  2. 2
      cuda_groestlcoin.h
  3. 16
      cuda_myriadgroestl.cu
  4. 9
      groestlcoin.cpp

34
cuda_groestlcoin.cu

@@ -5,6 +5,11 @@
#include "cuda_helper.h"
#ifdef __INTELLISENSE__
#define __CUDA_ARCH__ 500
#define __byte_perm(x,y,n) x
#endif
#include "miner.h"
__constant__ uint32_t pTarget[8]; // Single GPU
@@ -85,9 +90,8 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t
}
}
if(rc == true)
if(resNounce[0] > nounce)
resNounce[0] = nounce;
if(rc && resNounce[0] > nounce)
resNounce[0] = nounce;
}
}
#endif
@@ -111,9 +115,8 @@ void groestlcoin_cpu_free(int thr_id)
__host__
void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
{
uint32_t msgBlock[32];
uint32_t msgBlock[32] = { 0 };
memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 80);
// Erweitere die Nachricht auf den Nachrichtenblock (padding)
@@ -125,18 +128,14 @@ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
// auf der GPU ausgeführt)
// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
cudaMemcpyToSymbol( groestlcoin_gpu_msg,
msgBlock,
128);
cudaMemcpyToSymbol(groestlcoin_gpu_msg, msgBlock, 128);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
cudaMemcpyToSymbol( pTarget,
pTargetIn,
sizeof(uint32_t) * 8 );
cudaMemcpyToSymbol(pTarget, pTargetIn, 32);
}
__host__
void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce)
{
uint32_t threadsperblock = 256;
@@ -144,13 +143,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
int factor = 4;
// berechne wie viele Thread Blocks wir brauchen
// berechne wie viele Thread Blocks wir brauchen
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
int dev_id = device_map[thr_id];
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
@@ -158,10 +154,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
}
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
groestlcoin_gpu_hash_quad <<<grid, block>>> (threads, startNounce, d_resultNonce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
// MyStreamSynchronize(NULL, 0, thr_id);
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(resNonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
}

2
cuda_groestlcoin.h

@@ -4,6 +4,6 @@
void groestlcoin_cpu_init(int thr_id, uint32_t threads);
void groestlcoin_cpu_free(int thr_id);
void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce);
#endif

16
cuda_myriadgroestl.cu

@@ -18,7 +18,7 @@
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU
uint32_t *d_outputHashes[MAX_GPUS];
static uint32_t *d_outputHashes[MAX_GPUS];
static uint32_t *d_resultNonce[MAX_GPUS];
__constant__ uint32_t myriadgroestl_gpu_msg[32];
@@ -225,8 +225,9 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
message[k] = SWAB32(hash[k]);
}
__global__ void __launch_bounds__(256, 4)
myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
__global__
__launch_bounds__(256, 4)
void myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
{
#if __CUDA_ARCH__ >= 300
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
@@ -263,8 +264,8 @@ __global__ void __launch_bounds__(256, 4)
#endif
}
__global__ void
myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
__global__
void myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
{
#if __CUDA_ARCH__ >= 300
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -322,6 +323,9 @@ void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
myr_sha256_cpu_constantTable,
sizeof(uint32_t) * 64 );
// to check if the binary supports SM3+
cuda_get_arch(thr_id);
cudaMalloc(&d_outputHashes[thr_id], (size_t) 64 * threads);
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
}
@@ -379,7 +383,7 @@ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce,
myriadgroestl_gpu_hash_quad2 <<< grid2, block >>> (threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
//MyStreamSynchronize(NULL, 0, thr_id);
cudaMemcpy(resNounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
}

9
groestlcoin.cpp

@@ -36,8 +36,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
uint32_t throughput = cuda_default_throughput(thr_id, 1 << 19); // 256*256*8
if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
uint32_t *outputHash = (uint32_t*)malloc((size_t) 64* throughput);
if (opt_benchmark)
ptarget[7] = 0x001f;
@@ -66,7 +64,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
*hashes_done = pdata[19] - start_nonce + throughput;
// GPU hash
groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
groestlcoin_cpu_hash(thr_id, throughput, pdata[19], &foundNounce);
if (foundNounce < UINT32_MAX && bench_algo < 0)
{
@@ -77,9 +75,8 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work_set_target_ratio(work, vhash);
pdata[19] = foundNounce;
free(outputHash);
return true;
} else {
} else if (vhash[7] > ptarget[7]) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
}
}
@@ -93,8 +90,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - start_nonce;
free(outputHash);
return 0;
}

Loading…
Cancel
Save