From 374174c7c8db268e8264f4b5adde639d3e92c4bd Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 5 Jul 2016 11:06:58 +0200
Subject: [PATCH] groestl: same, remove useless host alloc

---
 cuda_groestlcoin.cu   | 34 +++++++++++++++-------------------
 cuda_groestlcoin.h    |  2 +-
 cuda_myriadgroestl.cu | 16 ++++++++++------
 groestlcoin.cpp       |  9 ++-------
 4 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu
index eb4f833..5c8fcf9 100644
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@@ -5,6 +5,11 @@
 
 #include "cuda_helper.h"
 
+#ifdef __INTELLISENSE__
+#define __CUDA_ARCH__ 500
+#define __byte_perm(x,y,n) x
+#endif
+
 #include "miner.h"
 
 __constant__ uint32_t pTarget[8]; // Single GPU
@@ -85,9 +90,8 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t
 				 }
 			}
 
-			if(rc == true)
-				if(resNounce[0] > nounce)
-					resNounce[0] = nounce;
+			if(rc && resNounce[0] > nounce)
+				resNounce[0] = nounce;
 		}
 	}
 #endif
@@ -111,9 +115,8 @@ void groestlcoin_cpu_free(int thr_id)
 __host__
 void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
-	uint32_t msgBlock[32];
+	uint32_t msgBlock[32] = { 0 };
 
-	memset(msgBlock, 0, sizeof(uint32_t) * 32);
 	memcpy(&msgBlock[0], data, 80);
 
 	// Erweitere die Nachricht auf den Nachrichtenblock (padding)
@@ -125,18 +128,14 @@ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 	// auf der GPU ausgeführt)
 
 	// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
-	cudaMemcpyToSymbol( groestlcoin_gpu_msg,
-						msgBlock,
-						128);
+	cudaMemcpyToSymbol(groestlcoin_gpu_msg, msgBlock, 128);
 
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	cudaMemcpyToSymbol( pTarget,
-						pTargetIn,
-						sizeof(uint32_t) * 8 );
+	cudaMemcpyToSymbol(pTarget, pTargetIn, 32);
 }
 
 __host__
-void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
+void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce)
 {
 	uint32_t threadsperblock = 256;
 
@@ -144,13 +143,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
 	// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
 	int factor = 4;
 
-		// berechne wie viele Thread Blocks wir brauchen
+	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
-	size_t shared_size = 0;
-
 	int dev_id = device_map[thr_id];
 	if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
 		gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
@@ -158,10 +154,10 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
 	}
 
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+	groestlcoin_gpu_hash_quad <<<grid, block>>> (threads, startNounce, d_resultNonce[thr_id]);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
-	MyStreamSynchronize(NULL, 0, thr_id);
+	// MyStreamSynchronize(NULL, 0, thr_id);
 
-	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(resNonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
diff --git a/cuda_groestlcoin.h b/cuda_groestlcoin.h
index d4fa42e..e38f086 100644
--- a/cuda_groestlcoin.h
+++ b/cuda_groestlcoin.h
@@ -4,6 +4,6 @@
 void groestlcoin_cpu_init(int thr_id, uint32_t threads);
 void groestlcoin_cpu_free(int thr_id);
 void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
-void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
+void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce);
 
 #endif
\ No newline at end of file
diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu
index 7598228..c8f123f 100644
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@@ -18,7 +18,7 @@
 
 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
-uint32_t *d_outputHashes[MAX_GPUS];
+static uint32_t *d_outputHashes[MAX_GPUS];
 static uint32_t *d_resultNonce[MAX_GPUS];
 
 __constant__ uint32_t myriadgroestl_gpu_msg[32];
@@ -225,8 +225,9 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
 		message[k] = SWAB32(hash[k]);
 }
 
-__global__ void __launch_bounds__(256, 4)
- myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
+__global__
+__launch_bounds__(256, 4)
+void myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
 {
 #if __CUDA_ARCH__ >= 300
 	// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
@@ -263,8 +264,8 @@ __global__ void __launch_bounds__(256, 4)
 #endif
 }
 
-__global__ void
- myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
+__global__
+void myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
 {
 #if __CUDA_ARCH__ >= 300
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -322,6 +323,9 @@ void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
 						myr_sha256_cpu_constantTable,
 						sizeof(uint32_t) * 64 );
 
+	// to check if the binary supports SM3+
+	cuda_get_arch(thr_id);
+
 	cudaMalloc(&d_outputHashes[thr_id], (size_t) 64 * threads);
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }
@@ -379,7 +383,7 @@ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce,
 	myriadgroestl_gpu_hash_quad2 <<< grid2, block >>> (threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
-	MyStreamSynchronize(NULL, 0, thr_id);
+	//MyStreamSynchronize(NULL, 0, thr_id);
 
 	cudaMemcpy(resNounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
diff --git a/groestlcoin.cpp b/groestlcoin.cpp
index 1b0ec4e..96a6d38 100644
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@@ -36,8 +36,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 	uint32_t throughput = cuda_default_throughput(thr_id, 1 << 19); // 256*256*8
 	if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
 
-	uint32_t *outputHash = (uint32_t*)malloc((size_t) 64* throughput);
-
 	if (opt_benchmark)
 		ptarget[7] = 0x001f;
 
@@ -66,7 +64,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 		*hashes_done = pdata[19] - start_nonce + throughput;
 
 		// GPU hash
-		groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
+		groestlcoin_cpu_hash(thr_id, throughput, pdata[19], &foundNounce);
 
 		if (foundNounce < UINT32_MAX && bench_algo < 0)
 		{
@@ -77,9 +75,8 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
 				work_set_target_ratio(work, vhash);
 				pdata[19] = foundNounce;
-				free(outputHash);
 				return true;
-			} else {
+			} else if (vhash[7] > ptarget[7]) {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
 			}
 		}
@@ -93,8 +90,6 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 	} while (!work_restart[thr_id].restart);
 
 	*hashes_done = pdata[19] - start_nonce;
-
-	free(outputHash);
 	return 0;
 }