correct implementation of gostd

2025-08-26 05:42:02 +00:00 · 2017-04-25 10:47:02 -04:00 · 2017-04-25 10:47:02 -04:00 · 879dc8a9ca
commit 879dc8a9ca
parent 449aeb2282
4 changed files with 44 additions and 85 deletions
--- a/bench.cpp
+++ b/bench.cpp
@@ -76,6 +76,7 @@ void algo_free_all(int thr_id)
 	free_skein2(thr_id);
 	free_sha256d(thr_id);
 	free_sha256t(thr_id);
+	free_gostd(thr_id);
 	free_sia(thr_id);
 	free_sib(thr_id);
 	free_s3(thr_id);
--- a/gost/cuda_gosthash.cu
+++ b/gost/cuda_gosthash.cu
@@ -1068,35 +1068,6 @@ void GOST_hash_X(uint64_t *hash, uchar * const message, uint64_t len)
 	GOST_g_N(hash, Sigma, M);
 }

-__global__
-__launch_bounds__(128, 3)
-void streebog_gpu_hash_64(uint32_t threads, uint64_t *g_hash) // 80 bytes input
-{
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint64_t* inout = (&g_hash[thread * 8U]);
-		uint64_t hash[8] = { 0 }; //iv 
-		GOST_hash_X(hash, (uchar*) inout, 640);
-		GOST_Copy512(inout, hash);
-	}
-}
-
-__global__
-__launch_bounds__(128, 3)
-void streebog_gpu_hash_32(uint32_t threads, uint64_t *g_hash) // 64 bytes input
-{
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint64_t* inout = (&g_hash[thread * 8U]);
-		uint64_t hash[8];
-		memset (&hash, 1, 64); // iv
-		GOST_hash_X(hash, (uchar*) inout, 512);
-		GOST_Copy256(inout, hash);
-	}
-}
-
 __global__
 /*__launch_bounds__(256,3)*/
 void gostd_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
@@ -1128,26 +1099,6 @@ void gostd_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32
 	}
 }

-__host__
-void gost_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
-{
-	const int threadsperblock = 128;
-	dim3 grid((threads + threadsperblock-1) / threadsperblock);
-	dim3 block(threadsperblock);
-
-	streebog_gpu_hash_64<<<grid, block>>>(threads, (uint64_t*)d_hash);
-}
-
-__host__
-void gost_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash)
-{
-	const int threadsperblock = 128;
-	dim3 grid((threads + threadsperblock-1) / threadsperblock);
-	dim3 block(threadsperblock);
-
-	streebog_gpu_hash_32<<<grid, block>>>(threads, (uint64_t*)d_hash);
-}
-
 __host__
 void gostd_init(int thr_id)
 {
--- a/gost/gost.cu
+++ b/gost/gost.cu
@@ -9,9 +9,6 @@ extern "C" {
 #include <memory.h>

 #define NBN 2
-static uint32_t *d_hash[MAX_GPUS];
-static uint32_t *d_resNonce[MAX_GPUS];
-static uint32_t *h_resNonce[MAX_GPUS];

 // GOST CPU Hash
 extern "C" void gosthash(void *output, const void *input)
@@ -32,28 +29,28 @@ extern "C" void gosthash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-extern void gost_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
-extern void gost_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash);

 //#define _DEBUG
 #define _DEBUG_PREFIX "gost"
 #include "cuda_debug.cuh"

 static bool init[MAX_GPUS] = { 0 };
+extern void gostd_init(int thr_id);
+extern void gostd_free(int thr_id);
+extern void gostd_setBlock_80(uint32_t *pdata, uint32_t *ptarget);
+extern void gostd_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces);

 extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
+	uint32_t _ALIGN(64) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
-	const int dev_id = device_map[thr_id];
-	int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 19 : 18; // 2^18 = 262144 cuda threads
-	if (device_sm[dev_id] >= 600) intensity = 20;
-	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
-	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 25);
+	if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce));

 	if (opt_benchmark)
-		ptarget[7] = 0xf;
+		((uint32_t*)ptarget)[7] = 0x03;

 	if (!init[thr_id])
 	{
@@ -66,50 +63,44 @@ extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce,
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 80 * throughput), -1);
-
-		cuda_check_cpu_init(thr_id, throughput);
+		gostd_init(thr_id);

 		init[thr_id] = true;
 	}

-	for (int k=0; k < 20; k++)
-		be32enc(&d_hash[thr_id][k], pdata[k]);
+	for (int k=0; k < 19; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	gostd_setBlock_80(endiandata, ptarget);

 	do {
-		int order = 0;
-
 		// Hash with CUDA
-		gost_hash_64(thr_id, throughput, d_hash[thr_id]);
-		TRACE("gost64   :");
-		gost_hash_32(thr_id, throughput, d_hash[thr_id]);
-		TRACE("gost32   :");
+		*hashes_done = pdata[19] - first_nonce + throughput;

-		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		gostd_hash_80(thr_id, throughput, pdata[19], work->nonces);
 		if (work->nonces[0] != UINT32_MAX)
 		{
-			const uint32_t Htarg = ptarget[7];
 			uint32_t _ALIGN(64) vhash[8];
-			be32enc(&d_hash[thr_id][19], work->nonces[0]);
-			gosthash(vhash, d_hash[thr_id]);

-			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+			endiandata[19] = swab32(work->nonces[0]);
+			gosthash(vhash, endiandata);
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
 				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
-				work->nonces[1] =cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				*hashes_done = pdata[19] - first_nonce + throughput;
-				if (work->nonces[1] != 0) {
-					be32enc(&d_hash[thr_id][19], work->nonces[1]);
-					sibhash(vhash, d_hash[thr_id]);
-					bn_set_target_ratio(work, vhash, 1);
-					work->valid_nonces++;
+				if (work->nonces[1] != UINT32_MAX) {
+					endiandata[19] = swab32(work->nonces[1]);
+					gosthash(vhash, endiandata);
+					if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+						work->valid_nonces++;
+						bn_set_target_ratio(work, vhash, 1);
+					}
 					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
 				} else {
-					pdata[19] = work->nonces[0] + 1; // cursor
+					pdata[19] = work->nonces[0] + 1;
 				}
 				return work->valid_nonces;
 			}
-			else if (vhash[7] > Htarg) {
+			else if (vhash[7] > ptarget[7]) {
 				gpu_increment_reject(thr_id);
 				if (!opt_quiet)
 					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
@@ -122,6 +113,7 @@ extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce,
 			pdata[19] = max_nonce;
 			break;
 		}
+
 		pdata[19] += throughput;

 	} while (!work_restart[thr_id].restart);
@@ -131,3 +123,17 @@ extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce,
 	return 0;
 }

+// cleanup
+extern "C" void free_gostd(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaThreadSynchronize();
+
+	gostd_free(thr_id);
+
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/miner.h
+++ b/miner.h
@@ -358,6 +358,7 @@ extern void free_quark(int thr_id);
 extern void free_qubit(int thr_id);
 extern void free_sha256d(int thr_id);
 extern void free_sha256t(int thr_id);
+extern void free_gostd(int thr_id);
 extern void free_sia(int thr_id);
 extern void free_sib(int thr_id);
 extern void free_skeincoin(int thr_id);