@@ -14,6 +14,10 @@
typedef unsigned char uchar;
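
// per-GPU result buffer: slot 0 receives the most recently found nonce, slot 1 the previous one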
static uint32_t* d_resNonces[MAX_GPUS] = { 0 };

__constant__ static uint32_t __align__(8) c_header[19];
__constant__ static uint32_t __align__(8) c_target[2]; // up to 64 bits
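// (filled by gostd_setBlock_80 below: the 76 fixed header bytes and the top 64 bits of the target)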

//#define FULL_UNROLL

#define memcpy(dst, src, len) { \
@@ -1093,6 +1097,37 @@ void streebog_gpu_hash_32(uint32_t threads, uint64_t *g_hash) // 64 bytes input
	}
}
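
// gostd: double GOST R 34.11-2012 (Streebog) hash of the 80-byte header --
// Streebog-512 of the header, then Streebog-256 of that 64-byte digest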
__global__
/*__launch_bounds__(256,3)*/
void gostd_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		const uint32_t nonce = startNonce + thread;
		uint32_t dat[20];
		#pragma unroll
		for (int i = 0; i < 19; i++) dat[i] = c_header[i];
		dat[19] = nonce;
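		// dat[] now holds the full 80-byte header with this thread's nonce as the last word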
		uint64_t hash1[8] = { 0 }; // IV for the 512-bit variant
		GOST_hash_X(hash1, (uchar*)dat, 640); // 80 bytes = 640 bits
		uint64_t hash[8];
		#pragma unroll
		for (int i = 0; i < 64; i++) ((uchar *)hash)[i] = 1; // IV for the 256-bit variant
		GOST_hash_X(hash, (uchar *)hash1, 512); // 64 bytes = 512 bits
		// result is first 32 bytes of hash

		// check nonce
		uint64_t high = MAKE_ULONGLONG(cuda_swab32(_LODWORD(hash[3])), cuda_swab32(_HIDWORD(hash[3]))); // swab uint64_t
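		// note: high is compared against c_target[0] only, i.e. the check effectively
		// assumes the most significant 32 bits of the share target are zero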
		if (high <= c_target[0])
		{
			//printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
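			// atomicExch stores the newest nonce in slot 0 and shifts the previous
			// slot 0 value into slot 1, so up to two candidates survive per launch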
			resNonces[1] = atomicExch(resNonces, nonce);
			//d_target[0] = high;
		}
	}
}

__host__
void gost_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
{
@@ -1112,3 +1147,46 @@ void gost_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash)
	streebog_gpu_hash_32<<<grid, block>>>(threads, (uint64_t*)d_hash);
}

__host__
void gostd_init(int thr_id)
{
	cuda_get_arch(thr_id);
	CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t)));
}

__host__
void gostd_free(int thr_id)
{
	if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]);
	d_resNonces[thr_id] = NULL;
}

__host__
void gostd_setBlock_80(uint32_t *pdata, uint32_t *ptarget)
{
	uint32_t buf[19];
	for (int i = 0; i < 19; i++) buf[i] = cuda_swab32(pdata[i]);
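
	// only the 76 fixed header bytes go to the device; the nonce (word 19)
	// is filled in per-thread by the kernel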
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_header, buf, 76, 0, cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
}

__host__
void gostd_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces)
{
	const uint32_t threadsperblock = 256;
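
	// integer division: assumes threads is a multiple of threadsperblock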
	dim3 grid(threads/threadsperblock);
	dim3 block(threadsperblock);

	CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t)));
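	// both slots now read 0xFFFFFFFF (UINT32_MAX), the "no nonce found" marker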
	cudaDeviceSynchronize();
	gostd_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_resNonces[thr_id]);
	cudaDeviceSynchronize();

	CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
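	// equal slots means no distinct second nonce was found; invalidate slot 1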
	if (resNonces[0] == resNonces[1]) {
		resNonces[1] = UINT32_MAX;
	}
}
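
// Hypothetical usage sketch (assumption, not part of this change): one way a host
// scan loop could drive the entry points above. first_nonce, max_nonce and
// throughput are illustrative parameters, not symbols defined in this file.
static void example_scan_gostd(int thr_id, uint32_t *pdata, uint32_t *ptarget,
                               uint32_t first_nonce, uint32_t max_nonce, uint32_t throughput)
{
	uint32_t found[2];
	gostd_init(thr_id);                // allocate the per-GPU result buffer
	gostd_setBlock_80(pdata, ptarget); // upload header words + top 64 bits of target
	for (uint32_t nonce = first_nonce; nonce < max_nonce; nonce += throughput) {
		gostd_hash_80(thr_id, throughput, nonce, found);
		if (found[0] != UINT32_MAX)
			break; // found[0] is a candidate share; found[1] may hold a second one
	}
	gostd_free(thr_id);
}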