@@ -1,3 +1,8 @@
/**
 * JH512 64 and 80 kernels
 *
 * JH80 by tpruvot - 2017 - under GPLv3
 **/
#include <cuda_helper.h>

// #include <stdio.h> // printf
@@ -335,7 +340,7 @@ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
// Setup function
__host__ void quark_jh512_cpu_init(int thr_id, uint32_t threads) {}

#define WANT_JH80
#define WANT_JH80_MIDSTATE
#ifdef WANT_JH80

__constant__
@@ -417,3 +422,87 @@ void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t
}
#endif

#ifdef WANT_JH80_MIDSTATE

__constant__ static uint32_t c_JHState[32];
__constant__ static uint32_t c_Message[4];
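// c_JHState: JH-512 state after compressing the first 64 header bytes
// (uploaded by jh512_setBlock_80 below); c_Message: the remaining 16 header
// bytes, whose last word is replaced per thread by the nonce.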

__global__
void jh512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// 1 (precomputed state)
		uint32_t x[8][4];
		AS_UINT4(&x[0][0]) = AS_UINT4(&c_JHState[ 0]);
		AS_UINT4(&x[1][0]) = AS_UINT4(&c_JHState[ 4]);
		AS_UINT4(&x[2][0]) = AS_UINT4(&c_JHState[ 8]);
		AS_UINT4(&x[3][0]) = AS_UINT4(&c_JHState[12]);

		AS_UINT4(&x[4][0]) = AS_UINT4(&c_JHState[16]);
		AS_UINT4(&x[5][0]) = AS_UINT4(&c_JHState[20]);
		AS_UINT4(&x[6][0]) = AS_UINT4(&c_JHState[24]);
		AS_UINT4(&x[7][0]) = AS_UINT4(&c_JHState[28]);

		// 2 (16 bytes with nonce)
		uint32_t h[4];
		AS_UINT2(&h[0]) = AS_UINT2(&c_Message[0]);
		h[2] = c_Message[2];
		h[3] = cuda_swab32(startNounce + thread);
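		// nonce goes into the last header word, byte-swapped to match the header encoding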

		#pragma unroll
		for (int i = 0; i < 4; i++)
			x[0][i] ^= h[i];
		x[1][0] ^= 0x80U;
		E8(x);
		#pragma unroll
		for (int i = 0; i < 4; i++)
			x[4][i] ^= h[i];
		x[5][0] ^= 0x80U;
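		// (JH F8 pattern: the padded block is XORed into the low half of the state before E8 and into the high half afterwards)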

		// 3 close
		x[3][3] ^= 0x80020000U; // 80 bytes = 640 bits (0x280)
		E8(x);
		x[7][3] ^= 0x80020000U;

		uint32_t *Hash = &g_outhash[(size_t)16 * thread];
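		// JH-512 digest = the last 512 bits of the state, i.e. x[4..7]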
		AS_UINT4(&Hash[ 0]) = AS_UINT4(&x[4][0]);
		AS_UINT4(&Hash[ 4]) = AS_UINT4(&x[5][0]);
		AS_UINT4(&Hash[ 8]) = AS_UINT4(&x[6][0]);
		AS_UINT4(&Hash[12]) = AS_UINT4(&x[7][0]);
	}
}

__host__
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
	const uint32_t threadsperblock = 256;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	jh512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, d_hash);
}

extern "C" {
#undef SPH_C32
#undef SPH_T32
#undef SPH_C64
#undef SPH_T64
#include <sph/sph_jh.h>
}

__host__
void jh512_setBlock_80(int thr_id, uint32_t *endiandata)
{
	sph_jh512_context ctx_jh;

	sph_jh512_init(&ctx_jh);
	sph_jh512(&ctx_jh, endiandata, 64);
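	// ctx_jh.H now holds the state after the first 64-byte block (the midstate)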

	cudaMemcpyToSymbol(c_JHState, ctx_jh.H.narrow, 128, 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(c_Message, &endiandata[16], sizeof(c_Message), 0, cudaMemcpyHostToDevice);
}

#endif
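
#if 0
/* Illustrative call sequence only (not part of this file): a ccminer-style
 * scanhash loop would precompute the midstate once per work item, then launch
 * the 80-byte kernel for each nonce range. The names thr_id, throughput,
 * first_nonce, endiandata (20 byte-swapped header words) and d_hash are
 * assumptions for this sketch. */
	jh512_setBlock_80(thr_id, endiandata);                        // host: midstate + 16-byte tail
	jh512_cuda_hash_80(thr_id, throughput, first_nonce, d_hash);  // device: one JH-512 per nonce
#endif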