keccak: not compatible with second nonces (was broken)

Use djm34 new uint2 method to get a +40% boost (115 to 153MH/s)
10 years ago · ef8a73d6aa
4 changed files with 51 additions and 48 deletions
--- a/Algo256/cuda_keccak256.cu
+++ b/Algo256/cuda_keccak256.cu
@ -7,6 +7,12 @@ extern "C" {
				@@ -7,6 +7,12 @@ extern "C" {

 #include "cuda_helper.h"

+#ifdef _MSC_VER
+#define UINT2(x,y) { x, y }
+#else
+#define UINT2(x,y) (uint2) { x, y }
+#endif
+
 static const uint64_t host_keccak_round_constants[24] = {
 	0x0000000000000001ull, 0x0000000000008082ull,
 	0x800000000000808aull, 0x8000000080008000ull,
@ -97,7 +103,7 @@ static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
				@@ -97,7 +103,7 @@ static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
 		s[0] ^= vectorize(keccak_round_constants[i]);
 	}
 }
-#endif
+#else

 __device__ __forceinline__
 static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
@ -167,43 +173,52 @@ static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
				@@ -167,43 +173,52 @@ static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
 		s[0] ^= keccak_round_constants[i];
 	}
 }
+#endif

-__global__
+__global__ __launch_bounds__(128,5)
 void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce + thread;
-		uint64_t keccak_gpu_state[25];

-		//#pragma unroll 25
+#if __CUDA_ARCH__ >= 350
+		uint2 keccak_gpu_state[25];
+		#pragma unroll 25
 		for (int i=0; i<25; i++) {
-			if (i < 9)
-				keccak_gpu_state[i] = c_PaddedMessage80[i];
-			else
-				keccak_gpu_state[i] = 0;
+			if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]);
+			else     keccak_gpu_state[i] = UINT2(0, 0);
+		}
+
+		keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]);
+		keccak_gpu_state[9].y = cuda_swab32(nounce);
+		keccak_gpu_state[10] = UINT2(1, 0);
+		keccak_gpu_state[16] = UINT2(0, 0x80000000);
+
+		keccak_blockv35(keccak_gpu_state,keccak_round_constants);
+		if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) {resNounce[0] = nounce;}
+#else
+		uint64_t keccak_gpu_state[25];
+		#pragma unroll 25
+		for (int i=0; i<25; i++) {
+			if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i];
+			else     keccak_gpu_state[i] = 0;
 		}
 		keccak_gpu_state[9]  = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
 		keccak_gpu_state[10] = 0x0000000000000001;
 		keccak_gpu_state[16] = 0x8000000000000000;

 		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
-
-		bool rc = false;
-		if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;}
-
-		if (rc == true) {
-			if(resNounce[0] > nounce)
-				resNounce[0] = nounce;
-		}
+		if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; }
+#endif
 	}
 }

 __host__
 uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
 {
-	uint32_t result = 0xffffffff;
+	uint32_t result = UINT32_MAX;
 	cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
 	const int threadsperblock = 128;

@ -222,12 +237,6 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui
				@@ -222,12 +237,6 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui
 	return result;
 }

-#ifdef _MSC_VER
-#define UINT2(a, b) { a, b }
-#else
-#define UINT2(a, b) (uint2) { a, b }
-#endif
-
 __global__ __launch_bounds__(256,3)
 void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
@ -238,18 +247,16 @@ void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHa
				@@ -238,18 +247,16 @@ void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHa
 		uint2 keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i = 0; i<25; i++) {
-			if (i < 4)
-				keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
-			else
-				keccak_gpu_state[i] = UINT2(0, 0);
+			if (i<4) keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
+			else     keccak_gpu_state[i] = UINT2(0, 0);
 		}
 		keccak_gpu_state[4]  = UINT2(1, 0);
 		keccak_gpu_state[16] = UINT2(0, 0x80000000);
 		keccak_blockv35(keccak_gpu_state, keccak_round_constants);

 		#pragma unroll 4
-		for (int i=0; i<4;i++)
-			outputHash[i*threads+thread]=devectorize(keccak_gpu_state[i]);
+		for (int i=0; i<4; i++)
+			outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]);
 #else
 		uint64_t keccak_gpu_state[25];
 		#pragma unroll 25
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@ -41,13 +41,12 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
				@@ -41,13 +41,12 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
+	int throughput = opt_work_size ? opt_work_size : (1 << 21); // 256*256*8*4
+	throughput = min(throughput, (int)(max_nonce - first_nonce));

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0005;

-	int throughput = opt_work_size ? opt_work_size : (1 << 21); // 256*256*8*4
-	throughput = min(throughput, (int)(max_nonce - first_nonce));
-
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);

@ -75,15 +74,9 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
				@@ -75,15 +74,9 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 			keccak256_hash(vhash64, endiandata);

 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				*hashes_done = pdata[19] - first_nonce + throughput;
-				if (secNonce != 0) {
-					pdata[21] = secNonce;
-					res++;
-				}
+				*hashes_done = foundNonce - first_nonce + 1;
 				pdata[19] = foundNonce;
-				return res;
+				return 1;
 			}
 			else {
 				applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
--- a/Makefile.am
+++ b/Makefile.am
@ -79,15 +79,12 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
				@@ -79,15 +79,12 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
 .cu.o:
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=128 -o $@ -c $<

-blake32.o: blake32.cu
+Algo256/blake256.o: Algo256/blake256.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<

 heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<

-keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
-	$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
-
 qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<

--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -1100,9 +1100,11 @@ static void *miner_thread(void *userdata)
				@@ -1100,9 +1100,11 @@ static void *miner_thread(void *userdata)
 			case ALGO_BLAKE:
 				minmax = 0x80000000U;
 				break;
+			case ALGO_KECCAK:
+				minmax = 0x40000000U;
+				break;
 			case ALGO_DOOM:
 			case ALGO_JACKPOT:
-			case ALGO_KECCAK:
 			case ALGO_LUFFA_DOOM:
 				minmax = 0x2000000;
 				break;
@ -1157,6 +1159,9 @@ static void *miner_thread(void *userdata)
				@@ -1157,6 +1159,9 @@ static void *miner_thread(void *userdata)
 			}
 		}
 #endif
+		if (opt_algo == ALGO_KECCAK && max64 == UINT32_MAX) {
+			max64 = 0x7FFFFFFFUL;
+		}
 		/* never let small ranges at end */
 		if (end_nonce >= UINT32_MAX - 256)
 			end_nonce = UINT32_MAX;
@ -1406,9 +1411,10 @@ out:
				@@ -1406,9 +1411,10 @@ out:

 static void restart_threads(void)
 {
-	int i;
+	if (opt_debug)
+		applog(LOG_DEBUG,"%s", __FUNCTION__);

-	for (i = 0; i < opt_n_threads; i++)
+	for (int i = 0; i < opt_n_threads; i++)
 		work_restart[i].restart = 1;
 }