
blake: final cleanup (225MH/s)

master
Tanguy Pruvot committed 10 years ago
commit 8925a7551f

1 changed file: blake32.cu (90 lines)
@@ -50,16 +50,19 @@ static uint32_t __align__(32) c_Target[8];
 __constant__
 static uint32_t __align__(32) c_data[20];
 
+/* 8 adapters max (-t threads) */
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
-static uint32_t extra_results[2] = { MAXU, MAXU };
+
+/* max count of found nounces in one call */
+#define NBN 2
+static uint32_t extra_results[NBN-1] = { MAXU };
 
 #define USE_CACHE 1
+/* midstate hash cache, this algo is run on 2 parts */
 #if USE_CACHE
-__device__
-static uint32_t cache[8];
-__device__
-static uint32_t prevsum = 0;
+__device__ static uint32_t cache[8];
+__device__ static uint32_t prevsum = 0;
 #endif
 
 /* prefer uint32_t to prevent size conversions = speed +5/10 % */
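
Note on the hunk above: NBN sets how many nounces one kernel call may report; slot 0 of the result buffer is returned to the caller and slots 1..NBN-1 are parked in extra_results[] for the next scan. A minimal host-side sketch of that convention (assumptions: MAXU is 0xffffffffU as elsewhere in ccminer; consume_results() is a hypothetical name mirroring the readback loop further down):

#include <stdint.h>
#include <stdio.h>

#define MAXU 0xffffffffU
#define NBN  2                        /* result slots per kernel call */

static uint32_t extra_results[NBN-1] = { MAXU };

/* hypothetical helper mirroring the readback loop in blake256_cpu_hash_80() */
static uint32_t consume_results(const uint32_t res[NBN])
{
    uint32_t first = res[0];          /* smallest nounce found, or MAXU if none */
    for (int n = 0; n < NBN-1; n++)
        extra_results[n] = res[n+1];  /* park any extra nounce for the next call */
    return first;
}

int main(void)
{
    uint32_t res[NBN] = { 0x00001000u, 0x00002000u };  /* as if the kernel found two */
    printf("first=%08x extra=%08x\n", (unsigned)consume_results(res), (unsigned)extra_results[0]);
    return 0;
}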
@@ -104,30 +107,6 @@ static const uint32_t __align__(32) c_u256[16] = {
     SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
 };
 
-#if 0
-#define GS(m0, m1, c0, c1, a, b, c, d) do { \
-    a = SPH_T32(a + b + (m0 ^ c1)); \
-    d = SPH_ROTR32(d ^ a, 16); \
-    c = SPH_T32(c + d); \
-    b = SPH_ROTR32(b ^ c, 12); \
-    a = SPH_T32(a + b + (m1 ^ c0)); \
-    d = SPH_ROTR32(d ^ a, 8); \
-    c = SPH_T32(c + d); \
-    b = SPH_ROTR32(b ^ c, 7); \
-} while (0)
-
-#define ROUND_S(r) do { \
-    GS(Mx(r, 0x0), Mx(r, 0x1), CSx(r, 0x0), CSx(r, 0x1), v[0], v[4], v[0x8], v[0xC]); \
-    GS(Mx(r, 0x2), Mx(r, 0x3), CSx(r, 0x2), CSx(r, 0x3), v[1], v[5], v[0x9], v[0xD]); \
-    GS(Mx(r, 0x4), Mx(r, 0x5), CSx(r, 0x4), CSx(r, 0x5), v[2], v[6], v[0xA], v[0xE]); \
-    GS(Mx(r, 0x6), Mx(r, 0x7), CSx(r, 0x6), CSx(r, 0x7), v[3], v[7], v[0xB], v[0xF]); \
-    GS(Mx(r, 0x8), Mx(r, 0x9), CSx(r, 0x8), CSx(r, 0x9), v[0], v[5], v[0xA], v[0xF]); \
-    GS(Mx(r, 0xA), Mx(r, 0xB), CSx(r, 0xA), CSx(r, 0xB), v[1], v[6], v[0xB], v[0xC]); \
-    GS(Mx(r, 0xC), Mx(r, 0xD), CSx(r, 0xC), CSx(r, 0xD), v[2], v[7], v[0x8], v[0xD]); \
-    GS(Mx(r, 0xE), Mx(r, 0xF), CSx(r, 0xE), CSx(r, 0xF), v[3], v[4], v[0x9], v[0xE]); \
-} while (0)
-#endif
-
 #define GS(a,b,c,d,x) { \
     const uint32_t idx1 = c_sigma[i][x]; \
     const uint32_t idx2 = c_sigma[i][x+1]; \
@@ -240,22 +219,21 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
         ending[3] = nounce; /* our tested value */
 
         blake256_compress(h, ending, 640, blakerounds);
-
-        int pos = -1;
-        #pragma unroll 8
+#if 0
         for (int i = 7; i >= 0; i--) {
             uint32_t hash = cuda_swab32(h[i]);
             if (hash > c_Target[i]) {
                 return;
             }
             if (hash < c_Target[i]) {
-                /* dont ask me why, we lose 8MH/s in perfs
-                   without the int variable */
-                if (pos < i) pos = i;
-                //break;
+                break;
             }
         }
-
+#else
+        /* do not test all parts, fulltest() will do it */
+        if (cuda_swab32(h[7]) <= c_Target[7])
+#endif
+#if NBN == 2
         /* keep the smallest nounce, + extra one if found */
         if (resNounce[0] > nounce) {
             resNounce[1] = resNounce[0];
@@ -263,6 +241,9 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
         }
         else
             resNounce[1] = nounce;
+#else
+        resNounce[0] = nounce;
+#endif
     }
 }
 
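Note on the two hunks above: under NBN == 2 the kernel keeps the smallest nounce in resNounce[0] and lets one extra candidate survive in resNounce[1]; the #else branch is the single-slot fallback. The same rule, written as a standalone host-side sketch (record_nonce() is a hypothetical name; the kernel does this inline):

#include <stdint.h>

/* hypothetical helper: same slot-update rule as the NBN == 2 branch above;
 * slot 0 starts at MAXU (the 0xff memset), so the first hit always takes it */
static void record_nonce(uint32_t resNounce[2], uint32_t nounce)
{
    if (resNounce[0] > nounce) {
        resNounce[1] = resNounce[0];  /* previous best is demoted to the extra slot */
        resNounce[0] = nounce;        /* smaller nounce wins slot 0 */
    } else {
        resNounce[1] = nounce;        /* larger nounce only survives as the extra */
    }
}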
@@ -277,15 +258,16 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
     size_t shared_size = 0;
 
     /* Check error on Ctrl+C or kill to prevent segfaults on exit */
-    if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
+    if (cudaMemset(d_resNounce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
         return result;
 
     blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id], blakerounds, crcsum);
     cudaDeviceSynchronize();
-    if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
-        cudaThreadSynchronize();
+    if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+        //cudaThreadSynchronize(); /* seems no more required */
         result = h_resNounce[thr_id][0];
-        extra_results[0] = h_resNounce[thr_id][1];
+        for (int n=0; n < (NBN-1); n++)
+            extra_results[n] = h_resNounce[thr_id][n+1];
     }
     return result;
 }
@@ -309,6 +291,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
     uint32_t crcsum = MAXU;
     int rc = 0;
 
+#if NBN > 1
     if (extra_results[0] != MAXU) {
         // possible extra result found in previous call
         if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
@@ -319,6 +302,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
             goto exit_scan;
         }
     }
+#endif
 
     if (opt_benchmark)
         ((uint32_t*)ptarget)[7] = 0x00000f;
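
Note on the #if NBN > 1 guard added above: it wraps the block that re-checks a nounce left over from the previous call before a new scan is launched; the middle of that block is not shown in this diff. A hedged sketch of the general pattern only, not the exact code (check_on_cpu() and try_previous_extra() are hypothetical stand-ins for the real CPU hash + target test):

#include <stdbool.h>
#include <stdint.h>

#define MAXU 0xffffffffU

/* hypothetical stand-in for the CPU-side blake256 hash + target comparison */
static bool check_on_cpu(uint32_t nounce)
{
    (void)nounce;
    return false;                     /* placeholder: always reject in this sketch */
}

/* reuse a nounce carried over from the previous call, if it is still in range */
static bool try_previous_extra(uint32_t *extra, uint32_t first_nonce,
                               uint32_t max_nonce, uint32_t *found)
{
    if (*extra == MAXU)
        return false;                 /* nothing was carried over */
    bool ok = (*extra >= first_nonce) && (*extra <= max_nonce) && check_on_cpu(*extra);
    if (ok)
        *found = *extra;              /* valid: return it without a new scan */
    *extra = MAXU;                    /* consumed (or stale): clear the slot */
    return ok;
}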
@@ -327,8 +311,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
         if (opt_n_threads > 1) {
             CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
         }
-        CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
-        CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
+        CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], NBN * sizeof(uint32_t)));
+        CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], NBN * sizeof(uint32_t)));
         init[thr_id] = true;
     }
 
@@ -375,14 +359,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
                     goto exit_scan;
                 }
-                else if (vhashcpu[7] > Htarg) {
-                    applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
-                }
-                else if (vhashcpu[6] > ptarget[6]) {
-                    applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]);
-                }
-                else {
-                    applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
+                else if (opt_debug) {
+                    applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
                 }
             }
 
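Note on the validation change above, and on the "fulltest() will do it" comment in the kernel: the GPU now filters on hash word 7 only, and the CPU side is trusted to do the full 256-bit comparison before a share is submitted. A simplified sketch of that word-by-word test in the style of cpuminer's fulltest() (logging omitted; the real helper lives elsewhere in the miner sources):

#include <stdbool.h>
#include <stdint.h>

/* sketch of a fulltest()-style check: compare hash against target,
 * most significant 32-bit word (index 7) first */
static bool fulltest_sketch(const uint32_t hash[8], const uint32_t target[8])
{
    for (int i = 7; i >= 0; i--) {
        if (hash[i] > target[i])
            return false;             /* hash above target: candidate rejected */
        if (hash[i] < target[i])
            return true;              /* hash below target: candidate accepted */
    }
    return true;                      /* hash exactly equals the target */
}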
@@ -397,15 +375,5 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 exit_scan:
     *hashes_done = pdata[19] - first_nonce + 1;
-#if 0
-    /* reset the device to allow multiple instances
-     * could be made in cpu-miner... check later if required */
-    if (opt_n_threads == 1) {
-        CUDA_SAFE_CALL(cudaDeviceReset());
-        init[thr_id] = false;
-    }
-#endif
-    // wait proper end of all threads
-    //cudaDeviceSynchronize();
     return rc;
 }
