@@ -7,14 +7,13 @@
 
 #define USE_SHUFFLE 0
 
-__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
+__constant__
+static uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 
 // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
 
-__constant__ uint8_t c_sigma[16][16];
-
-const uint8_t host_sigma[16][16] =
-{
+__device__ __constant__
+static const uint8_t c_sigma[16][16] = {
 	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
@@ -30,11 +29,10 @@ const uint8_t host_sigma[16][16] =
 	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
 	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
 	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
-};
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }};
 
 __device__ __constant__
-const uint64_t c_u512[16] =
+static const uint64_t c_u512[16] =
 {
 	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
 	0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
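
Note on the two hunks above: initializing c_sigma and c_u512 directly as __device__ __constant__ data bakes the tables into the module at compile time, which is what makes the cudaMemcpyToSymbol(c_sigma, host_sigma, ...) upload removed from quark_blake512_cpu_init() further down redundant. A minimal standalone sketch of the pattern, with hypothetical d_table/read_table names that are not part of this patch:

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// statically initialized: embedded in the module, no runtime upload needed
__device__ __constant__ uint8_t d_table[4] = { 3, 1, 2, 0 };

__global__ void read_table(uint8_t *out)
{
	out[threadIdx.x] = d_table[threadIdx.x];
}

int main()
{
	uint8_t *d_out, h_out[4];
	cudaMalloc(&d_out, 4);
	read_table<<<1, 4>>>(d_out); // works without any cudaMemcpyToSymbol()
	cudaMemcpy(h_out, d_out, 4, cudaMemcpyDeviceToHost);
	printf("%u %u %u %u\n", h_out[0], h_out[1], h_out[2], h_out[3]);
	cudaFree(d_out);
	return 0;
}
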
@@ -111,14 +109,8 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
 // Hash-Padding
 __device__ __constant__
 static const uint64_t d_constHashPadding[8] = {
-	0x0000000000000080ull,
-	0,
-	0,
-	0,
-	0,
-	0x0100000000000000ull,
-	0,
-	0x0002000000000000ull
+	0x0000000000000080ull, 0, 0, 0,
+	0, 0x0100000000000000ull, 0, 0x0002000000000000ull
 };
 
 __global__ __launch_bounds__(256, 4)
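
What these eight words encode: the fixed BLAKE-512 tail appended to a 64-byte input, i.e. the 0x80 padding byte, the 0x01 marker BLAKE-512 places just before the length, and the big-endian 512-bit (0x200) message length, all viewed as little-endian 64-bit words. In the kernel they fill buf[8..15], so they cover bytes 64..127 of the 128-byte block; the hunk itself only reflows the initializer. A hedged host-side check of that byte layout, assuming a little-endian host exactly as the kernel's word-wise view does:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	unsigned char tail[64] = { 0 };  /* bytes 64..127 of the padded block */
	tail[0]  = 0x80;                 /* first padding byte (block byte 64) */
	tail[47] = 0x01;                 /* pre-length marker (block byte 111) */
	tail[62] = 0x02;                 /* bit length 0x0200, big-endian (byte 126) */

	uint64_t words[8];
	memcpy(words, tail, 64);
	assert(words[0] == 0x0000000000000080ull);
	assert(words[5] == 0x0100000000000000ull);
	assert(words[7] == 0x0002000000000000ull);
	return 0;
}
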
@@ -171,14 +163,14 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
 
 #if __CUDA_ARCH__ <= 350
-		uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
+		uint32_t *outHash = (uint32_t*)&g_hash[hashPosition * 8U];
 		#pragma unroll 8
 		for (int i=0; i < 8; i++) {
 			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		uint64_t *outHash = &g_hash[8 * hashPosition];
+		uint64_t *outHash = &g_hash[hashPosition * 8U];
 		for (int i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64(h[i]);
 		}
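
Only the index expression changes here (hashPosition * 8U keeps the address arithmetic unsigned); the two #if branches were already equivalent, since storing swab32 of the high and low halves writes the same bytes as one swab64 store, and the 32-bit path presumably just favors 32-bit stores on sm_35 and older parts. A host-side sketch of that equivalence, substituting the GCC/Clang __builtin_bswap* builtins for the cuda_swab* device helpers:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint64_t h = 0x0123456789abcdefull;

	uint32_t out32[2];
	out32[0] = __builtin_bswap32((uint32_t)(h >> 32)); /* _HIWORD(h) */
	out32[1] = __builtin_bswap32((uint32_t)h);         /* _LOWORD(h) */

	uint64_t out64 = __builtin_bswap64(h);

	/* both paths produce the same eight output bytes */
	assert(memcmp(out32, &out64, 8) == 0);
	return 0;
}
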
@@ -186,13 +178,20 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
 	}
 }
 
-__global__ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
+__global__ __launch_bounds__(256,4)
+void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t buf[16];
-		uint32_t nounce = startNounce + thread;
+
+		#pragma unroll
+		for (int i=0; i < 16; ++i)
+			buf[i] = c_PaddedMessage80[i];
+
+		// The test Nonce
+		const uint32_t nounce = startNounce + thread;
+		((uint32_t*)buf)[19] = cuda_swab32(nounce);
 
 		uint64_t h[8] = {
 			0x6a09e667f3bcc908ULL,
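
The hunk hoists the message load and nonce patch ahead of the state initialization. ((uint32_t*)buf)[19] addresses bytes 76..79 of the 128-byte padded block, i.e. the nonce field of the 80-byte block header, and cuda_swab32() stores the nonce big-endian to match the rest of the pre-swabbed header. A small host sketch of that layout, with the swab open-coded and a little-endian host assumed:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint64_t buf[16] = { 0 };        /* the 128-byte padded message */
	uint32_t nounce = 0x12345678u;

	/* open-coded byte swap: what cuda_swab32() does on the device */
	uint32_t swabbed = (nounce << 24) | ((nounce << 8) & 0x00ff0000u)
	                 | ((nounce >> 8) & 0x0000ff00u) | (nounce >> 24);
	((uint32_t*)buf)[19] = swabbed;  /* same store as the kernel */

	unsigned char bytes[128];
	memcpy(bytes, buf, 128);
	assert(bytes[76] == 0x12 && bytes[79] == 0x78); /* big-endian at 76..79 */
	return 0;
}
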
@@ -205,30 +204,21 @@ __global__ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounc
 			0x5be0cd19137e2179ULL
 		};
 
-		// fetch the message for the first round into registers
-		#pragma unroll 16
-		for (int i=0; i < 16; ++i)
-			buf[i] = c_PaddedMessage80[i];
-
-		// The test Nonce
-		((uint32_t*)buf)[19] = cuda_swab32(nounce);
-
 		quark_blake512_compress(h, buf, c_sigma, c_u512, 640);
 
 #if __CUDA_ARCH__ <= 350
-		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
+		uint32_t *outHash = (uint32_t*)outputHash + (thread * 16U);
 		#pragma unroll 8
 		for (uint32_t i=0; i < 8; i++) {
 			outHash[2*i] = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
+		uint64_t *outHash = (uint64_t*)outputHash + (thread * 8U);
 		for (uint32_t i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64( h[i] );
 		}
 #endif
 	}
 }
@@ -238,30 +228,24 @@ __global__ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounc
 __host__
 void quark_blake512_cpu_init(int thr_id, uint32_t threads)
 {
-	// copy the hash tables into GPU memory
-	CUDA_CALL_OR_RET( cudaMemcpyToSymbol(c_sigma,
-						host_sigma,
-						sizeof(host_sigma),
-						0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaGetLastError());
 }
 
-// Blake512 for 80-byte input data
 __host__
-void quark_blake512_cpu_setBlock_80(void *pdata)
+void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata)
 {
-	// prepare the message with padding;
-	// only the correct nonce still has to be inserted, starting at byte 76.
-	unsigned char PaddedMessage[128];
-	memcpy(PaddedMessage, pdata, 80);
-	memset(PaddedMessage+80, 0, 48);
-	PaddedMessage[80] = 0x80;
-	PaddedMessage[111] = 1;
-	PaddedMessage[126] = 0x02;
-	PaddedMessage[127] = 0x80;
+	uint64_t message[16];
+	memcpy(message, pdata, 80);
+	message[10] = 0x80;
+	message[11] = 0;
+	message[12] = 0;
+	message[13] = 0x0100000000000000ull;
+	message[14] = 0;
+	message[15] = 0x8002000000000000ull; // 0x280
 
-	CUDA_SAFE_CALL(
-		cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)
-	);
+	cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice);
+	CUDA_SAFE_CALL(cudaGetLastError());
 }
 
 __host__
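
The rewritten quark_blake512_cpu_setBlock_80() produces byte-for-byte the same padded block as the old byte pokes, just written as whole 64-bit words: message[15] = 0x8002000000000000ull is the big-endian 640-bit (0x280) length as seen through a little-endian 64-bit word. A hedged host-side check of the equivalence, little-endian host assumed (as the original byte pokes assumed implicitly):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	unsigned char PaddedMessage[128] = { 0 };  /* old scheme: byte pokes */
	PaddedMessage[80]  = 0x80;
	PaddedMessage[111] = 1;
	PaddedMessage[126] = 0x02;
	PaddedMessage[127] = 0x80;

	uint64_t message[16] = { 0 };              /* new scheme: word stores */
	message[10] = 0x80;
	message[13] = 0x0100000000000000ull;
	message[15] = 0x8002000000000000ull;       /* 0x280 bits */

	/* the 48 padding bytes after the 80-byte header are identical */
	assert(memcmp(PaddedMessage + 80, (unsigned char*)message + 80, 48) == 0);
	return 0;
}
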
@@ -269,18 +253,16 @@ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
 {
 	const uint32_t threadsperblock = 256;
 
-	// compute how many thread blocks we need
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
 	quark_blake512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
 
-	// strategic sleep command to lower the CPU load
 	//MyStreamSynchronize(NULL, order, thr_id);
 }
 
 __host__
-void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
 {
 	const uint32_t threadsperblock = 256;
 
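
The grid size kept throughout these launchers, (threads + threadsperblock-1)/threadsperblock, is the usual integer ceiling division: a partial block is launched whenever threads is not a multiple of 256, and the in-kernel if (thread < threads) guard absorbs the overshoot. A trivial host check of the formula:

#include <assert.h>
#include <stdint.h>

/* integer ceil(n / d), as used for the CUDA grid dimension above */
static uint32_t ceil_div(uint32_t n, uint32_t d)
{
	return (n + d - 1) / d;
}

int main(void)
{
	assert(ceil_div(256, 256) == 1); /* exact multiple: no spare block */
	assert(ceil_div(257, 256) == 2); /* one straggler: one extra block */
	return 0;
}
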