From 2113be6eecddaee1573c108b666e7a508193466c Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 24 Apr 2015 14:12:21 +0200
Subject: [PATCH] blake80: some changes and launch bounds, no perf changes

---
 cuda_nist5.cu                |   8 +-
 quark/cuda_quark_blake512.cu | 138 +++++++++++++++--------------------
 quark/quarkcoin.cu           |   8 +-
 x11/x11.cu                   |   9 +--
 x13/x13.cu                   |   9 +--
 x15/x14.cu                   |   8 +-
 x15/x15.cu                   |   8 +-
 x17/x17.cu                   |   8 +-
 8 files changed, 88 insertions(+), 108 deletions(-)

diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index e691615..8287ca1 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -14,8 +14,8 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
-extern void quark_blake512_cpu_setBlock_80(void *pdata);
-extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
+extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -99,14 +99,14 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	quark_blake512_cpu_setBlock_80((void*)endiandata);
+	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
 		int order = 0;
 
 		// Hash with CUDA
-		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index 24d2975..ca335db 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -7,43 +7,41 @@
 
 #define USE_SHUFFLE 0
 
-__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
+__constant__
+static uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 
 // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
 
-__constant__ uint8_t c_sigma[16][16];
-
-const uint8_t host_sigma[16][16] =
-{
-  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-  {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-  {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-  { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-  {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
-  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
-};
+__device__ __constant__
+static const uint8_t c_sigma[16][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }};
 
 __device__ __constant__
-const uint64_t c_u512[16] =
+static const uint64_t c_u512[16] =
 {
-  0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 
-  0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
-  0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 
-  0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
-  0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 
-  0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
-  0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 
-  0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
+	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
+	0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
+	0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
+	0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
+	0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
+	0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
+	0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
+	0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
 };
 
 #define G(a,b,c,d,x) { \
@@ -111,14 +109,8 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
 // Hash-Padding
 __device__ __constant__
 static const uint64_t d_constHashPadding[8] = {
-	0x0000000000000080ull,
-	0,
-	0,
-	0,
-	0,
-	0x0100000000000000ull,
-	0,
-	0x0002000000000000ull
+	0x0000000000000080ull, 0, 0, 0,
+	0, 0x0100000000000000ull, 0, 0x0002000000000000ull
 };
 
 __global__ __launch_bounds__(256, 4)
@@ -171,14 +163,14 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
 
 #if __CUDA_ARCH__ <= 350
-		uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
+		uint32_t *outHash = (uint32_t*)&g_hash[hashPosition * 8U];
 		#pragma unroll 8
 		for (int i=0; i < 8; i++) {
 			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		uint64_t *outHash = &g_hash[8 * hashPosition];
+		uint64_t *outHash = &g_hash[hashPosition * 8U];
 		for (int i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64(h[i]);
 		}
@@ -186,13 +178,20 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
 	}
 }
 
-__global__ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
+__global__ __launch_bounds__(256,4)
+void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t buf[16];
-		uint32_t nounce = startNounce + thread;
+		#pragma unroll
+		for (int i=0; i < 16; ++i)
+			buf[i] = c_PaddedMessage80[i];
+
+		// The test Nonce
+		const uint32_t nounce = startNounce + thread;
+		((uint32_t*)buf)[19] = cuda_swab32(nounce);
 
 		uint64_t h[8] = {
 			0x6a09e667f3bcc908ULL,
@@ -205,30 +204,21 @@ __global__ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounc
 			0x5be0cd19137e2179ULL
 		};
 
-		// Message für die erste Runde in Register holen
-		#pragma unroll 16
-		for (int i=0; i < 16; ++i)
-			buf[i] = c_PaddedMessage80[i];
-
-		// The test Nonce
-		((uint32_t*)buf)[19] = cuda_swab32(nounce);
-
-		quark_blake512_compress( h, buf, c_sigma, c_u512, 640 );
+		quark_blake512_compress(h, buf, c_sigma, c_u512, 640);
 
 #if __CUDA_ARCH__ <= 350
-		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
+		uint32_t *outHash = (uint32_t*)outputHash + (thread * 16U);
 		#pragma unroll 8
 		for (uint32_t i=0; i < 8; i++) {
 			outHash[2*i]   = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
+		uint64_t *outHash = (uint64_t*)outputHash + (thread * 8U);
 		for (uint32_t i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64( h[i] );
 		}
 #endif
-
 	}
 }
 
@@ -238,30 +228,24 @@ __global__ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounc
 __host__
 void quark_blake512_cpu_init(int thr_id, uint32_t threads)
 {
-	// Kopiere die Hash-Tabellen in den GPU-Speicher
-	CUDA_CALL_OR_RET( cudaMemcpyToSymbol(c_sigma,
-						host_sigma,
-						sizeof(host_sigma),
-						0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaGetLastError());
 }
 
-// Blake512 für 80 Byte grosse Eingangsdaten
 __host__
-void quark_blake512_cpu_setBlock_80(void *pdata)
+void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata)
 {
-	// Message mit Padding bereitstellen
-	// lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen.
-	unsigned char PaddedMessage[128];
-	memcpy(PaddedMessage, pdata, 80);
-	memset(PaddedMessage+80, 0, 48);
-	PaddedMessage[80] = 0x80;
-	PaddedMessage[111] = 1;
-	PaddedMessage[126] = 0x02;
-	PaddedMessage[127] = 0x80;
-
-	CUDA_SAFE_CALL(
-		cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)
-	);
+	uint64_t message[16];
+
+	memcpy(message, pdata, 80);
+	message[10] = 0x80;
+	message[11] = 0;
+	message[12] = 0;
+	message[13] = 0x0100000000000000ull;
+	message[14] = 0;
+	message[15] = 0x8002000000000000ull; // 0x280
+
+	cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice);
+	CUDA_SAFE_CALL(cudaGetLastError());
 }
 
 __host__
@@ -269,18 +253,16 @@ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
 {
 	const uint32_t threadsperblock = 256;
 
-	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
 	quark_blake512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
 
-	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	//MyStreamSynchronize(NULL, order, thr_id);
 }
 
 __host__
-void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
 {
 	const uint32_t threadsperblock = 256;
 
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index 2fda2da..8f0d4d8 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -20,8 +20,8 @@ static uint32_t *d_branch2Nonces[MAX_GPUS];
 static uint32_t *d_branch3Nonces[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
-extern void quark_blake512_cpu_setBlock_80(void *pdata);
-extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
+extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
@@ -170,7 +170,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	quark_blake512_cpu_setBlock_80((void*)endiandata);
+	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
@@ -178,7 +178,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		size_t nrm1=0, nrm2=0, nrm3=0;
 
 		// erstes Blake512 Hash mit CUDA
-		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 
 		// das ist der unbedingte Branch fÃ¼r BMW512
 		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
diff --git a/x11/x11.cu b/x11/x11.cu
index fce4997..cfef513 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -23,8 +23,8 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
-extern void quark_blake512_cpu_setBlock_80(void *pdata);
-extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
+extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
 extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -181,8 +181,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	quark_blake512_cpu_setBlock_80((void*)endiandata);
-
+	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
@@ -190,7 +189,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		uint32_t foundNonce;
 
 		// Hash with CUDA
-		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 		TRACE("blake  :");
 		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		TRACE("bmw    :");
diff --git a/x13/x13.cu b/x13/x13.cu
index 77d01d7..40d107f 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -25,10 +25,9 @@ extern "C"
 
 static uint32_t *d_hash[MAX_GPUS];
 
-
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
-extern void quark_blake512_cpu_setBlock_80(void *pdata);
-extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
+extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
 extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -188,14 +187,14 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	quark_blake512_cpu_setBlock_80((void*)endiandata);
+	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
 		uint32_t foundNonce;
 		int order = 0;
 
-		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
diff --git a/x15/x14.cu b/x15/x14.cu
index abd398f..02a5b2d 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -30,8 +30,8 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
-extern void quark_blake512_cpu_setBlock_80(void *pdata);
-extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
+extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
 extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -197,12 +197,12 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 	for (int k = 0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	quark_blake512_cpu_setBlock_80((void*)endiandata);
+	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
 		int order = 0;
-		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
diff --git a/x15/x15.cu b/x15/x15.cu
index 2a220ec..d3a6c78 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -31,8 +31,8 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
-extern void quark_blake512_cpu_setBlock_80(void *pdata);
-extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
+extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
 extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -208,12 +208,12 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	quark_blake512_cpu_setBlock_80((void*)endiandata);
+	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
 		int order = 0;
-		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
diff --git a/x17/x17.cu b/x17/x17.cu
index 6aa74a3..6d7e071 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -34,8 +34,8 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
-extern void quark_blake512_cpu_setBlock_80(void *pdata);
-extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
+extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
 extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -230,14 +230,14 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	quark_blake512_cpu_setBlock_80((void*)endiandata);
+	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
 		int order = 0;
 
 		// Hash with CUDA
-		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);