From ed4927fcd0473925ae123ee54eb0ee5fbda81af2 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 30 May 2015 14:57:52 +0200
Subject: [PATCH] quark/x11: set signed int hashPosition vars to off_t

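groestl (and possibly keccak) seem to run faster when hashPosition is a
64-bit variable (off_t or int64_t). The bmw512_30 and skein512 kernels
switch from int to uint32_t instead.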
---
 quark/cuda_bmw512.cu                |  2 +-
 quark/cuda_bmw512_30.cu             |  6 +++---
 quark/cuda_jh512.cu                 |  5 ++---
 quark/cuda_quark_blake512.cu        | 10 +++-------
 quark/cuda_quark_groestl512.cu      | 18 ++++++++----------
 quark/cuda_quark_groestl512_sm20.cu |  6 +++---
 quark/cuda_quark_keccak512.cu       | 10 +++++-----
 quark/cuda_skein512.cu              |  2 +-
 8 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu
index 37c6497..bcf3c46 100644
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@@ -428,7 +428,7 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *
 		Compression512(h, message);
 
 		// fertig
-		uint64_t *outpHash = &g_hash[8 * thread];
+		uint64_t *outpHash = &g_hash[thread * 8];
 
 #pragma unroll 8
 		for(int i=0;i<8;i++)
diff --git a/quark/cuda_bmw512_30.cu b/quark/cuda_bmw512_30.cu
index d14795b..e0edf81 100644
--- a/quark/cuda_bmw512_30.cu
+++ b/quark/cuda_bmw512_30.cu
@@ -151,8 +151,8 @@ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_
 	{
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
-		int hashPosition = nounce - startNounce;
-		uint64_t *inpHash = &g_hash[8 * hashPosition];
+		uint32_t hashPosition = nounce - startNounce;
+		uint64_t *inpHash = &g_hash[hashPosition * 8];
 
 		// Init
 		uint64_t h[16];
@@ -205,7 +205,7 @@ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_
 
 		Compression512_30(h, message);
 
-		uint64_t *outpHash = &g_hash[8 * hashPosition];
+		uint64_t *outpHash = &g_hash[hashPosition * 8];
 
 		#pragma unroll 8
 		for(int i=0;i<8;i++)
diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu
index 623e598..1444c80 100644
--- a/quark/cuda_jh512.cu
+++ b/quark/cuda_jh512.cu
@@ -229,7 +229,6 @@ static __device__ __forceinline__ void E8(uint32_t x[8][4])
 	}
 }
 
-// Die Hash-Funktion
 __global__ __launch_bounds__(256, 4)
 void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector)
 {
@@ -237,8 +236,8 @@ void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *c
 	if (thread < threads)
 	{
 		const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-		const uint32_t hashPosition = nounce - startNounce;
-		uint32_t *const Hash = &g_hash[hashPosition * 16U];
+		uint32_t hashPosition = nounce - startNounce;
+		uint32_t *Hash = &g_hash[hashPosition * 16U];
 		uint32_t x[8][4] = {
 			{ 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a },
 			{ 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 },
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index a2454a0..c7f0df6 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -1,4 +1,4 @@
-#include <stdio.h>
+#include <stdio.h>
 #include <memory.h>
 
 #include "cuda_helper.h"
@@ -119,12 +119,8 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 
 #if USE_SHUFFLE
-	const int warpID = threadIdx.x & 0x0F; // 16 warps
-	const int warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke
-	const int maxHashPosition = thread<<3;
-#endif
+	const uint32_t warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke
 
-#if USE_SHUFFLE
 	if (warpBlockID < ( (threads+15)>>4 ))
 #else
 	if (thread < threads)
@@ -132,7 +128,7 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
 	{
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
-		int hashPosition = nounce - startNounce;
+		off_t hashPosition = nounce - startNounce;
 		uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
 
 		// 128 Bytes
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
index 6aa8645..1580b27 100644
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@@ -28,19 +28,19 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u
         uint32_t state[8];
 
         uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
-        int hashPosition = nounce - startNounce;
-        uint32_t *inpHash = &g_hash[hashPosition << 4];
+        off_t hashPosition = nounce - startNounce;
+        uint32_t *pHash = &g_hash[hashPosition << 4];
 
-        const uint16_t thr = threadIdx.x % THF;
+        const uint32_t thr = threadIdx.x % THF;
 
         #pragma unroll
-        for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];
+        for(int k=0;k<4;k++) message[k] = pHash[thr + (k * THF)];
 
         #pragma unroll
         for(int k=4;k<8;k++) message[k] = 0;
 
-        if (thr == 0) message[4] = 0x80;
-        if (thr == 3) message[7] = 0x01000000;
+        if (thr == 0) message[4] = 0x80U;
+        if (thr == 3) message[7] = 0x01000000U;
 
         uint32_t msgBitsliced[8];
         to_bitslice_quad(message, msgBitsliced);
@@ -48,15 +48,13 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u
         groestl512_progressMessage_quad(state, msgBitsliced);
 
         // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
-        uint32_t *outpHash = inpHash;
         uint32_t hash[16];
         from_bitslice_quad(state, hash);
 
-
         // uint4 = 4x4 uint32_t = 16 bytes
         if (thr == 0) {
             uint4 *phash = (uint4*) hash;
-            uint4 *outpt = (uint4*) outpHash; /* var kept for hash align */
+            uint4 *outpt = (uint4*) pHash;
             outpt[0] = phash[0];
             outpt[1] = phash[1];
             outpt[2] = phash[2];
@@ -85,7 +83,7 @@ __global__ void __launch_bounds__(TPB, THF)
 
         uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
 
-        int hashPosition = nounce - startNounce;
+        off_t hashPosition = nounce - startNounce;
         uint32_t * inpHash = &g_hash[hashPosition<<4];
         const uint16_t thr = threadIdx.x % THF;
 
diff --git a/quark/cuda_quark_groestl512_sm20.cu b/quark/cuda_quark_groestl512_sm20.cu
index bf3c750..ea710c8 100644
--- a/quark/cuda_quark_groestl512_sm20.cu
+++ b/quark/cuda_quark_groestl512_sm20.cu
@@ -230,8 +230,8 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32
 
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
-		int hashPosition = nounce - startNounce;
-		uint32_t *inpHash = &g_hash[16 * hashPosition];
+		off_t hashPosition = nounce - startNounce;
+		uint32_t *inpHash = &g_hash[hashPosition * 16];
 
 		#pragma unroll 16
 		for(int k=0; k<16; k++)
@@ -266,7 +266,7 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 
 		// Erzeugten Hash rausschreiben
-		uint32_t *outpHash = &g_hash[16 * hashPosition];
+		uint32_t *outpHash = &g_hash[hashPosition * 16];
 
 		#pragma unroll 16
 		for(int k=0;k<16;k++) outpHash[k] = state[k+16];
diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu
index 21ee856..de89505 100644
--- a/quark/cuda_quark_keccak512.cu
+++ b/quark/cuda_quark_keccak512.cu
@@ -101,8 +101,8 @@ void quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_
 	{
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
-		int hashPosition = nounce - startNounce;
-		uint64_t *inpHash = &g_hash[8 * hashPosition];
+		off_t hashPosition = nounce - startNounce;
+		uint64_t *inpHash = &g_hash[hashPosition * 8];
 		uint2 keccak_gpu_state[25];
 
 		for (int i = 0; i<8; i++) {
@@ -200,8 +200,8 @@ void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uin
 	{
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
-		int hashPosition = nounce - startNounce;
-		uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition];
+		off_t hashPosition = nounce - startNounce;
+		uint32_t *inpHash = (uint32_t*)&g_hash[hashPosition * 8];
 
 		uint32_t message[18];
 		#pragma unroll 16
@@ -224,7 +224,7 @@ void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uin
 			U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
 		}
 
-		uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition];
+		uint32_t *outpHash = (uint32_t*)&g_hash[hashPosition * 8];
 		#pragma unroll 16
 		for(int i=0; i<16; i++)
 			outpHash[i] = hash[i];
diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu
index 24e7da6..011f5d1 100644
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@@ -512,7 +512,7 @@ void quark_skein512_gpu_hash_64_sm3(uint32_t threads, uint32_t startNounce, uint
 
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
-		int hashPosition = nounce - startNounce;
+		uint32_t hashPosition = nounce - startNounce;
 		uint64_t *inpHash = &g_hash[hashPosition * 8];
 
 		// Init