From ed4927fcd0473925ae123ee54eb0ee5fbda81af2 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 30 May 2015 14:57:52 +0200 Subject: [PATCH] quark/x11: set signed int hashPosition vars to off_t groestl (and keccak?) seem faster with 64-bit vars (off_t or int64_t)... --- quark/cuda_bmw512.cu | 2 +- quark/cuda_bmw512_30.cu | 6 +++--- quark/cuda_jh512.cu | 5 ++--- quark/cuda_quark_blake512.cu | 10 +++------- quark/cuda_quark_groestl512.cu | 18 ++++++++---------- quark/cuda_quark_groestl512_sm20.cu | 6 +++--- quark/cuda_quark_keccak512.cu | 10 +++++----- quark/cuda_skein512.cu | 2 +- 8 files changed, 26 insertions(+), 33 deletions(-) diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index 37c6497..bcf3c46 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -428,7 +428,7 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t * Compression512(h, message); // fertig - uint64_t *outpHash = &g_hash[8 * thread]; + uint64_t *outpHash = &g_hash[thread * 8]; #pragma unroll 8 for(int i=0;i<8;i++) diff --git a/quark/cuda_bmw512_30.cu b/quark/cuda_bmw512_30.cu index d14795b..e0edf81 100644 --- a/quark/cuda_bmw512_30.cu +++ b/quark/cuda_bmw512_30.cu @@ -151,8 +151,8 @@ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_ { uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + uint32_t hashPosition = nounce - startNounce; + uint64_t *inpHash = &g_hash[hashPosition * 8]; // Init uint64_t h[16]; @@ -205,7 +205,7 @@ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_ Compression512_30(h, message); - uint64_t *outpHash = &g_hash[8 * hashPosition]; + uint64_t *outpHash = &g_hash[hashPosition * 8]; #pragma unroll 8 for(int i=0;i<8;i++) diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index 623e598..1444c80 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -229,7 +229,6 @@ static __device__ __forceinline__ void E8(uint32_t x[8][4]) } } -// Die Hash-Funktion __global__ __launch_bounds__(256, 4) void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { @@ -237,8 +236,8 @@ void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *c if (thread < threads) { const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - const uint32_t hashPosition = nounce - startNounce; - uint32_t *const Hash = &g_hash[hashPosition * 16U]; + uint32_t hashPosition = nounce - startNounce; + uint32_t *Hash = &g_hash[hashPosition * 16U]; uint32_t x[8][4] = { { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index a2454a0..c7f0df6 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -1,4 +1,4 @@ -#include +#include #include #include "cuda_helper.h" @@ -119,12 +119,8 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); #if USE_SHUFFLE - const int warpID = threadIdx.x & 0x0F; // 16 warps - const int warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke - const int maxHashPosition = thread<<3; -#endif + const uint32_t warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke -#if USE_SHUFFLE if (warpBlockID < ( (threads+15)>>4 )) #else if (thread < threads) @@ -132,7 +128,7 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t { uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; + off_t hashPosition = nounce - startNounce; uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8 // 128 Bytes diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index 6aa8645..1580b27 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -28,19 +28,19 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u uint32_t state[8]; uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[hashPosition << 4]; + off_t hashPosition = nounce - startNounce; + uint32_t *pHash = &g_hash[hashPosition << 4]; - const uint16_t thr = threadIdx.x % THF; + const uint32_t thr = threadIdx.x % THF; #pragma unroll - for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr]; + for(int k=0;k<4;k++) message[k] = pHash[thr + (k * THF)]; #pragma unroll for(int k=4;k<8;k++) message[k] = 0; - if (thr == 0) message[4] = 0x80; - if (thr == 3) message[7] = 0x01000000; + if (thr == 0) message[4] = 0x80U; + if (thr == 3) message[7] = 0x01000000U; uint32_t msgBitsliced[8]; to_bitslice_quad(message, msgBitsliced); @@ -48,15 +48,13 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u groestl512_progressMessage_quad(state, msgBitsliced); // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t *outpHash = inpHash; uint32_t hash[16]; from_bitslice_quad(state, hash); - // uint4 = 4x4 uint32_t = 16 bytes if (thr == 0) { uint4 *phash = (uint4*) hash; - uint4 *outpt = (uint4*) outpHash; /* var kept for hash align */ + uint4 *outpt = (uint4*) pHash; outpt[0] = phash[0]; outpt[1] = phash[1]; outpt[2] = phash[2]; @@ -85,7 +83,7 @@ __global__ void __launch_bounds__(TPB, THF) uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; + off_t hashPosition = nounce - startNounce; uint32_t * inpHash = &g_hash[hashPosition<<4]; const uint16_t thr = threadIdx.x % THF; diff --git a/quark/cuda_quark_groestl512_sm20.cu b/quark/cuda_quark_groestl512_sm20.cu index bf3c750..ea710c8 100644 --- a/quark/cuda_quark_groestl512_sm20.cu +++ b/quark/cuda_quark_groestl512_sm20.cu @@ -230,8 +230,8 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32 uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[16 * hashPosition]; + off_t hashPosition = nounce - startNounce; + uint32_t *inpHash = &g_hash[hashPosition * 16]; #pragma unroll 16 for(int k=0; k<16; k++) @@ -266,7 +266,7 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32 for(int u=0;u<32;u++) state[u] ^= message[u]; // Erzeugten Hash rausschreiben - uint32_t *outpHash = &g_hash[16 * hashPosition]; + uint32_t *outpHash = &g_hash[hashPosition * 16]; #pragma unroll 16 for(int k=0;k<16;k++) outpHash[k] = state[k+16]; diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index 21ee856..de89505 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -101,8 +101,8 @@ void quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_ { uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + off_t hashPosition = nounce - startNounce; + uint64_t *inpHash = &g_hash[hashPosition * 8]; uint2 keccak_gpu_state[25]; for (int i = 0; i<8; i++) { @@ -200,8 +200,8 @@ void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uin { uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; + off_t hashPosition = nounce - startNounce; + uint32_t *inpHash = (uint32_t*)&g_hash[hashPosition * 8]; uint32_t message[18]; #pragma unroll 16 @@ -224,7 +224,7 @@ void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uin U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); } - uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; + uint32_t *outpHash = (uint32_t*)&g_hash[hashPosition * 8]; #pragma unroll 16 for(int i=0; i<16; i++) outpHash[i] = hash[i]; diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index 24e7da6..011f5d1 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -512,7 +512,7 @@ void quark_skein512_gpu_hash_64_sm3(uint32_t threads, uint32_t startNounce, uint uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; + uint32_t hashPosition = nounce - startNounce; uint64_t *inpHash = &g_hash[hashPosition * 8]; // Init