From e1aa028b9fa6249792889dd4675422c9e75e3940 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 18 Aug 2014 00:58:44 +0200 Subject: [PATCH] x15: optimize, adds some kH/s --- x15/cuda_x15_whirlpool.cu | 53 ++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu index 9892579..787a510 100644 --- a/x15/cuda_x15_whirlpool.cu +++ b/x15/cuda_x15_whirlpool.cu @@ -5,7 +5,7 @@ */ #include #include -#include +#include #define NULLTEST 0 @@ -13,12 +13,11 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t #define SPH_64 (1) #define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1) -#define __constant #define SPH_C64(x) ((uint64_t)(x ## ULL)) -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +// defined in cuda_helper.h +#define SPH_ROTL64(x,n) ROTL64(x,n) /* $Id: whirlpool.c 227 2010-06-16 17:28:38Z tp $ */ /* @@ -1225,7 +1224,6 @@ __global__ void x15_whirlpool_gpu_hash_64(int threads, uint32_t startNounce, uin struct h8x64 p = *phash; /* copy content in local p */ struct h8x64 st, n, h = { 0, 0, 0, 0, 0, 0, 0, 0 }; uint8_t u; - #if NULLTEST p = h; #endif @@ -1239,23 +1237,17 @@ __global__ void x15_whirlpool_gpu_hash_64(int threads, uint32_t startNounce, uin ROUND_WENC(plain_T, n.n, h.n, t); } - h.n0 = st.n0 = n.n0 ^ p.n0; - h.n1 = st.n1 = n.n1 ^ p.n1; - h.n2 = st.n2 = n.n2 ^ p.n2; - h.n3 = st.n3 = n.n3 ^ p.n3; - h.n4 = st.n4 = n.n4 ^ p.n4; - h.n5 = st.n5 = n.n5 ^ p.n5; - h.n6 = st.n6 = n.n6 ^ p.n6; - h.n7 = st.n7 = n.n7 ^ p.n7; + n.n0 = h.n0 = st.n0 = n.n0 ^ p.n0; + n.n1 = h.n1 = st.n1 = n.n1 ^ p.n1; + n.n2 = h.n2 = st.n2 = n.n2 ^ p.n2; + n.n3 = h.n3 = st.n3 = n.n3 ^ p.n3; + n.n4 = h.n4 = st.n4 = n.n4 ^ p.n4; + n.n5 = h.n5 = st.n5 = n.n5 ^ p.n5; + n.n6 = h.n6 = st.n6 = n.n6 ^ p.n6; + n.n7 = h.n7 = st.n7 = n.n7 ^ p.n7; - n.n0 = st.n0 ^ 0x80; - n.n1 = st.n1; - n.n2 = st.n2; - n.n3 = st.n3; - n.n4 = st.n4; - n.n5 = st.n5; - n.n6 = st.n6; - n.n7 = st.n7 ^ 0x2000000000000; + n.n0 ^= 0x80; + n.n7 ^= 0x2000000000000; #pragma unroll 10 for (u = 0; u < 10; u++) @@ -1265,14 +1257,17 @@ __global__ void x15_whirlpool_gpu_hash_64(int threads, uint32_t startNounce, uin ROUND_WENC(plain_T, n.n, h.n, t); } - phash->n0 = st.n0 ^ (n.n0 ^ 0x80); - phash->n1 = st.n1 ^ n.n1; - phash->n2 = st.n2 ^ n.n2; - phash->n3 = st.n3 ^ n.n3; - phash->n4 = st.n4 ^ n.n4; - phash->n5 = st.n5 ^ n.n5; - phash->n6 = st.n6 ^ n.n6; - phash->n7 = st.n7 ^ (n.n7 ^ 0x2000000000000); + n.n0 ^= 0x80; + n.n7 ^= 0x2000000000000; + + phash->n0 = st.n0 ^ n.n0; + phash->n1 = st.n1 ^ n.n1; + phash->n2 = st.n2 ^ n.n2; + phash->n3 = st.n3 ^ n.n3; + phash->n4 = st.n4 ^ n.n4; + phash->n5 = st.n5 ^ n.n5; + phash->n6 = st.n6 ^ n.n6; + phash->n7 = st.n7 ^ n.n7; } }