From 1fbcbbacc4b7e1f71a81ca0cde40b7ad0d9ae619 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 20 Aug 2014 07:49:22 +0200 Subject: [PATCH] Add whirlcoin and optimize x11 luffa (maxrregcount) --- Makefile.am | 17 +- config.sh | 1 + cpu-miner.c | 8 + cuda_helper.h | 5 +- miner.h | 5 + quark/cuda_bmw512.cu | 6 - quark/cuda_quark_keccak512.cu | 336 +++++++++++++++++----------------- quark/cuda_skein512.cu | 2 +- util.c | 4 + x11/cuda_x11_luffa512.cu | 10 +- x15/whirlcoin.cu | 117 ++++++++++++ 11 files changed, 327 insertions(+), 184 deletions(-) create mode 100644 x15/whirlcoin.cu diff --git a/Makefile.am b/Makefile.am index a8cc782..635a530 100644 --- a/Makefile.am +++ b/Makefile.am @@ -40,23 +40,30 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ - x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu + x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/whirlcoin.cu ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME +nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . -Xptxas "-v" --ptxas-options=-v +nvcc_FLAGS += $(JANSSON_INCLUDES) + # we're now targeting all major compute architectures within one binary. .cu.o: - $(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $< + +# Luffa is faster with 80 registers than 128 +x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu + $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $< # Shavite compiles faster with 128 regs x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ --maxrregcount=128 -o $@ -c $< # ABI requiring code modules quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $< JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $< diff --git a/config.sh b/config.sh index d3f74d0..04ef6f9 100755 --- a/config.sh +++ b/config.sh @@ -7,6 +7,7 @@ make clean || echo clean +rm -f Makefile.in rm -f config.status ./autogen.sh || echo done diff --git a/cpu-miner.c b/cpu-miner.c index 688f13f..4dcb71b 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -134,6 +134,7 @@ typedef enum { ALGO_ANIME, ALGO_FRESH, ALGO_NIST5, + ALGO_WHC, ALGO_X11, ALGO_X13, ALGO_X14, @@ -152,6 +153,7 @@ static const char *algo_names[] = { "anime", "fresh", "nist5", + "whirlcoin", "x11", "x13", "x14", @@ -229,6 +231,7 @@ Options:\n\ anime Animecoin hash\n\ fresh Freshcoin hash (shavite 80)\n\ nist5 NIST5 (TalkCoin) hash\n\ + whirlcoin Whirlcoin hash\n\ x11 X11 (DarkCoin) hash\n\ x13 X13 (MaruCoin) hash\n\ x14 X14 hash\n\ @@ -931,6 +934,11 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_WHC: + rc = scanhash_whc(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_X11: rc = scanhash_x11(thr_id, work.data, work.target, max_nonce, &hashes_done); diff --git a/cuda_helper.h b/cuda_helper.h index 3a7e400..c2c9f3e 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -238,7 +238,7 @@ uint64_t shl_t64(uint64_t x, uint32_t n) // 64-bit ROTATE RIGHT -#ifdef DJM_SM35_ROT64 +#if __CUDA_ARCH__ >= 350 /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ __device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) { @@ -274,8 +274,7 @@ uint64_t ROTR64(const uint64_t x, const int offset) #endif // 64-bit ROTATE LEFT -#ifdef DJM_SM35_ROT64 -/* complicated sm >= 3.5 one, to bench */ +#if __CUDA_ARCH__ >= 350 __device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) { uint2 result; diff --git a/miner.h b/miner.h index 81965c4..2b0dad1 100644 --- a/miner.h +++ b/miner.h @@ -242,6 +242,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_whc(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + extern int scanhash_x11(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -362,6 +366,7 @@ void myriadhash(void *state, const void *input); void fresh_hash(void *state, const void *input); void nist5hash(void *state, const void *input); void quarkhash(void *state, const void *input); +void wcoinhash(void *state, const void *input); void x11hash(void *output, const void *input); void x13hash(void *output, const void *input); void x14hash(void *output, const void *input); diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index d5e3f4a..3e4b5bf 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -1,5 +1,3 @@ -#if 1 - #include #include @@ -11,9 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t // die Message it Padding zur Berechnung auf der GPU __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - #define SHL(x, n) ((x) << (n)) #define SHR(x, n) ((x) >> (n)) @@ -320,4 +315,3 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo MyStreamSynchronize(NULL, order, thr_id); } -#endif diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index 2a6f2dc..b205100 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -1,165 +1,171 @@ -#include -#include - -#include "cuda_helper.h" - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -#define U32TO64_LE(p) \ - (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) - -#define U64TO32_LE(p, v) \ - *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); - -__device__ __constant__ -static const uint64_t c_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull -}; - -static __device__ __forceinline__ void -keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { - size_t i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ -#pragma unroll 9 - for (i = 0; i < 72 / 8; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -__global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; - - // Nachricht kopieren - uint32_t message[18]; -#pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = inpHash[i]; - - message[16] = 0x01; - message[17] = 0x80000000; - - // State initialisieren - uint64_t keccak_gpu_state[25]; -#pragma unroll 25 - for (int i=0; i<25; i++) - keccak_gpu_state[i] = 0; - - // den Block einmal gut durchschütteln - keccak_block(keccak_gpu_state, message, c_keccak_round_constants); - - // das Hash erzeugen - uint32_t hash[16]; - -#pragma unroll 8 - for (size_t i = 0; i < 64; i += 8) { - U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); - } - - // fertig - uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; - -#pragma unroll 16 - for(int i=0;i<16;i++) - outpHash[i] = hash[i]; - } -} - -// Setup-Funktionen -__host__ void quark_keccak512_cpu_init(int thr_id, int threads) -{ -} - -__host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - quark_keccak512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); -} +#include +#include + +#include "cuda_helper.h" + +// heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +#define U32TO64_LE(p) \ + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + +#define U64TO32_LE(p, v) \ + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint64_t c_keccak_round_constants[24]; + +static __device__ __forceinline__ void +keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ +#pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +__global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + int hashPosition = nounce - startNounce; + uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; + + // Nachricht kopieren + uint32_t message[18]; +#pragma unroll 16 + for(int i=0;i<16;i++) + message[i] = inpHash[i]; + + message[16] = 0x01; + message[17] = 0x80000000; + + // State initialisieren + uint64_t keccak_gpu_state[25]; +#pragma unroll 25 + for (int i=0; i<25; i++) + keccak_gpu_state[i] = 0; + + // den Block einmal gut durchschütteln + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + // das Hash erzeugen + uint32_t hash[16]; + +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); + } + + // fertig + uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; + +#pragma unroll 16 + for(int i=0;i<16;i++) + outpHash[i] = hash[i]; + } +} + +// Setup-Funktionen +__host__ void quark_keccak512_cpu_init(int thr_id, int threads) +{ + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( c_keccak_round_constants, + host_keccak_round_constants, + sizeof(host_keccak_round_constants), + 0, cudaMemcpyHostToDevice); +} + +__host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + const int threadsperblock = 256; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; + + quark_keccak512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index 7008807..ef40f29 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -289,7 +289,7 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t } static __constant__ uint64_t d_constMem[8]; -static uint64_t h_constMem[8] = { +static const uint64_t h_constMem[8] = { SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), SPH_C64(0x8FD1934127C79BCE), diff --git a/util.c b/util.c index dfd05b0..81cc85b 100644 --- a/util.c +++ b/util.c @@ -1364,6 +1364,10 @@ void print_hash_tests(void) fresh_hash(&hash[0], &buf[0]); printf("\nfresh: "); print_hash(hash); + memset(hash, 0, sizeof hash); + wcoinhash(&hash[0], &buf[0]); + printf("\nwhirlc: "); print_hash(hash); + memset(hash, 0, sizeof hash); x11hash(&hash[0], &buf[0]); printf("\nX11: "); print_hash(hash); diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu index a976271..eb7c56f 100644 --- a/x11/cuda_x11_luffa512.cu +++ b/x11/cuda_x11_luffa512.cu @@ -90,8 +90,8 @@ typedef struct { b0 ^= c1; /* initial values of chaining variables */ -__device__ __constant__ -const uint32_t c_IV[40] = { +__device__ __constant__ uint32_t c_IV[40]; +const uint32_t h_IV[40] = { 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, 0x6e292011,0x90152df4,0xee058139,0xdef610bb, 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, @@ -103,8 +103,8 @@ const uint32_t c_IV[40] = { 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; -__device__ __constant__ -uint32_t c_CNS[80] = { +__device__ __constant__ uint32_t c_CNS[80]; +const uint32_t h_CNS[80] = { 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, @@ -356,6 +356,8 @@ __global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint // Setup-Funktionen __host__ void x11_luffa512_cpu_init(int thr_id, int threads) { + cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice); } __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) diff --git a/x15/whirlcoin.cu b/x15/whirlcoin.cu new file mode 100644 index 0000000..59d5c5d --- /dev/null +++ b/x15/whirlcoin.cu @@ -0,0 +1,117 @@ +/* + * whirlpool routine (djm) + */ +extern "C" +{ +#include "sph/sph_whirlpool.h" +#include "miner.h" +} + +// from cpu-miner.c +extern int device_map[8]; +extern bool opt_benchmark; + +// Speicher für Input/Output der verketteten Hashfunktionen +static uint32_t *d_hash[8]; + +extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode); +extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget); +extern void whirlpool512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void cuda_check_cpu_init(int thr_id, int threads); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); + +// CPU Hash function +extern "C" void wcoinhash(void *state, const void *input) +{ + sph_whirlpool_context ctx_whirlpool; + + uint32_t hash[16]; + + // shavite 1 + sph_whirlpool1_init(&ctx_whirlpool); + sph_whirlpool1(&ctx_whirlpool, input, 80); + sph_whirlpool1_close(&ctx_whirlpool, (void*) hash); + + sph_whirlpool1_init(&ctx_whirlpool); + sph_whirlpool1(&ctx_whirlpool, (const void*) hash, 64); + sph_whirlpool1_close(&ctx_whirlpool, (void*) hash); + + sph_whirlpool1_init(&ctx_whirlpool); + sph_whirlpool1(&ctx_whirlpool, (const void*) hash, 64); + sph_whirlpool1_close(&ctx_whirlpool, (void*) hash); + + sph_whirlpool1_init(&ctx_whirlpool); + sph_whirlpool1(&ctx_whirlpool, (const void*) hash, 64); + sph_whirlpool1_close(&ctx_whirlpool, (void*) hash); + + memcpy(state, hash, 32); +} + +extern "C" int scanhash_whc(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + const int throughput = 256*256*8; + static bool init[8] = {0,0,0,0,0,0,0,0}; + uint32_t endiandata[20]; + uint32_t Htarg = ptarget[7]; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = Htarg = 0x0000ff; + + if (!init[thr_id]) { + cudaSetDevice(device_map[thr_id]); + // Konstanten kopieren, Speicher belegen + cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + x15_whirlpool_cpu_init(thr_id, throughput,1); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) { + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + } + + whirlpool512_setBlock_80((void*)endiandata, ptarget); + + do { + uint32_t foundNonce; + int order = 0; + + whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + + wcoinhash(vhash64, endiandata); + + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[19] = foundNonce; + *hashes_done = foundNonce - first_nonce + 1; + return 1; + } + else if (vhash64[7] > Htarg) { + applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg); + } + else { + applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + } + } + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +}