From a37e909db9ff747f22203eb6bb3b4c6ed746f44f Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 27 Mar 2015 11:21:13 +0100 Subject: [PATCH] Add zr5 algo (for SM 3.5+) uint4 copy + keccak cleanup, groestl: small uint4 opt Signed-off-by: Tanguy Pruvot --- JHA/cuda_jha_keccak512.cu | 1083 ++++++++++++++++++-------------- Makefile.am | 2 +- README.txt | 12 +- ccminer.cpp | 84 ++- ccminer.vcxproj | 14 +- ccminer.vcxproj.filters | 9 +- configure.ac | 2 +- cpuminer-config.h | 30 +- miner.h | 5 + quark/cuda_quark_groestl512.cu | 15 +- util.cpp | 39 +- zr5.cu | 342 ++++++++++ 12 files changed, 1110 insertions(+), 527 deletions(-) create mode 100644 zr5.cu diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index d0d3c9b..2fc26b2 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -3,155 +3,118 @@ #include "cuda_helper.h" -__constant__ uint64_t c_State[25]; +// ZR5 +__constant__ uint32_t d_OriginalData[20]; + __constant__ uint32_t c_PaddedMessage[18]; +__constant__ uint64_t c_State[25]; + +#define POK_DATA_MASK 0xFFFF0000 +#define POK_VERSION 0x1 #define U32TO64_LE(p) \ - (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) #define U64TO32_LE(p, v) \ - *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); static const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull }; __constant__ uint64_t c_keccak_round_constants[24]; static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { - size_t i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ -#pragma unroll 9 - for (i = 0; i < 72 / 8; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -__global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; - - int hashPosition = nounce - startNounce; - - // Nachricht kopieren - uint32_t message[18]; -#pragma unroll 18 - for(int i=0;i<18;i++) - message[i] = c_PaddedMessage[i]; - - // die individuelle Nounce einsetzen - message[1] = cuda_swab32(nounce); - - // State initialisieren - uint64_t keccak_gpu_state[25]; -#pragma unroll 25 - for (int i=0; i<25; i++) - keccak_gpu_state[i] = c_State[i]; - - // den Block einmal gut durchschütteln - keccak_block(keccak_gpu_state, message, c_keccak_round_constants); - - // das Hash erzeugen - uint32_t hash[16]; - -#pragma unroll 8 - for (size_t i = 0; i < 64; i += 8) { - U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); - } - - // fertig - uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; - -#pragma unroll 16 - for(int i=0;i<16;i++) - outpHash[i] = hash[i]; - } + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ + #pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } } // Setup-Funktionen -__host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads) +__host__ +void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads) { - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice); + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( c_keccak_round_constants, + host_keccak_round_constants, + sizeof(host_keccak_round_constants), + 0, cudaMemcpyHostToDevice); } #define cKeccakB 1600 @@ -160,379 +123,549 @@ __host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads) #define cKeccakR_SizeInBytes (cKeccakR / 8) #define crypto_hash_BYTES 64 -#if (cKeccakB == 1600) - typedef unsigned long long UINT64; - typedef UINT64 tKeccakLane; - #define cKeccakNumberOfRounds 24 +#if (cKeccakB == 1600) + typedef unsigned long long UINT64; + typedef UINT64 tKeccakLane; + #define cKeccakNumberOfRounds 24 #endif #define cKeccakLaneSizeInBits (sizeof(tKeccakLane) * 8) #define ROL(a, offset) ((((tKeccakLane)a) << ((offset) % cKeccakLaneSizeInBits)) ^ (((tKeccakLane)a) >> (cKeccakLaneSizeInBits-((offset) % cKeccakLaneSizeInBits)))) #if ((cKeccakB/25) == 8) - #define ROL_mult8(a, offset) ((tKeccakLane)a) + #define ROL_mult8(a, offset) ((tKeccakLane)a) #else - #define ROL_mult8(a, offset) ROL(a, offset) + #define ROL_mult8(a, offset) ROL(a, offset) #endif -void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ); -const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = -{ - (tKeccakLane)0x0000000000000001ULL, - (tKeccakLane)0x0000000000008082ULL, - (tKeccakLane)0x800000000000808aULL, - (tKeccakLane)0x8000000080008000ULL, - (tKeccakLane)0x000000000000808bULL, - (tKeccakLane)0x0000000080000001ULL, - (tKeccakLane)0x8000000080008081ULL, - (tKeccakLane)0x8000000000008009ULL, - (tKeccakLane)0x000000000000008aULL, - (tKeccakLane)0x0000000000000088ULL, - (tKeccakLane)0x0000000080008009ULL, - (tKeccakLane)0x000000008000000aULL, - (tKeccakLane)0x000000008000808bULL, - (tKeccakLane)0x800000000000008bULL, - (tKeccakLane)0x8000000000008089ULL, - (tKeccakLane)0x8000000000008003ULL, - (tKeccakLane)0x8000000000008002ULL, - (tKeccakLane)0x8000000000000080ULL - #if (cKeccakB >= 400) +const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = { + (tKeccakLane)0x0000000000000001ULL, + (tKeccakLane)0x0000000000008082ULL, + (tKeccakLane)0x800000000000808aULL, + (tKeccakLane)0x8000000080008000ULL, + (tKeccakLane)0x000000000000808bULL, + (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008009ULL, + (tKeccakLane)0x000000000000008aULL, + (tKeccakLane)0x0000000000000088ULL, + (tKeccakLane)0x0000000080008009ULL, + (tKeccakLane)0x000000008000000aULL, + (tKeccakLane)0x000000008000808bULL, + (tKeccakLane)0x800000000000008bULL, + (tKeccakLane)0x8000000000008089ULL, + (tKeccakLane)0x8000000000008003ULL, + (tKeccakLane)0x8000000000008002ULL, + (tKeccakLane)0x8000000000000080ULL +#if (cKeccakB >= 400) , (tKeccakLane)0x000000000000800aULL, - (tKeccakLane)0x800000008000000aULL - #if (cKeccakB >= 800) + (tKeccakLane)0x800000008000000aULL +#if (cKeccakB >= 800) , (tKeccakLane)0x8000000080008081ULL, - (tKeccakLane)0x8000000000008080ULL - #if (cKeccakB == 1600) + (tKeccakLane)0x8000000000008080ULL +#if (cKeccakB == 1600) , (tKeccakLane)0x0000000080000001ULL, - (tKeccakLane)0x8000000080008008ULL - #endif - #endif - #endif + (tKeccakLane)0x8000000080008008ULL +#endif +#endif +#endif }; -void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ) +void KeccakF(tKeccakLane * state, const tKeccakLane *in, int laneCount) { - - { - while ( --laneCount >= 0 ) - { - state[laneCount] ^= in[laneCount]; - } - } - - { - tKeccakLane Aba, Abe, Abi, Abo, Abu; - tKeccakLane Aga, Age, Agi, Ago, Agu; - tKeccakLane Aka, Ake, Aki, Ako, Aku; - tKeccakLane Ama, Ame, Ami, Amo, Amu; - tKeccakLane Asa, Ase, Asi, Aso, Asu; - tKeccakLane BCa, BCe, BCi, BCo, BCu; - tKeccakLane Da, De, Di, Do, Du; - tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu; - tKeccakLane Ega, Ege, Egi, Ego, Egu; - tKeccakLane Eka, Eke, Eki, Eko, Eku; - tKeccakLane Ema, Eme, Emi, Emo, Emu; - tKeccakLane Esa, Ese, Esi, Eso, Esu; - #define round laneCount - - //copyFromState(A, state) - Aba = state[ 0]; - Abe = state[ 1]; - Abi = state[ 2]; - Abo = state[ 3]; - Abu = state[ 4]; - Aga = state[ 5]; - Age = state[ 6]; - Agi = state[ 7]; - Ago = state[ 8]; - Agu = state[ 9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for( round = 0; round < cKeccakNumberOfRounds; round += 2 ) - { - // prepareTheta - BCa = Aba^Aga^Aka^Ama^Asa; - BCe = Abe^Age^Ake^Ame^Ase; - BCi = Abi^Agi^Aki^Ami^Asi; - BCo = Abo^Ago^Ako^Amo^Aso; - BCu = Abu^Agu^Aku^Amu^Asu; - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Aba ^= Da; - BCa = Aba; - Age ^= De; - BCe = ROL(Age, 44); - Aki ^= Di; - BCi = ROL(Aki, 43); - Amo ^= Do; - BCo = ROL(Amo, 21); - Asu ^= Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= (tKeccakLane)KeccakF_RoundConstants[round]; - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo ^= Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka ^= Da; - BCi = ROL(Aka, 3); - Ame ^= De; - BCo = ROL(Ame, 45); - Asi ^= Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe ^= De; - BCa = ROL(Abe, 1); - Agi ^= Di; - BCe = ROL(Agi, 6); - Ako ^= Do; - BCi = ROL(Ako, 25); - Amu ^= Du; - BCo = ROL_mult8(Amu, 8); - Asa ^= Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu ^= Du; - BCa = ROL(Abu, 27); - Aga ^= Da; - BCe = ROL(Aga, 36); - Ake ^= De; - BCi = ROL(Ake, 10); - Ami ^= Di; - BCo = ROL(Ami, 15); - Aso ^= Do; - BCu = ROL_mult8(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi ^= Di; - BCa = ROL(Abi, 62); - Ago ^= Do; - BCe = ROL(Ago, 55); - Aku ^= Du; - BCi = ROL(Aku, 39); - Ama ^= Da; - BCo = ROL(Ama, 41); - Ase ^= De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1]; - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - Abu = BCu ^((~BCa)& BCe ); - - Ebo ^= Do; - BCa = ROL(Ebo, 28); - Egu ^= Du; - BCe = ROL(Egu, 20); - Eka ^= Da; - BCi = ROL(Eka, 3); - Eme ^= De; - BCo = ROL(Eme, 45); - Esi ^= Di; - BCu = ROL(Esi, 61); - Aga = BCa ^((~BCe)& BCi ); - Age = BCe ^((~BCi)& BCo ); - Agi = BCi ^((~BCo)& BCu ); - Ago = BCo ^((~BCu)& BCa ); - Agu = BCu ^((~BCa)& BCe ); - - Ebe ^= De; - BCa = ROL(Ebe, 1); - Egi ^= Di; - BCe = ROL(Egi, 6); - Eko ^= Do; - BCi = ROL(Eko, 25); - Emu ^= Du; - BCo = ROL_mult8(Emu, 8); - Esa ^= Da; - BCu = ROL(Esa, 18); - Aka = BCa ^((~BCe)& BCi ); - Ake = BCe ^((~BCi)& BCo ); - Aki = BCi ^((~BCo)& BCu ); - Ako = BCo ^((~BCu)& BCa ); - Aku = BCu ^((~BCa)& BCe ); - - Ebu ^= Du; - BCa = ROL(Ebu, 27); - Ega ^= Da; - BCe = ROL(Ega, 36); - Eke ^= De; - BCi = ROL(Eke, 10); - Emi ^= Di; - BCo = ROL(Emi, 15); - Eso ^= Do; - BCu = ROL_mult8(Eso, 56); - Ama = BCa ^((~BCe)& BCi ); - Ame = BCe ^((~BCi)& BCo ); - Ami = BCi ^((~BCo)& BCu ); - Amo = BCo ^((~BCu)& BCa ); - Amu = BCu ^((~BCa)& BCe ); - - Ebi ^= Di; - BCa = ROL(Ebi, 62); - Ego ^= Do; - BCe = ROL(Ego, 55); - Eku ^= Du; - BCi = ROL(Eku, 39); - Ema ^= Da; - BCo = ROL(Ema, 41); - Ese ^= De; - BCu = ROL(Ese, 2); - Asa = BCa ^((~BCe)& BCi ); - Ase = BCe ^((~BCi)& BCo ); - Asi = BCi ^((~BCo)& BCu ); - Aso = BCo ^((~BCu)& BCa ); - Asu = BCu ^((~BCa)& BCe ); - } - - //copyToState(state, A) - state[ 0] = Aba; - state[ 1] = Abe; - state[ 2] = Abi; - state[ 3] = Abo; - state[ 4] = Abu; - state[ 5] = Aga; - state[ 6] = Age; - state[ 7] = Agi; - state[ 8] = Ago; - state[ 9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - - #undef round - } + while ( --laneCount >= 0 ) { + state[laneCount] ^= in[laneCount]; + } + + { + tKeccakLane Aba, Abe, Abi, Abo, Abu; + tKeccakLane Aga, Age, Agi, Ago, Agu; + tKeccakLane Aka, Ake, Aki, Ako, Aku; + tKeccakLane Ama, Ame, Ami, Amo, Amu; + tKeccakLane Asa, Ase, Asi, Aso, Asu; + tKeccakLane BCa, BCe, BCi, BCo, BCu; + tKeccakLane Da, De, Di, Do, Du; + tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu; + tKeccakLane Ega, Ege, Egi, Ego, Egu; + tKeccakLane Eka, Eke, Eki, Eko, Eku; + tKeccakLane Ema, Eme, Emi, Emo, Emu; + tKeccakLane Esa, Ese, Esi, Eso, Esu; + #define round laneCount + + //copyFromState(A, state) + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( round = 0; round < cKeccakNumberOfRounds; round += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (tKeccakLane)KeccakF_RoundConstants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL_mult8(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL_mult8(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL_mult8(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL_mult8(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + //copyToState(state, A) + state[ 0] = Aba; + state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; + state[ 9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; + + #undef round + } } // inlen kann 72...143 betragen -__host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) +__host__ +void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) +{ + const unsigned char *in = (const unsigned char*)pdata; + + tKeccakLane state[5 * 5]; + unsigned char temp[cKeccakR_SizeInBytes]; + + memset( state, 0, sizeof(state) ); + + for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes) + { + KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) ); + } + + // Copy state of the first round (72 Bytes) + // in Constant Memory + cudaMemcpyToSymbol( c_State, + state, + sizeof(state), + 0, cudaMemcpyHostToDevice); + + // second part + memcpy(temp, in, inlen); + temp[inlen++] = 1; + memset(temp + inlen, 0, cKeccakR_SizeInBytes - inlen); + temp[cKeccakR_SizeInBytes-1] |= 0x80; + + // Copy rest of the message in constant memory + cudaMemcpyToSymbol( c_PaddedMessage, + temp, + cKeccakR_SizeInBytes, + 0, cudaMemcpyHostToDevice); +} + +__global__ +void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + + int hashPosition = nounce - startNounce; + + uint32_t message[18]; + #pragma unroll 18 + for(int i=0;i<18;i++) + message[i] = c_PaddedMessage[i]; + + message[1] = cuda_swab32(nounce); + + // State init + uint64_t keccak_gpu_state[25]; + #pragma unroll 25 + for (int i=0; i<25; i++) + keccak_gpu_state[i] = c_State[i]; + + // den Block einmal gut durchschütteln + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + uint32_t hash[16]; + + #pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); + } + + // copy hash + uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; + + #pragma unroll 16 + for(int i=0;i<16;i++) + outpHash[i] = hash[i]; + } +} + +__host__ +void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) { - const unsigned char *in = (const unsigned char*)pdata; + const uint32_t threadsperblock = 256; - tKeccakLane state[5 * 5]; - unsigned char temp[cKeccakR_SizeInBytes]; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - memset( state, 0, sizeof(state) ); + size_t shared_size = 0; - for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes ) - { - KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) ); - } + jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); + MyStreamSynchronize(NULL, order, thr_id); +} - // Kopiere den state nach der ersten Runde (nach Absorption von 72 Bytes Inputdaten) - // ins Constant Memory - cudaMemcpyToSymbol( c_State, - state, - sizeof(state), - 0, cudaMemcpyHostToDevice); - // padding - memcpy( temp, in, (size_t)inlen ); - temp[inlen++] = 1; - memset( temp+inlen, 0, cKeccakR_SizeInBytes - (size_t)inlen ); - temp[cKeccakR_SizeInBytes-1] |= 0x80; +/* zr5 keccak, no nonce swab32 */ +__global__ +void zr5_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + uint32_t message[18]; + + #pragma unroll 18 + for(int i=0; i<18; i++) + message[i] = c_PaddedMessage[i]; + + message[1] = nounce; + + // Get mid-state + uint64_t keccak_gpu_state[25]; + #pragma unroll 25 + for (int i=0; i<25; i++) + keccak_gpu_state[i] = c_State[i]; + + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + uint32_t hash[16]; + + #pragma unroll 8 + for (int i = 0; i < 8; i++) { + U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]); + } + + // Output (64 bytes hash required) + uint32_t hashPosition = nounce - startNounce; + //uint32_t *outpHash = (uint32_t*) (&g_hash[hashPosition*8]); + //#pragma unroll 16 + //for(int i=0; i<16; i++) + // outpHash[i] = hash[i]; + + uint4 *outpHash = (uint4*) (&g_hash[hashPosition*8]); + uint4 *psrc = (uint4*) hash; + outpHash[0] = psrc[0]; + outpHash[1] = psrc[1]; + outpHash[2] = psrc[2]; + outpHash[3] = psrc[3]; + } +} + +__host__ +void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; - // Kopiere den Rest der Message und das Padding ins Constant Memory - cudaMemcpyToSymbol( c_PaddedMessage, - temp, - cKeccakR_SizeInBytes, - 0, cudaMemcpyHostToDevice); + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + zr5_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); + MyStreamSynchronize(NULL, 0, thr_id); } -__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +/* required for the second hash part of zr5 */ + +__global__ +void zr5_keccak512_gpu_hash_pok(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint16_t *d_pokh, uint32_t version) { - const uint32_t threadsperblock = 256; + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + + //uint32_t hashPosition = thread * 16; + uint32_t *prevHash = &g_hash[thread * 16]; // thread * 64 / sizeof(uint32_t) + uint32_t message[18]; /* 72 bytes */ + + // pok - hash[0] from prev hash + message[0] = version | (prevHash[0] & POK_DATA_MASK); + // save pok + d_pokh[thread] = (uint16_t) (message[0] / 0x10000); + for (int i=1; i<18; i++) { + message[i]=d_OriginalData[i]; + } + + // first bloc + uint64_t keccak_gpu_state[25] = { 0 }; + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + // second bloc + message[0] = d_OriginalData[18]; + message[1] = nounce; //cuda_swab32(nounce); + message[2] = 1; + + #pragma unroll + for(int i=3; i<17; i++) + message[i] = 0; + + message[17] = 0x80000000UL; + + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + uint32_t hash[16]; + + #pragma unroll 8 + for (size_t i = 0; i < 8; i++) { + U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]); + } + + //uint32_t *outpHash = &g_hash[thread * 16]; + //#pragma unroll 16 + //for(int i=0; i<16; i++) + // outpHash[i] = hash[i]; + + uint4 *outpHash = (uint4*) (&g_hash[thread * 16]); + uint4 *psrc = (uint4*) hash; + outpHash[0] = psrc[0]; + outpHash[1] = psrc[1]; + outpHash[2] = psrc[2]; + outpHash[3] = psrc[3]; + } +} - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); +__host__ +void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks) +{ + const uint32_t threadsperblock = 256; + const uint32_t version = pdata[0] & (~POK_DATA_MASK); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); - MyStreamSynchronize(NULL, order, thr_id); + cudaMemcpyToSymbol(d_OriginalData, pdata, sizeof(d_OriginalData), 0, cudaMemcpyHostToDevice); + zr5_keccak512_gpu_hash_pok<<>>(threads, startNounce, d_hash, d_poks, version); + MyStreamSynchronize(NULL, 10, thr_id); } diff --git a/Makefile.am b/Makefile.am index 21641f9..46e7f3c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -41,7 +41,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \ quark/quarkcoin.cu quark/animecoin.cu \ quark/cuda_quark_compactionTest.cu \ - cuda_nist5.cu pentablake.cu \ + cuda_nist5.cu pentablake.cu zr5.cu \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ diff --git a/README.txt b/README.txt index 8d4e17a..eb23210 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 1.6.0-tpruvot (Mar 2015) - "Pluck & Whirlpoolx" +ccMiner release 1.6.0-tpruvot (Mar 2015) - "ZR5, Pluck & WhirlX" --------------------------------------------------------------- *************************************************************** @@ -37,8 +37,9 @@ BlakeCoin (256 8-rounds) Keccak (Maxcoin) Deep, Doom and Qubit Pentablake (Blake 512 x5) -S3 (OneCoin) -Lyra2RE (new VertCoin algo) +1Coin Triple S +Vertcoin Lyra2RE +Ziftrcoin (ZR5) where some of these coins have a VERY NOTABLE nVidia advantage over competing AMD (OpenCL Only) implementations. @@ -84,6 +85,7 @@ its command line interface and options. x14 use to mine X14Coin x15 use to mine Halcyon x17 use to mine X17 + zr5 use to mine ZiftrCoin -d, --devices gives a comma separated list of CUDA device IDs to operate on. Device IDs start counting from 0! @@ -183,12 +185,12 @@ features. >>> RELEASE HISTORY <<< - Mar. 2015 v1.6.0 (Note for CryptoMiningBlog: NOT YET RELEASED/FINISHED!) + Mar. 27th 2015 v1.6.0 + Add the ZR5 Algo for Ziftcoin Import pluck (djm34) and whirlpoolx (alexis78) algos Hashrate units based on hashing rate values (Hs/kHs/MHs/GHs) Default config file (also help to debug without command line) Various small fixes - More to come soon... Feb. 11th 2015 v1.5.3 Fix anime algo diff --git a/ccminer.cpp b/ccminer.cpp index 87c016d..e67e167 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -108,6 +108,7 @@ enum sha_algos { ALGO_X14, ALGO_X15, ALGO_X17, + ALGO_ZR5, }; static const char *algo_names[] = { @@ -140,6 +141,7 @@ static const char *algo_names[] = { "x14", "x15", "x17", + "zr5", }; bool opt_debug = false; @@ -166,7 +168,7 @@ static const bool opt_time = true; static enum sha_algos opt_algo = ALGO_X11; int opt_n_threads = 0; int opt_affinity = -1; -int opt_priority = 3; +int opt_priority = 0; static double opt_difficulty = 1; // CH bool opt_trust_pool = false; uint16_t opt_vote = 9999; @@ -193,6 +195,7 @@ int api_thr_id = -1; bool stratum_need_reset = false; struct work_restart *work_restart = NULL; struct stratum_ctx stratum = { 0 }; +uint32_t zr5_pok = 0; pthread_mutex_t applog_lock; static pthread_mutex_t stats_lock; @@ -254,6 +257,7 @@ Options:\n\ x17 X17 (peoplecurrency)\n\ whirl Whirlcoin (old whirlpool)\n\ whirlpoolx Vanilla coin\n\ + zr5 ZR5 (ZiftrCoin)\n\ -d, --devices Comma separated list of CUDA devices to use.\n\ Device IDs start counting from 0! Alternatively takes\n\ string names of your cards like gtx780ti or gt640#2\n\ @@ -472,6 +476,10 @@ static bool work_decode(const json_t *val, struct work *work) int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target); int i; + if (opt_algo == ALGO_ZR5) { + data_size = 80; adata_sz = 20; + } + if (unlikely(!jobj_binary(val, "data", work->data, data_size))) { applog(LOG_ERR, "JSON inval data"); return false; @@ -564,12 +572,12 @@ static int share_result(int result, const char *reason) if (reason) { applog(LOG_WARNING, "reject reason: %s", reason); - if (strncmp(reason, "low difficulty share", 20) == 0) { + if (strncasecmp(reason, "low difficulty", 14) == 0) { opt_difficulty = (opt_difficulty * 2.0) / 3.0; applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty); return 0; } - if (strncmp(reason, "Duplicate share", 15) == 0 && !check_dups) { + if (strncasecmp(reason, "duplicate", 9) == 0 && !check_dups) { applog(LOG_WARNING, "enabling duplicates check feature"); check_dups = true; } @@ -603,7 +611,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work) } } - if (stale_work) { + if (opt_algo == ALGO_ZR5 && !stale_work) { + stale_work = (memcmp(&work->data[1], &g_work.data[1], 68)); + } + + if (!submit_old && stale_work) { if (opt_debug) applog(LOG_WARNING, "stale work detected, discarding"); return true; @@ -616,9 +628,16 @@ static bool submit_upstream_work(CURL *curl, struct work *work) uint16_t nvote; char *ntimestr, *noncestr, *xnonce2str, *nvotestr; - le32enc(&ntime, work->data[17]); - le32enc(&nonce, work->data[19]); - + switch (opt_algo) { + case ALGO_ZR5: + check_dups = true; + be32enc(&ntime, work->data[17]); + be32enc(&nonce, work->data[19]); + break; + default: + le32enc(&ntime, work->data[17]); + le32enc(&nonce, work->data[19]); + } noncestr = bin2hex((const uchar*)(&nonce), 4); if (check_dups) @@ -666,14 +685,21 @@ static bool submit_upstream_work(CURL *curl, struct work *work) } else { + int data_size = sizeof(work->data); + int adata_sz = ARRAY_SIZE(work->data); + /* build hex string */ char *str = NULL; + if (opt_algo == ALGO_ZR5) { + data_size = 80; adata_sz = 20; + } + if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) { - for (int i = 0; i < ARRAY_SIZE(work->data); i++) + for (int i = 0; i < adata_sz; i++) le32enc(work->data + i, work->data[i]); } - str = bin2hex((uchar*)work->data, sizeof(work->data)); + str = bin2hex((uchar*)work->data, data_size); if (unlikely(!str)) { applog(LOG_ERR, "submit_upstream_work OOM"); return false; @@ -1098,10 +1124,18 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); work->data[17] = le32dec(sctx->job.ntime); work->data[18] = le32dec(sctx->job.nbits); - if (opt_algo == ALGO_MJOLLNIR || opt_algo == ALGO_HEAVY) - { + + switch (opt_algo) { + case ALGO_MJOLLNIR: + case ALGO_HEAVY: + // todo: check if 19 is enough for (i = 0; i < 20; i++) work->data[i] = be32dec((uint32_t *)&work->data[i]); + break; + case ALGO_ZR5: + for (i = 0; i < 19; i++) + work->data[i] = be32dec((uint32_t *)&work->data[i]); + break; } work->data[20] = 0x80000000; @@ -1227,6 +1261,7 @@ static void *miner_thread(void *userdata) // &work.data[19] int wcmplen = 76; + int wcmpoft = 0; uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); if (have_stratum) { @@ -1284,7 +1319,14 @@ static void *miner_thread(void *userdata) hashlog_purge_job(work.job_id); } } - if (memcmp(work.data, g_work.data, wcmplen)) { + + if (opt_algo == ALGO_ZR5) { + // ignore pok/version header + wcmpoft = 1; + wcmplen -= 4; + } + + if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) { #if 0 if (opt_debug) { for (int n=0; n <= (wcmplen-8); n+=8) { @@ -1497,19 +1539,24 @@ static void *miner_thread(void *userdata) case ALGO_X14: rc = scanhash_x14(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X15: rc = scanhash_x15(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X17: rc = scanhash_x17(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; + case ALGO_ZR5: { + rc = scanhash_zr5(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + } default: /* should never happen */ goto out; @@ -1606,6 +1653,11 @@ static void *miner_thread(void *userdata) if (rc > 1 && work.data[21]) { work.data[19] = work.data[21]; work.data[21] = 0; + if (opt_algo == ALGO_ZR5) { + // todo: use + 4..6 index for pok to allow multiple nonces + work.data[0] = work.data[22]; // pok + work.data[22] = 0; + } if (!submit_work(mythr, &work)) break; } @@ -1675,10 +1727,10 @@ start: submit_old = soval ? json_is_true(soval) : false; pthread_mutex_lock(&g_work_lock); if (work_decode(json_object_get(val, "result"), &g_work)) { + restart_threads(); if (!opt_quiet) applog(LOG_BLUE, "%s detected new block", short_url); g_work_time = time(NULL); - restart_threads(); } pthread_mutex_unlock(&g_work_lock); json_decref(val); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 0311a35..d917465 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -105,8 +105,9 @@ false 80 true - false - compute_30,sm_30;compute_50,sm_50 + true + compute_50,sm_50 + true @@ -176,8 +177,8 @@ false 80 true - false - compute_50,sm_50; + true + compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52 --ptxas-options="-O2" %(AdditionalOptions) @@ -221,7 +222,7 @@ 80 true false - compute_50,sm_50 + compute_50,sm_50;compute_52,sm_52 64 @@ -355,6 +356,7 @@ + true @@ -509,4 +511,4 @@ - \ No newline at end of file + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index e6173c2..376e847 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -379,9 +379,6 @@ Source Files\CUDA - - Source Files\CUDA - Source Files\CUDA @@ -571,6 +568,12 @@ Source Files\CUDA + + Source Files\CUDA + + + Source Files\CUDA\quark + diff --git a/configure.ac b/configure.ac index fe99b0c..fd49c12 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.6-git]) +AC_INIT([ccminer], [1.6]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpuminer-config.h b/cpuminer-config.h index 225d7d5..c3b94c4 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -63,7 +63,7 @@ #define HAVE_STRING_H 1 /* Define to 1 if you have the header file. */ -#define HAVE_SYSLOG_H 1 +/* #undef HAVE_SYSLOG_H */ /* Define to 1 if you have the header file. */ /* #undef HAVE_SYS_ENDIAN_H */ @@ -108,7 +108,7 @@ /* #undef LIBCURL_FEATURE_SSPI */ /* Defined if libcurl supports DICT */ -#define LIBCURL_PROTOCOL_DICT 1 +/* #undef LIBCURL_PROTOCOL_DICT */ /* Defined if libcurl supports FILE */ #define LIBCURL_PROTOCOL_FILE 1 @@ -123,28 +123,28 @@ #define LIBCURL_PROTOCOL_HTTP 1 /* Defined if libcurl supports HTTPS */ -#define LIBCURL_PROTOCOL_HTTPS 1 +/* #undef LIBCURL_PROTOCOL_HTTPS */ /* Defined if libcurl supports IMAP */ -#define LIBCURL_PROTOCOL_IMAP 1 +/* #undef LIBCURL_PROTOCOL_IMAP */ /* Defined if libcurl supports LDAP */ -#define LIBCURL_PROTOCOL_LDAP 1 +/* #undef LIBCURL_PROTOCOL_LDAP */ /* Defined if libcurl supports POP3 */ -#define LIBCURL_PROTOCOL_POP3 1 +/* #undef LIBCURL_PROTOCOL_POP3 */ /* Defined if libcurl supports RTSP */ -#define LIBCURL_PROTOCOL_RTSP 1 +/* #undef LIBCURL_PROTOCOL_RTSP */ /* Defined if libcurl supports SMTP */ -#define LIBCURL_PROTOCOL_SMTP 1 +/* #undef LIBCURL_PROTOCOL_SMTP */ /* Defined if libcurl supports TELNET */ -#define LIBCURL_PROTOCOL_TELNET 1 +/* #undef LIBCURL_PROTOCOL_TELNET */ /* Defined if libcurl supports TFTP */ -#define LIBCURL_PROTOCOL_TFTP 1 +/* #undef LIBCURL_PROTOCOL_TFTP */ /* Define to 1 if your C compiler doesn't accept -c and -o together. */ /* #undef NO_MINUS_C_MINUS_O */ @@ -159,16 +159,16 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 1.6-git" +#define PACKAGE_STRING "ccminer 1.6" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" /* Define to the home page for this package. */ -#define PACKAGE_URL "" +#define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.6-git" +#define PACKAGE_VERSION "1.6" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -191,10 +191,10 @@ #define USE_XOP 1 /* Version number of package */ -#define VERSION "1.6-git" +#define VERSION "1.6" /* Define curl_free() as free() if our version of curl lacks curl_free. */ /* #undef curl_free */ /* Define to `unsigned int' if does not define. */ -/* #undef size_t */ +//#define size_t unsigned int diff --git a/miner.h b/miner.h index 6483b15..bb660ed 100644 --- a/miner.h +++ b/miner.h @@ -378,6 +378,10 @@ extern int scanhash_whirlpoolx(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_zr5(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + /* api related */ void *api_thread(void *userdata); void api_set_throughput(int thr_id, uint32_t throughput); @@ -679,6 +683,7 @@ void x13hash(void *output, const void *input); void x14hash(void *output, const void *input); void x15hash(void *output, const void *input); void x17hash(void *output, const void *input); +void zr5hash(void *output, const void *input); #ifdef __cplusplus } diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index 6381af7..ba69214 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -52,11 +52,22 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u uint32_t hash[16]; from_bitslice_quad(state, hash); - if (thr == 0) - { + + // uint4 = 4x4 uint32_t = 16 bytes + if (thr == 0) { + uint4 *phash = (uint4*) hash; + uint4 *outpt = (uint4*) outpHash; /* var kept for hash align */ + outpt[0] = phash[0]; + outpt[1] = phash[1]; + outpt[2] = phash[2]; + outpt[3] = phash[3]; + } +/* + if (thr == 0) { #pragma unroll for(int k=0;k<16;k++) outpHash[k] = hash[k]; } +*/ } #endif } diff --git a/util.cpp b/util.cpp index 59f5e9d..b46adaa 100644 --- a/util.cpp +++ b/util.cpp @@ -1660,12 +1660,35 @@ extern void applog_hash(uchar *hash) #define printpfx(n,h) \ printf("%s%11s%s: %s\n", CL_GRN, n, CL_N, format_hash(s, h)) +static uint32_t zrtest[20] = { + swab32(0x01806486), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x2ab03251), + swab32(0x87d4f28b), + swab32(0x6e22f086), + swab32(0x4845ddd5), + swab32(0x0ac4e6aa), + swab32(0x22a1709f), + swab32(0xfb4275d9), + swab32(0x25f26636), + swab32(0x300eed54), + swab32(0xffff0f1e), + swab32(0x2a9e2300), +}; + void do_gpu_tests(void) { #ifdef _DEBUG unsigned long done; char s[128] = { '\0' }; - uchar buf[128]; + uchar buf[160]; uint32_t tgt[8] = { 0 }; opt_tracegpu = true; @@ -1674,11 +1697,15 @@ void do_gpu_tests(void) tgt[7] = 0xffff; memset(buf, 0, sizeof buf); - scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done); + //memcpy(buf, zrtest, 80); + scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done); + + //memset(buf, 0, sizeof buf); + //scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done); memset(buf, 0, sizeof buf); // buf[0] = 1; buf[64] = 2; // for endian tests - scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14); + //scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14); //memset(buf, 0, sizeof buf); //scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84 @@ -1688,6 +1715,7 @@ void do_gpu_tests(void) opt_tracegpu = false; #endif } +extern "C" void zr5hash_pok(void *output, uint32_t *pdata); void print_hash_tests(void) { @@ -1782,6 +1810,11 @@ void print_hash_tests(void) x17hash(&hash[0], &buf[0]); printpfx("X17", hash); + //memcpy(buf, zrtest, 80); + zr5hash(&hash[0], &buf[0]); + //zr5hash_pok(&hash[0], (uint32_t*) &buf[0]); + printpfx("ZR5", hash); + printf("\n"); do_gpu_tests(); diff --git a/zr5.cu b/zr5.cu new file mode 100644 index 0000000..4196cfb --- /dev/null +++ b/zr5.cu @@ -0,0 +1,342 @@ +/* Ziftrcoin ZR5 CUDA Implementation, (c) tpruvot 2015 */ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +} + +#include "miner.h" +#include "cuda_helper.h" + +#include +#include + +#define ZR_BLAKE 0 +#define ZR_GROESTL 1 +#define ZR_JH512 2 +#define ZR_SKEIN 3 + +#define POK_BOOL_MASK 0x00008000 +#define POK_DATA_MASK 0xFFFF0000 + +static uint32_t* d_hash[MAX_GPUS]; +static uint16_t* d_pokh[MAX_GPUS]; +static uint16_t* h_poks[MAX_GPUS]; + +static uint32_t* d_blake[MAX_GPUS]; +static uint32_t* d_groes[MAX_GPUS]; +static uint32_t* d_jh512[MAX_GPUS]; +static uint32_t* d_skein[MAX_GPUS]; + +__constant__ uint8_t d_permut[24][4]; +static const uint8_t permut[24][4] = { + {0, 1, 2, 3}, + {0, 1, 3, 2}, + {0, 2, 1, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + {1, 0, 2, 3}, + {1, 0, 3, 2}, + {1, 2, 0, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + {2, 0, 1, 3}, + {2, 0, 3, 1}, + {2, 1, 0, 3}, + {2, 1, 3, 0}, + {2, 3, 0, 1}, + {2, 3, 1, 0}, + {3, 0, 1, 2}, + {3, 0, 2, 1}, + {3, 1, 0, 2}, + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 2, 1, 0} +}; + +// CPU HASH +extern "C" void zr5hash(void *output, const void *input) +{ + sph_keccak512_context ctx_keccak; + sph_blake512_context ctx_blake; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_skein512_context ctx_skein; + + uchar _ALIGN(64) hash[64]; + uint32_t *phash = (uint32_t *) hash; + uint32_t norder; + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*) input, 80); + sph_keccak512_close(&ctx_keccak, (void*) phash); + + norder = phash[0] % ARRAY_SIZE(permut); /* % 24 */ + + for(int i = 0; i < 4; i++) + { + switch (permut[norder][i]) { + case ZR_BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, (const void*) phash, 64); + sph_blake512_close(&ctx_blake, phash); + break; + case ZR_GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*) phash, 64); + sph_groestl512_close(&ctx_groestl, phash); + break; + case ZR_JH512: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*) phash, 64); + sph_jh512_close(&ctx_jh, phash); + break; + case ZR_SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*) phash, 64); + sph_skein512_close(&ctx_skein, phash); + break; + default: + break; + } + } + memcpy(output, phash, 32); +} + +extern "C" void zr5hash_pok(void *output, uint32_t *pdata) +{ + const uint32_t version = pdata[0] & (~POK_DATA_MASK); + uint32_t _ALIGN(64) hash[8]; + + pdata[0] = version; + zr5hash(hash, pdata); + + // fill PoK + pdata[0] = version | (hash[0] & POK_DATA_MASK); + zr5hash(hash, pdata); + + memcpy(output, hash, 32); +} + +__global__ +void zr5_copy_round_data_gpu(uint32_t threads, uint32_t *d_hash, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, int rnd) +{ + // copy 64 bytes hash in the right algo buffer + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint64_t offset = thread * 64 / 4; + uint32_t *phash = &d_hash[offset]; + // algos hash order + uint32_t norder = phash[0] % ARRAY_SIZE(permut); + uint32_t algo = d_permut[norder][rnd]; + uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein }; + + if (rnd > 0) { + int algosrc = d_permut[norder][rnd - 1]; + phash = buffers[algosrc] + offset; + } + + // uint4 = 4x4 uint32_t = 16 bytes + uint4 *psrc = (uint4*) phash; + uint4 *pdst = (uint4*) (buffers[algo] + offset); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__host__ +void zr5_move_data_to_hash(int thr_id, uint32_t threads, int rnd) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + zr5_copy_round_data_gpu <<>> (threads, d_hash[thr_id], d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], rnd); +} + +__global__ +void zr5_final_round_data_gpu(uint32_t threads, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, uint32_t *d_hash, uint16_t *d_pokh) +{ + // after the 4 algos rounds, copy back hash to d_hash + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint64_t offset = thread * 16; // 64 / 4; + uint32_t *phash = &d_hash[offset]; + uint16_t norder = phash[0] % ARRAY_SIZE(permut); + uint16_t algosrc = d_permut[norder][3]; + + uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein }; + + // copy only hash[0] + hash[6..7] + uint2 *psrc = (uint2*) (buffers[algosrc] + offset); + uint2 *pdst = (uint2*) phash; + + pdst[0].x = psrc[0].x; + pdst[3] = psrc[3]; + + //phash[7] = *(buffers[algosrc] + offset + 7); + } +} + +__host__ +void zr5_final_round(int thr_id, uint32_t threads) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + zr5_final_round_data_gpu <<>> (threads, d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], d_hash[thr_id], d_pokh[thr_id]); +} + +extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads); +extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen); + +extern void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks); + +extern void quark_blake512_cpu_init(int thr_id, uint32_t threads); +extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) tmpdata[20]; + const uint32_t version = pdata[0] & (~POK_DATA_MASK); + const uint32_t first_nonce = pdata[19]; + uint32_t throughput = device_intensity(thr_id, __func__, 1U << 18); + throughput = min(throughput, (1U << 20)-1024); + throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + memcpy(tmpdata, pdata, 80); + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + // hash buffer = keccak hash 64 required + cudaMalloc(&d_hash[thr_id], 64 * throughput); + cudaMalloc(&d_pokh[thr_id], 2 * throughput); + + cudaMemcpyToSymbol(d_permut, permut, 24*4, 0, cudaMemcpyHostToDevice); + cudaMallocHost(&h_poks[thr_id], 2 * throughput); + + // data buffers for the 4 rounds + cudaMalloc(&d_blake[thr_id], 64 * throughput); + cudaMalloc(&d_groes[thr_id], 64 * throughput); + cudaMalloc(&d_jh512[thr_id], 64 * throughput); + cudaMalloc(&d_skein[thr_id], 64 * throughput); + + jackpot_keccak512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + init[thr_id] = true; + } + + tmpdata[0] = version; + jackpot_keccak512_cpu_setBlock((void*)tmpdata, 80); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // Keccak512 Hash with CUDA + zr5_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + + for (int rnd=0; rnd<4; rnd++) { + zr5_move_data_to_hash(thr_id, throughput, rnd); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++); + } + + // This generates all pok prefixes + zr5_final_round(thr_id, throughput); + + // Keccak512 pok + zr5_keccak512_cpu_hash_pok(thr_id, throughput, pdata[19], pdata, d_hash[thr_id], d_pokh[thr_id]); + + for (int rnd=0; rnd<4; rnd++) { + zr5_move_data_to_hash(thr_id, throughput, rnd); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++); + } + zr5_final_round(thr_id, throughput); + + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (foundNonce != UINT32_MAX) + { + uint32_t vhash64[8]; + uint32_t oldp0 = pdata[0]; + uint32_t oldp19 = pdata[19]; + uint32_t offset = foundNonce - pdata[19]; + uint32_t pok = 0; + + *hashes_done = pdata[19] - first_nonce + throughput; + + cudaMemcpy(h_poks[thr_id], d_pokh[thr_id], 2 * throughput, cudaMemcpyDeviceToHost); + pok = version | (0x10000UL * h_poks[thr_id][offset]); + pdata[0] = pok; pdata[19] = foundNonce; + zr5hash(vhash64, pdata); + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { + int res = 1; + uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (secNonce != 0) { + offset = secNonce - oldp19; + pok = version | (0x10000UL * h_poks[thr_id][offset]); + memcpy(tmpdata, pdata, 80); + tmpdata[0] = pok; tmpdata[19] = secNonce; + zr5hash(vhash64, tmpdata); + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { + pdata[21] = secNonce; + pdata[22] = pok; + res++; + } + } + return res; + } else { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + pdata[19]++; + pdata[0] = oldp0; + } + } else + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +}