From 22c28ccbefc80da2fb47ec005790df639f658d07 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 21 Apr 2015 17:43:12 +0200 Subject: [PATCH] scrypt: strip keccak/blake 256 remains --- scrypt-jane.cpp | 17 +- scrypt.cpp | 4 +- scrypt/keccak.cu | 367 +------------- scrypt/nv_kernel.cu | 781 ---------------------------- scrypt/nv_kernel.h | 6 - scrypt/nv_kernel2.cu | 1091 ---------------------------------------- scrypt/nv_kernel2.h | 6 - scrypt/salsa_kernel.cu | 38 -- scrypt/salsa_kernel.h | 28 -- 9 files changed, 15 insertions(+), 2323 deletions(-) diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp index c6cc79d..b435208 100644 --- a/scrypt-jane.cpp +++ b/scrypt-jane.cpp @@ -240,13 +240,12 @@ static void scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) * - mikaelh */ static void scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, - const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) + const uint8_t *salt, size_t salt_len, uint8_t *out, uint64_t bytes) { scrypt_hmac_state hmac_pw, hmac_pw_salt, work; scrypt_hash_digest ti, u; uint8_t be[4]; - uint32_t i, /*j,*/ blocks; -// uint64_t c; + uint32_t i, blocks; /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ @@ -266,7 +265,7 @@ static void scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, scrypt_hmac_finish(&work, ti); memcpy(u, ti, sizeof(u)); - memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes); + memcpy(out, ti, (size_t) (bytes > SCRYPT_HASH_DIGEST_SIZE ? SCRYPT_HASH_DIGEST_SIZE : bytes)); out += SCRYPT_HASH_DIGEST_SIZE; bytes -= SCRYPT_HASH_DIGEST_SIZE; } @@ -631,7 +630,7 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, const uchar*salt, size_t salt_len, uint32_t N, - uchar *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) + uchar *out, uint32_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) { uint32_t chunk_bytes, i; const uint32_t p = SCRYPT_P; @@ -650,7 +649,7 @@ static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, con scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N); /* 3: Out = PBKDF2(password, X) */ - scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes); + scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, (size_t) bytes); #ifdef SCRYPT_PREVENT_STATE_LEAK /* This is an unnecessary security feature - mikaelh */ @@ -661,7 +660,7 @@ static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, con /* for cpu hash test */ void scryptjane_hash(void* output, const void* input) { - uint64_t Nsize = 1ULL << (opt_nfactor + 1); + uint32_t Nsize = 1UL << (opt_nfactor + 1); uint64_t chunk_bytes; uint8_t *X, *Y; scrypt_aligned_alloc YX, V; @@ -670,12 +669,12 @@ void scryptjane_hash(void* output, const void* input) V = scrypt_alloc(Nsize * chunk_bytes); YX = scrypt_alloc((SCRYPT_P + 1) * chunk_bytes); - memset(V.ptr, 0, Nsize * chunk_bytes); + memset(V.ptr, 0, (size_t) (Nsize * chunk_bytes)); Y = YX.ptr; X = Y + chunk_bytes; - scrypt_jane_hash_1_1((uchar*)input, 80, (uchar*)input, 80, Nsize, (uchar*)output, 32, X, Y, V.ptr); + scrypt_jane_hash_1_1((uchar*)input, 80, (uchar*)input, 80, (uint32_t) Nsize, (uchar*)output, 32, X, Y, V.ptr); scrypt_free(&V); scrypt_free(&YX); diff --git a/scrypt.cpp b/scrypt.cpp index e9be83e..86313d5 100644 --- a/scrypt.cpp +++ 
b/scrypt.cpp @@ -994,12 +994,12 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C) */ static void scrypt_core(uint32_t *X, uint32_t *V, uint32_t N) { - for (int i = 0; i < N; i++) { + for (uint32_t i = 0; i < N; i++) { memcpy(&V[i * 32], X, 128); xor_salsa8(&X[0], &X[16]); xor_salsa8(&X[16], &X[0]); } - for (int i = 0; i < N; i++) { + for (uint32_t i = 0; i < N; i++) { uint32_t j = 32 * (X[16] & (N - 1)); for (uint8_t k = 0; k < 32; k++) X[k] ^= V[j + k]; diff --git a/scrypt/keccak.cu b/scrypt/keccak.cu index 018a969..142d624 100644 --- a/scrypt/keccak.cu +++ b/scrypt/keccak.cu @@ -4,21 +4,16 @@ // The keccak512 (SHA-3) is used in the PBKDF2 for scrypt-jane coins // in place of the SHA2 based PBKDF2 used in scrypt coins. // -// The keccak256 is used exclusively in Maxcoin and clones. This module -// holds the generic "default" implementation when no architecture -// specific implementation is available in the kernel. -// -// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64 +// NOTE: compile this .cu module for compute_20,sm_20 with --maxrregcount=64 // #include -#include -#include "salsa_kernel.h" -#include "cuda_runtime.h" #include "miner.h" +#include "cuda_helper.h" #include "keccak.h" +#include "salsa_kernel.h" // define some error checking macros #undef checkCudaErrors @@ -45,7 +40,9 @@ extern std::map context_odata[2]; extern std::map context_streams[2]; extern std::map context_hash[2]; +#ifndef ROTL64 #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) +#endif // CB #define U32TO64_LE(p) \ @@ -375,11 +372,6 @@ __device__ void pbkdf2_statecopy8(pbkdf2_hmac_state *d, pbkdf2_hmac_state *s) { // ---------------------------- END PBKDF2 functions ------------------------------------ -static __device__ uint32_t cuda_swab32(uint32_t x) { - return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) - | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); -} - __global__ __launch_bounds__(128) void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce) { @@ -486,352 +478,3 @@ extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throu cuda_post_keccak512<<>>(context_odata[stream][thr_id], context_hash[stream][thr_id], nonce); } - - -// -// Maxcoin related Keccak implementation (Keccak256) -// - -#include - -#include -extern std::map context_blocks; -extern std::map context_wpb; -extern std::map context_kernel; - -__constant__ uint64_t ptarget64[4]; - -#define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64)))) -#define ROL_mult8(a, offset) ROL(a, offset) - -__constant__ uint64_t KeccakF_RoundConstants[24]; - -static uint64_t host_KeccakF_RoundConstants[24] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - 
(uint64_t)0x8000000080008008ULL -}; - -__constant__ uint64_t pdata64[10]; - -__global__ -void crypto_hash(uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate) -{ - uint64_t Aba, Abe, Abi, Abo, Abu; - uint64_t Aga, Age, Agi, Ago, Agu; - uint64_t Aka, Ake, Aki, Ako, Aku; - uint64_t Ama, Ame, Ami, Amo, Amu; - uint64_t Asa, Ase, Asi, Aso, Asu; - uint64_t BCa, BCe, BCi, BCo, BCu; - uint64_t Da, De, Di, Do, Du; - uint64_t Eba, Ebe, Ebi, Ebo, Ebu; - uint64_t Ega, Ege, Egi, Ego, Egu; - uint64_t Eka, Eke, Eki, Eko, Eku; - uint64_t Ema, Eme, Emi, Emo, Emu; - uint64_t Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = pdata64[0]; - Abe = pdata64[1]; - Abi = pdata64[2]; - Abo = pdata64[3]; - Abu = pdata64[4]; - Aga = pdata64[5]; - Age = pdata64[6]; - Agi = pdata64[7]; - Ago = pdata64[8]; - Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32); - Aka = 0x0000000000000001ULL; - Ake = 0; - Aki = 0; - Ako = 0; - Aku = 0; - Ama = 0; - Ame = 0x8000000000000000ULL; - Ami = 0; - Amo = 0; - Amu = 0; - Asa = 0; - Ase = 0; - Asi = 0; - Aso = 0; - Asu = 0; - -#pragma unroll 12 - for( int laneCount = 0; laneCount < 24; laneCount += 2 ) - { - // prepareTheta - BCa = Aba^Aga^Aka^Ama^Asa; - BCe = Abe^Age^Ake^Ame^Ase; - BCi = Abi^Agi^Aki^Ami^Asi; - BCo = Abo^Ago^Ako^Amo^Aso; - BCu = Abu^Agu^Aku^Amu^Asu; - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Aba ^= Da; - BCa = Aba; - Age ^= De; - BCe = ROL(Age, 44); - Aki ^= Di; - BCi = ROL(Aki, 43); - Amo ^= Do; - BCo = ROL(Amo, 21); - Asu ^= Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount]; - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo ^= Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka ^= Da; - BCi = ROL(Aka, 3); - Ame ^= De; - BCo = ROL(Ame, 45); - Asi ^= Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe ^= De; - BCa = ROL(Abe, 1); - Agi ^= Di; - BCe = ROL(Agi, 6); - Ako ^= Do; - BCi = ROL(Ako, 25); - Amu ^= Du; - BCo = ROL_mult8(Amu, 8); - Asa ^= Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu ^= Du; - BCa = ROL(Abu, 27); - Aga ^= Da; - BCe = ROL(Aga, 36); - Ake ^= De; - BCi = ROL(Ake, 10); - Ami ^= Di; - BCo = ROL(Ami, 15); - Aso ^= Do; - BCu = ROL_mult8(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi ^= Di; - BCa = ROL(Abi, 62); - Ago ^= Do; - BCe = ROL(Ago, 55); - Aku ^= Du; - BCi = ROL(Aku, 39); - Ama ^= Da; - BCo = ROL(Ama, 41); - Ase ^= De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = 
BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1]; - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - Abu = BCu ^((~BCa)& BCe ); - - Ebo ^= Do; - BCa = ROL(Ebo, 28); - Egu ^= Du; - BCe = ROL(Egu, 20); - Eka ^= Da; - BCi = ROL(Eka, 3); - Eme ^= De; - BCo = ROL(Eme, 45); - Esi ^= Di; - BCu = ROL(Esi, 61); - Aga = BCa ^((~BCe)& BCi ); - Age = BCe ^((~BCi)& BCo ); - Agi = BCi ^((~BCo)& BCu ); - Ago = BCo ^((~BCu)& BCa ); - Agu = BCu ^((~BCa)& BCe ); - - Ebe ^= De; - BCa = ROL(Ebe, 1); - Egi ^= Di; - BCe = ROL(Egi, 6); - Eko ^= Do; - BCi = ROL(Eko, 25); - Emu ^= Du; - BCo = ROL_mult8(Emu, 8); - Esa ^= Da; - BCu = ROL(Esa, 18); - Aka = BCa ^((~BCe)& BCi ); - Ake = BCe ^((~BCi)& BCo ); - Aki = BCi ^((~BCo)& BCu ); - Ako = BCo ^((~BCu)& BCa ); - Aku = BCu ^((~BCa)& BCe ); - - Ebu ^= Du; - BCa = ROL(Ebu, 27); - Ega ^= Da; - BCe = ROL(Ega, 36); - Eke ^= De; - BCi = ROL(Eke, 10); - Emi ^= Di; - BCo = ROL(Emi, 15); - Eso ^= Do; - BCu = ROL_mult8(Eso, 56); - Ama = BCa ^((~BCe)& BCi ); - Ame = BCe ^((~BCi)& BCo ); - Ami = BCi ^((~BCo)& BCu ); - Amo = BCo ^((~BCu)& BCa ); - Amu = BCu ^((~BCa)& BCe ); - - Ebi ^= Di; - BCa = ROL(Ebi, 62); - Ego ^= Do; - BCe = ROL(Ego, 55); - Eku ^= Du; - BCi = ROL(Eku, 39); - Ema ^= Da; - BCo = ROL(Ema, 41); - Ese ^= De; - BCu = ROL(Ese, 2); - Asa = BCa ^((~BCe)& BCi ); - Ase = BCe ^((~BCi)& BCo ); - Asi = BCi ^((~BCo)& BCu ); - Aso = BCo ^((~BCu)& BCa ); - Asu = BCu ^((~BCa)& BCe ); - } - - if (validate) { - g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); - g_out[3] = Abo; - g_out[2] = Abi; - g_out[1] = Abe; - g_out[0] = Aba; - } - - // the likelyhood of meeting the hashing target is so low, that we're not guarding this - // with atomic writes, locks or similar... 
- uint64_t *g_good64 = (uint64_t*)g_good; - if (Abo <= ptarget64[3]) { - if (Abo < g_good64[3]) { - g_good64[3] = Abo; - g_good64[2] = Abi; - g_good64[1] = Abe; - g_good64[0] = Aba; - g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); - } - } -} - -static std::map context_good[2]; - -bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) -{ - static bool init[MAX_GPUS] = { 0 }; - - if (!init[thr_id]) - { - checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice)); - - // allocate pinned host memory for good hashes - uint32_t *tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; - - init[thr_id] = true; - } - checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - - return context_good[0][thr_id] && context_good[1][thr_id]; -} - -void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) -{ - checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); - - crypto_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); - - // copy hashes from device memory to host (ALL hashes, lots of data...) - if (do_d2h && hash != NULL) { - size_t mem_size = throughput * sizeof(uint32_t) * 8; - checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } - else if (hash != NULL) { - // asynchronous copy of winning nonce (just 4 bytes...) 
- checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } -} diff --git a/scrypt/nv_kernel.cu b/scrypt/nv_kernel.cu index b84a557..9a79eb9 100644 --- a/scrypt/nv_kernel.cu +++ b/scrypt/nv_kernel.cu @@ -708,784 +708,3 @@ void nv_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned i __transposed_write_BC(B, C, (uint4*)(g_odata), 1); } - - - -// -// Maxcoin related Keccak implementation (Keccak256) -// - -// from salsa_kernel.cu -extern std::map context_blocks; -extern std::map context_wpb; -extern std::map context_kernel; -extern std::map context_streams[2]; -extern std::map context_hash[2]; - -__constant__ uint64_t ptarget64[4]; - -#define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64)))) -#define ROL_mult8(a, offset) ROL(a, offset) - -__constant__ uint64_t KeccakF_RoundConstants[24]; -static uint64_t host_KeccakF_RoundConstants[24] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -__constant__ uint64_t pdata64[10]; - -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) - | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); -} - -__global__ -void kepler_crypto_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate ) -{ - uint64_t Aba, Abe, Abi, Abo, Abu; - uint64_t Aga, Age, Agi, Ago, Agu; - uint64_t Aka, Ake, Aki, Ako, Aku; - uint64_t Ama, Ame, Ami, Amo, Amu; - uint64_t Asa, Ase, Asi, Aso, Asu; - uint64_t BCa, BCe, BCi, BCo, BCu; - uint64_t Da, De, Di, Do, Du; - uint64_t Eba, Ebe, Ebi, Ebo, Ebu; - uint64_t Ega, Ege, Egi, Ego, Egu; - uint64_t Eka, Eke, Eki, Eko, Eku; - uint64_t Ema, Eme, Emi, Emo, Emu; - uint64_t Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = pdata64[0]; - Abe = pdata64[1]; - Abi = pdata64[2]; - Abo = pdata64[3]; - Abu = pdata64[4]; - Aga = pdata64[5]; - Age = pdata64[6]; - Agi = pdata64[7]; - Ago = pdata64[8]; - Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32); - Aka = 0x0000000000000001ULL; - Ake = 0; - Aki = 0; - Ako = 0; - Aku = 0; - Ama = 0; - Ame = 0x8000000000000000ULL; - Ami = 0; - Amo = 0; - Amu = 0; - Asa = 0; - Ase = 0; - Asi = 0; - Aso = 0; - Asu = 0; - -#pragma unroll 12 - for( int laneCount = 0; laneCount < 24; laneCount += 2 ) - { - // prepareTheta - BCa = Aba^Aga^Aka^Ama^Asa; - BCe = Abe^Age^Ake^Ame^Ase; - BCi = Abi^Agi^Aki^Ami^Asi; - BCo = Abo^Ago^Ako^Amo^Aso; - BCu = Abu^Agu^Aku^Amu^Asu; - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = 
BCo^ROL(BCa, 1); - - Aba ^= Da; - BCa = Aba; - Age ^= De; - BCe = ROL(Age, 44); - Aki ^= Di; - BCi = ROL(Aki, 43); - Amo ^= Do; - BCo = ROL(Amo, 21); - Asu ^= Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount]; - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo ^= Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka ^= Da; - BCi = ROL(Aka, 3); - Ame ^= De; - BCo = ROL(Ame, 45); - Asi ^= Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe ^= De; - BCa = ROL(Abe, 1); - Agi ^= Di; - BCe = ROL(Agi, 6); - Ako ^= Do; - BCi = ROL(Ako, 25); - Amu ^= Du; - BCo = ROL_mult8(Amu, 8); - Asa ^= Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu ^= Du; - BCa = ROL(Abu, 27); - Aga ^= Da; - BCe = ROL(Aga, 36); - Ake ^= De; - BCi = ROL(Ake, 10); - Ami ^= Di; - BCo = ROL(Ami, 15); - Aso ^= Do; - BCu = ROL_mult8(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi ^= Di; - BCa = ROL(Abi, 62); - Ago ^= Do; - BCe = ROL(Ago, 55); - Aku ^= Du; - BCi = ROL(Aku, 39); - Ama ^= Da; - BCo = ROL(Ama, 41); - Ase ^= De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1]; - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - Abu = BCu ^((~BCa)& BCe ); - - Ebo ^= Do; - BCa = ROL(Ebo, 28); - Egu ^= Du; - BCe = ROL(Egu, 20); - Eka ^= Da; - BCi = ROL(Eka, 3); - Eme ^= De; - BCo = ROL(Eme, 45); - Esi ^= Di; - BCu = ROL(Esi, 61); - Aga = BCa ^((~BCe)& BCi ); - Age = BCe ^((~BCi)& BCo ); - Agi = BCi ^((~BCo)& BCu ); - Ago = BCo ^((~BCu)& BCa ); - Agu = BCu ^((~BCa)& BCe ); - - Ebe ^= De; - BCa = ROL(Ebe, 1); - Egi ^= Di; - BCe = ROL(Egi, 6); - Eko ^= Do; - BCi = ROL(Eko, 25); - Emu ^= Du; - BCo = ROL_mult8(Emu, 8); - Esa ^= Da; - BCu = ROL(Esa, 18); - Aka = BCa ^((~BCe)& BCi ); - Ake = BCe ^((~BCi)& BCo ); - Aki = BCi ^((~BCo)& BCu ); - Ako = BCo ^((~BCu)& BCa ); - Aku = BCu ^((~BCa)& BCe ); - - Ebu ^= Du; - BCa = ROL(Ebu, 27); - Ega ^= Da; - BCe = ROL(Ega, 36); - Eke ^= De; - BCi = ROL(Eke, 10); - Emi ^= Di; - BCo = ROL(Emi, 15); - Eso ^= Do; - BCu = ROL_mult8(Eso, 56); - Ama = BCa ^((~BCe)& BCi ); - Ame = BCe ^((~BCi)& BCo ); - Ami = BCi ^((~BCo)& BCu ); - Amo = BCo ^((~BCu)& BCa ); - Amu = BCu ^((~BCa)& BCe ); - - Ebi ^= Di; - BCa = ROL(Ebi, 62); - Ego ^= Do; - BCe = ROL(Ego, 55); - Eku ^= Du; - BCi = ROL(Eku, 39); - Ema ^= Da; - BCo = ROL(Ema, 41); - Ese ^= De; - BCu = ROL(Ese, 2); - Asa = BCa 
^((~BCe)& BCi ); - Ase = BCe ^((~BCi)& BCo ); - Asi = BCi ^((~BCo)& BCu ); - Aso = BCo ^((~BCu)& BCa ); - Asu = BCu ^((~BCa)& BCe ); - } - - if (validate) { - g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); - g_out[3] = Abo; - g_out[2] = Abi; - g_out[1] = Abe; - g_out[0] = Aba; - } - - // the likelyhood of meeting the hashing target is so low, that we're not guarding this - // with atomic writes, locks or similar... - uint64_t *g_good64 = (uint64_t*)g_good; - if (Abo <= ptarget64[3]) { - if (Abo < g_good64[3]) { - g_good64[3] = Abo; - g_good64[2] = Abi; - g_good64[1] = Abe; - g_good64[0] = Aba; - g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); - } - } -} - -static std::map context_good[2]; - -bool NVKernel::prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) -{ - static bool init[MAX_GPUS] = { 0 }; - - if (!init[thr_id]) - { - checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice)); - - // allocate pinned host memory for good hashes - uint32_t *tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; - - init[thr_id] = true; - } - checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - - return context_good[0][thr_id] && context_good[1][thr_id]; -} - -void NVKernel::do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) -{ - checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); - - kepler_crypto_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); - - // copy hashes from device memory to host (ALL hashes, lots of data...) - if (do_d2h && hash != NULL) { - size_t mem_size = throughput * sizeof(uint32_t) * 8; - checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } - else if (hash != NULL) { - // asynchronous copy of winning nonce (just 4 bytes...) - checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } -} - - -// -// Blakecoin related Keccak implementation (Keccak256) -// - -typedef uint32_t sph_u32; -#define SPH_C32(x) ((sph_u32)(x)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define SPH_ROTL32(a, b) ((a)<<(b))|((a)>>(32-(b))) -#else - // Kepler (Compute 3.5) - #define SPH_ROTL32(a, b) __funnelshift_l( a, a, b ); -#endif -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -__constant__ uint32_t pdata[20]; - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -static __device__ sph_u32 cuda_sph_bswap32(sph_u32 x) -{ - return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) - | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static __device__ void -cuda_sph_enc32be(void *dst, sph_u32 val) -{ - *(sph_u32 *)dst = cuda_sph_bswap32(val); -} - -#define Z00 0 -#define Z01 1 -#define Z02 2 -#define Z03 3 -#define Z04 4 -#define Z05 5 -#define Z06 6 -#define Z07 7 -#define Z08 8 -#define Z09 9 -#define Z0A A -#define Z0B B -#define Z0C C -#define Z0D D -#define Z0E E -#define Z0F F - -#define Z10 E -#define Z11 A -#define Z12 4 -#define Z13 8 -#define Z14 9 -#define Z15 F -#define Z16 D -#define Z17 6 -#define Z18 1 -#define Z19 C -#define Z1A 0 -#define Z1B 2 -#define Z1C B -#define Z1D 7 -#define Z1E 5 -#define Z1F 3 - -#define Z20 B -#define Z21 8 -#define Z22 C -#define Z23 0 -#define Z24 5 -#define Z25 2 -#define Z26 F -#define Z27 D -#define Z28 A -#define Z29 E -#define Z2A 3 -#define Z2B 6 -#define Z2C 7 -#define Z2D 1 -#define Z2E 9 -#define Z2F 4 - -#define Z30 7 -#define Z31 9 -#define Z32 3 -#define Z33 1 -#define Z34 D -#define Z35 C -#define Z36 B -#define Z37 E -#define Z38 2 -#define Z39 6 -#define Z3A 5 -#define Z3B A -#define Z3C 4 -#define Z3D 0 -#define Z3E F -#define Z3F 8 - -#define Z40 9 -#define Z41 0 -#define Z42 5 -#define Z43 7 -#define Z44 2 -#define Z45 4 -#define Z46 A -#define Z47 F -#define Z48 E -#define Z49 1 -#define Z4A B -#define Z4B C -#define Z4C 6 -#define Z4D 8 -#define Z4E 3 -#define Z4F D - -#define Z50 2 -#define Z51 C -#define Z52 6 -#define Z53 A -#define Z54 0 -#define Z55 B -#define Z56 8 -#define Z57 3 -#define Z58 4 -#define Z59 D -#define Z5A 7 -#define Z5B 5 -#define Z5C F -#define Z5D E -#define Z5E 1 -#define Z5F 9 - -#define Z60 C -#define Z61 5 -#define Z62 1 -#define Z63 F -#define Z64 E -#define Z65 D -#define Z66 4 -#define Z67 A -#define Z68 0 -#define Z69 7 -#define Z6A 6 -#define Z6B 3 -#define Z6C 9 -#define Z6D 2 -#define Z6E 8 -#define Z6F B - -#define Z70 D -#define Z71 B -#define Z72 7 -#define Z73 E -#define Z74 C -#define Z75 1 -#define Z76 3 -#define Z77 9 -#define Z78 5 -#define Z79 0 -#define Z7A F -#define Z7B 4 -#define Z7C 8 -#define Z7D 6 -#define Z7E 2 -#define Z7F A - -#define Z80 6 -#define Z81 F -#define Z82 E -#define Z83 9 -#define Z84 B -#define Z85 3 -#define Z86 0 -#define Z87 8 -#define Z88 C -#define Z89 2 -#define Z8A D -#define Z8B 7 -#define Z8C 1 -#define Z8D 4 -#define Z8E A -#define Z8F 5 - -#define Z90 A -#define Z91 2 -#define Z92 8 -#define Z93 4 -#define Z94 7 -#define Z95 6 -#define Z96 1 -#define Z97 5 -#define Z98 F -#define Z99 B -#define Z9A 9 -#define Z9B E -#define Z9C 3 -#define Z9D C -#define Z9E D -#define Z9F 0 - -#define Mx(r, i) Mx_(Z ## r ## i) -#define Mx_(n) Mx__(n) -#define Mx__(n) M ## n - -#define CSx(r, i) CSx_(Z ## r ## i) -#define CSx_(n) CSx__(n) -#define CSx__(n) CS ## n - -#define CS0 SPH_C32(0x243F6A88) -#define CS1 SPH_C32(0x85A308D3) -#define CS2 SPH_C32(0x13198A2E) -#define CS3 SPH_C32(0x03707344) -#define CS4 SPH_C32(0xA4093822) -#define CS5 SPH_C32(0x299F31D0) -#define CS6 SPH_C32(0x082EFA98) -#define CS7 SPH_C32(0xEC4E6C89) -#define CS8 SPH_C32(0x452821E6) -#define CS9 SPH_C32(0x38D01377) -#define CSA SPH_C32(0xBE5466CF) -#define CSB SPH_C32(0x34E90C6C) -#define CSC SPH_C32(0xC0AC29B7) -#define CSD SPH_C32(0xC97C50DD) -#define CSE SPH_C32(0x3F84D5B5) -#define CSF SPH_C32(0xB5470917) - -#define GS(m0, m1, c0, c1, a, b, c, d) do { \ - a = SPH_T32(a + b + (m0 ^ c1)); \ - d = SPH_ROTR32(d ^ a, 16); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 12); \ - a = SPH_T32(a + b + (m1 ^ c0)); \ - d = 
SPH_ROTR32(d ^ a, 8); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 7); \ - } while (0) - -#define ROUND_S(r) do { \ - GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ - GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ - GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ - GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ - GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ - GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ - GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ - GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ - } while (0) - -#define COMPRESS32 do { \ - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ - sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ - sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = S0 ^ CS0; \ - V9 = S1 ^ CS1; \ - VA = S2 ^ CS2; \ - VB = S3 ^ CS3; \ - VC = T0 ^ CS4; \ - VD = T0 ^ CS5; \ - VE = T1 ^ CS6; \ - VF = T1 ^ CS7; \ - M0 = input[0]; \ - M1 = input[1]; \ - M2 = input[2]; \ - M3 = input[3]; \ - M4 = input[4]; \ - M5 = input[5]; \ - M6 = input[6]; \ - M7 = input[7]; \ - M8 = input[8]; \ - M9 = input[9]; \ - MA = input[10]; \ - MB = input[11]; \ - MC = input[12]; \ - MD = input[13]; \ - ME = input[14]; \ - MF = input[15]; \ - ROUND_S(0); \ - ROUND_S(1); \ - ROUND_S(2); \ - ROUND_S(3); \ - ROUND_S(4); \ - ROUND_S(5); \ - ROUND_S(6); \ - ROUND_S(7); \ - H0 ^= S0 ^ V0 ^ V8; \ - H1 ^= S1 ^ V1 ^ V9; \ - H2 ^= S2 ^ V2 ^ VA; \ - H3 ^= S3 ^ V3 ^ VB; \ - H4 ^= S0 ^ V4 ^ VC; \ - H5 ^= S1 ^ V5 ^ VD; \ - H6 ^= S2 ^ V6 ^ VE; \ - H7 ^= S3 ^ V7 ^ VF; \ - } while (0) - - -__global__ -void kepler_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate) -{ - uint32_t input[16]; - uint64_t output[4]; - - #pragma unroll - for (int i=0; i < 16; ++i) input[i] = pdata[i]; - - sph_u32 H0 = 0x6A09E667; - sph_u32 H1 = 0xBB67AE85; - sph_u32 H2 = 0x3C6EF372; - sph_u32 H3 = 0xA54FF53A; - sph_u32 H4 = 0x510E527F; - sph_u32 H5 = 0x9B05688C; - sph_u32 H6 = 0x1F83D9AB; - sph_u32 H7 = 0x5BE0CD19; - sph_u32 S0 = 0; - sph_u32 S1 = 0; - sph_u32 S2 = 0; - sph_u32 S3 = 0; - sph_u32 T0 = 0; - sph_u32 T1 = 0; - T0 = SPH_T32(T0 + 512); - COMPRESS32; - - #pragma unroll - for (int i=0; i < 3; ++i) input[i] = pdata[16+i]; - - input[3] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); - input[4] = 0x80000000; - - #pragma unroll 8 - for (int i=5; i < 13; ++i) input[i] = 0; - - input[13] = 0x00000001; - input[14] = T1; - input[15] = T0 + 128; - - T0 = SPH_T32(T0 + 128); - COMPRESS32; - - cuda_sph_enc32be((unsigned char*)output + 4*6, H6); - cuda_sph_enc32be((unsigned char*)output + 4*7, H7); - if (validate || output[3] <= ptarget64[3]) - { - // this data is only needed when we actually need to save the hashes - cuda_sph_enc32be((unsigned char*)output + 4*0, H0); - cuda_sph_enc32be((unsigned char*)output + 4*1, H1); - cuda_sph_enc32be((unsigned char*)output + 4*2, H2); - cuda_sph_enc32be((unsigned char*)output + 4*3, H3); - cuda_sph_enc32be((unsigned char*)output + 4*4, H4); - cuda_sph_enc32be((unsigned char*)output + 4*5, H5); - } - - if (validate) - { - g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); - #pragma unroll - for (int i=0; i < 4; ++i) g_out[i] = output[i]; - } - - if (output[3] <= ptarget64[3]) { - uint64_t *g_good64 = (uint64_t*)g_good; - if (output[3] < g_good64[3]) { - g_good64[3] = output[3]; - 
g_good64[2] = output[2]; - g_good64[1] = output[1]; - g_good64[0] = output[0]; - g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); - } - } -} - -bool NVKernel::prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) -{ - static bool init[MAX_GPUS] = { 0 }; - - if (!init[thr_id]) - { - // allocate pinned host memory for good hashes - uint32_t *tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; - - init[thr_id] = true; - } - checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - - return context_good[0][thr_id] && context_good[1][thr_id]; -} - -void NVKernel::do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) -{ - checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); - - kepler_blake256_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); - - // copy hashes from device memory to host (ALL hashes, lots of data...) - if (do_d2h && hash != NULL) { - size_t mem_size = throughput * sizeof(uint32_t) * 8; - checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } - else if (hash != NULL) { - // asynchronous copy of winning nonce (just 4 bytes...) - checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } -} diff --git a/scrypt/nv_kernel.h b/scrypt/nv_kernel.h index e45ed9b..fa839cf 100644 --- a/scrypt/nv_kernel.h +++ b/scrypt/nv_kernel.h @@ -25,12 +25,6 @@ public: virtual bool support_lookup_gap() { return true; } virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; } virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } - - virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); - virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false); - - virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]); - virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false); }; #endif // #ifndef NV_KERNEL_H diff --git a/scrypt/nv_kernel2.cu b/scrypt/nv_kernel2.cu index f832dbf..c581eda 100644 --- a/scrypt/nv_kernel2.cu +++ b/scrypt/nv_kernel2.cu @@ -633,1094 +633,3 @@ template __global__ void nv2_scrypt_core_kernelB_LG(uint32_t *g_odata __transposed_write_BC(B, C, (uint4*)(g_odata), 1); } - - -// -// Maxcoin related Keccak implementation (Keccak256) -// - -// from salsa_kernel.cu -extern std::map context_blocks; -extern std::map context_wpb; -extern std::map context_kernel; -extern std::map context_streams[2]; -extern std::map context_hash[2]; - -__constant__ uint64_t ptarget64[4]; - -// ROL macro replaced with the inline assembly code below to work around a performance issue -//#define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64)))) 
-__inline__ __device__ uint2 ROL(const uint2 a, const int offset) { - uint2 result; - if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); - } - return result; -} -#define ROL_mult8(a, offset) ROL(a, offset) - -__inline__ __device__ uint64_t devectorize(uint2 v) { return __double_as_longlong(__hiloint2double(v.y, v.x)); } -__inline__ __device__ uint2 vectorize(uint64_t v) { return make_uint2(__double2loint(__longlong_as_double(v)), __double2hiint(__longlong_as_double(v))); } -__inline__ __device__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } -__inline__ __device__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } -__inline__ __device__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } -__inline__ __device__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } -__inline__ __device__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } - -__constant__ uint64_t KeccakF_RoundConstants[24]; - -static uint64_t host_KeccakF_RoundConstants[24] = -{ - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -__constant__ uint64_t pdata64[10]; - -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) - | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); -} - -// in this implementation the first and last iteration of the for() loop were explicitly -// unrolled and redundant operations were removed (e.g. 
operations on zero inputs, and -// computation of unnecessary outputs) -__global__ void titan_crypto_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate ) -{ - uint2 Aba, Abe, Abi, Abo, Abu; - uint2 Aga, Age, Agi, Ago, Agu; - uint2 Aka, Ake, Aki, Ako, Aku; - uint2 Ama, Ame, Ami, Amo, Amu; - uint2 Asa, Ase, Asi, Aso, Asu; - uint2 BCa, BCe, BCi, BCo, BCu; - uint2 Da, De, Di, Do, Du; - uint2 Eba, Ebe, Ebi, Ebo, Ebu; - uint2 Ega, Ege, Egi, Ego, Egu; - uint2 Eka, Eke, Eki, Eko, Eku; - uint2 Ema, Eme, Emi, Emo, Emu; - uint2 Esa, Ese, Esi, Eso, Esu; - - // embed unique nonce into source data stream in pdata[] - Agu = vectorize((pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32)); - - // prepareTheta - BCa = vectorize(pdata64[0]^pdata64[5]^0x0000000000000001ULL); - BCe = vectorize(pdata64[1]^pdata64[6]^0x8000000000000000ULL); - BCi = vectorize(pdata64[2]^pdata64[7]); - BCo = vectorize(pdata64[3]^pdata64[8]); - BCu = vectorize(pdata64[4])^Agu; - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Aba = vectorize(pdata64[0]) ^ Da; - BCa = Aba; - Age = vectorize(pdata64[6]) ^ De; - BCe = ROL(Age, 44); - Aki = Di; - BCi = ROL(Aki, 43); - Amo = Do; - BCo = ROL(Amo, 21); - Asu = Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= vectorize((uint64_t)KeccakF_RoundConstants[0]); - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo = vectorize(pdata64[3]) ^ Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka = vectorize(0x0000000000000001ULL) ^ Da; - BCi = ROL(Aka, 3); - Ame = vectorize(0x8000000000000000ULL) ^ De; - BCo = ROL(Ame, 45); - Asi = Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe = vectorize(pdata64[1]) ^ De; - BCa = ROL(Abe, 1); - Agi = vectorize(pdata64[7]) ^ Di; - BCe = ROL(Agi, 6); - Ako = Do; - BCi = ROL(Ako, 25); - Amu = Du; - BCo = ROL(Amu, 8); - Asa = Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu = vectorize(pdata64[4]) ^ Du; - BCa = ROL(Abu, 27); - Aga = vectorize(pdata64[5]) ^ Da; - BCe = ROL(Aga, 36); - Ake = De; - BCi = ROL(Ake, 10); - Ami = Di; - BCo = ROL(Ami, 15); - Aso = Do; - BCu = ROL(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi = vectorize(pdata64[2]) ^ Di; - BCa = ROL(Abi, 62); - Ago = vectorize(pdata64[8]) ^ Do; - BCe = ROL(Ago, 55); - Aku = Du; - BCi = ROL(Aku, 39); - Ama = Da; - BCo = ROL(Ama, 41); - Ase = De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 
43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= vectorize((uint64_t)KeccakF_RoundConstants[1]); - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - Abu = BCu ^((~BCa)& BCe ); - - Ebo ^= Do; - BCa = ROL(Ebo, 28); - Egu ^= Du; - BCe = ROL(Egu, 20); - Eka ^= Da; - BCi = ROL(Eka, 3); - Eme ^= De; - BCo = ROL(Eme, 45); - Esi ^= Di; - BCu = ROL(Esi, 61); - Aga = BCa ^((~BCe)& BCi ); - Age = BCe ^((~BCi)& BCo ); - Agi = BCi ^((~BCo)& BCu ); - Ago = BCo ^((~BCu)& BCa ); - Agu = BCu ^((~BCa)& BCe ); - - Ebe ^= De; - BCa = ROL(Ebe, 1); - Egi ^= Di; - BCe = ROL(Egi, 6); - Eko ^= Do; - BCi = ROL(Eko, 25); - Emu ^= Du; - BCo = ROL(Emu, 8); - Esa ^= Da; - BCu = ROL(Esa, 18); - Aka = BCa ^((~BCe)& BCi ); - Ake = BCe ^((~BCi)& BCo ); - Aki = BCi ^((~BCo)& BCu ); - Ako = BCo ^((~BCu)& BCa ); - Aku = BCu ^((~BCa)& BCe ); - - Ebu ^= Du; - BCa = ROL(Ebu, 27); - Ega ^= Da; - BCe = ROL(Ega, 36); - Eke ^= De; - BCi = ROL(Eke, 10); - Emi ^= Di; - BCo = ROL(Emi, 15); - Eso ^= Do; - BCu = ROL(Eso, 56); - Ama = BCa ^((~BCe)& BCi ); - Ame = BCe ^((~BCi)& BCo ); - Ami = BCi ^((~BCo)& BCu ); - Amo = BCo ^((~BCu)& BCa ); - Amu = BCu ^((~BCa)& BCe ); - - Ebi ^= Di; - BCa = ROL(Ebi, 62); - Ego ^= Do; - BCe = ROL(Ego, 55); - Eku ^= Du; - BCi = ROL(Eku, 39); - Ema ^= Da; - BCo = ROL(Ema, 41); - Ese ^= De; - BCu = ROL(Ese, 2); - Asa = BCa ^((~BCe)& BCi ); - Ase = BCe ^((~BCi)& BCo ); - Asi = BCi ^((~BCo)& BCu ); - Aso = BCo ^((~BCu)& BCa ); - Asu = BCu ^((~BCa)& BCe ); - -//#pragma unroll 10 - for( int laneCount = 2; laneCount < 22; laneCount += 2 ) - { - // prepareTheta - BCa = Aba^Aga^Aka^Ama^Asa; - BCe = Abe^Age^Ake^Ame^Ase; - BCi = Abi^Agi^Aki^Ami^Asi; - BCo = Abo^Ago^Ako^Amo^Aso; - BCu = Abu^Agu^Aku^Amu^Asu; - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Aba ^= Da; - BCa = Aba; - Age ^= De; - BCe = ROL(Age, 44); - Aki ^= Di; - BCi = ROL(Aki, 43); - Amo ^= Do; - BCo = ROL(Amo, 21); - Asu ^= Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= vectorize((uint64_t)KeccakF_RoundConstants[laneCount]); - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo ^= Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka ^= Da; - BCi = ROL(Aka, 3); - Ame ^= De; - BCo = ROL(Ame, 45); - Asi ^= Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe ^= De; - BCa = ROL(Abe, 1); - Agi ^= Di; - BCe = ROL(Agi, 6); - Ako ^= Do; - BCi = ROL(Ako, 25); - Amu ^= Du; - BCo = ROL(Amu, 8); - Asa ^= Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu ^= Du; - BCa = ROL(Abu, 27); - Aga ^= Da; - BCe = ROL(Aga, 36); - Ake ^= De; - BCi = ROL(Ake, 10); - Ami ^= Di; - BCo = ROL(Ami, 15); - Aso ^= Do; - BCu = ROL(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi ^= Di; - BCa = ROL(Abi, 62); - Ago ^= Do; - BCe = ROL(Ago, 55); - Aku ^= Du; - BCi = ROL(Aku, 39); - Ama ^= Da; - BCo = ROL(Ama, 41); - Ase ^= De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - 
Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= vectorize((uint64_t)KeccakF_RoundConstants[laneCount+1]); - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - Abu = BCu ^((~BCa)& BCe ); - - Ebo ^= Do; - BCa = ROL(Ebo, 28); - Egu ^= Du; - BCe = ROL(Egu, 20); - Eka ^= Da; - BCi = ROL(Eka, 3); - Eme ^= De; - BCo = ROL(Eme, 45); - Esi ^= Di; - BCu = ROL(Esi, 61); - Aga = BCa ^((~BCe)& BCi ); - Age = BCe ^((~BCi)& BCo ); - Agi = BCi ^((~BCo)& BCu ); - Ago = BCo ^((~BCu)& BCa ); - Agu = BCu ^((~BCa)& BCe ); - - Ebe ^= De; - BCa = ROL(Ebe, 1); - Egi ^= Di; - BCe = ROL(Egi, 6); - Eko ^= Do; - BCi = ROL(Eko, 25); - Emu ^= Du; - BCo = ROL(Emu, 8); - Esa ^= Da; - BCu = ROL(Esa, 18); - Aka = BCa ^((~BCe)& BCi ); - Ake = BCe ^((~BCi)& BCo ); - Aki = BCi ^((~BCo)& BCu ); - Ako = BCo ^((~BCu)& BCa ); - Aku = BCu ^((~BCa)& BCe ); - - Ebu ^= Du; - BCa = ROL(Ebu, 27); - Ega ^= Da; - BCe = ROL(Ega, 36); - Eke ^= De; - BCi = ROL(Eke, 10); - Emi ^= Di; - BCo = ROL(Emi, 15); - Eso ^= Do; - BCu = ROL(Eso, 56); - Ama = BCa ^((~BCe)& BCi ); - Ame = BCe ^((~BCi)& BCo ); - Ami = BCi ^((~BCo)& BCu ); - Amo = BCo ^((~BCu)& BCa ); - Amu = BCu ^((~BCa)& BCe ); - - Ebi ^= Di; - BCa = ROL(Ebi, 62); - Ego ^= Do; - BCe = ROL(Ego, 55); - Eku ^= Du; - BCi = ROL(Eku, 39); - Ema ^= Da; - BCo = ROL(Ema, 41); - Ese ^= De; - BCu = ROL(Ese, 2); - Asa = BCa ^((~BCe)& BCi ); - Ase = BCe ^((~BCi)& BCo ); - Asi = BCi ^((~BCo)& BCu ); - Aso = BCo ^((~BCu)& BCa ); - Asu = BCu ^((~BCa)& BCe ); - } - - // prepareTheta - BCa = Aba^Aga^Aka^Ama^Asa; - BCe = Abe^Age^Ake^Ame^Ase; - BCi = Abi^Agi^Aki^Ami^Asi; - BCo = Abo^Ago^Ako^Amo^Aso; - BCu = Abu^Agu^Aku^Amu^Asu; - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Aba ^= Da; - BCa = Aba; - Age ^= De; - BCe = ROL(Age, 44); - Aki ^= Di; - BCi = ROL(Aki, 43); - Amo ^= Do; - BCo = ROL(Amo, 21); - Asu ^= Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= vectorize((uint64_t)KeccakF_RoundConstants[22]); - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo ^= Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka ^= Da; - BCi = ROL(Aka, 3); - Ame ^= De; - BCo = ROL(Ame, 45); - Asi ^= Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe ^= De; - BCa = ROL(Abe, 1); - Agi ^= Di; - BCe = ROL(Agi, 6); - Ako ^= Do; - BCi = ROL(Ako, 25); - Amu ^= Du; - BCo = ROL(Amu, 8); - Asa ^= Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu ^= Du; - BCa = ROL(Abu, 27); - Aga ^= Da; - BCe = ROL(Aga, 36); - Ake ^= De; - BCi = ROL(Ake, 10); - Ami ^= Di; 
- BCo = ROL(Ami, 15); - Aso ^= Do; - BCu = ROL(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi ^= Di; - BCa = ROL(Abi, 62); - Ago ^= Do; - BCe = ROL(Ago, 55); - Aku ^= Du; - BCi = ROL(Aku, 39); - Ama ^= Da; - BCo = ROL(Ama, 41); - Ase ^= De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= vectorize((uint64_t)KeccakF_RoundConstants[23]); - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - - if (validate) { - g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); - g_out[3] = devectorize(Abo); - g_out[2] = devectorize(Abi); - g_out[1] = devectorize(Abe); - g_out[0] = devectorize(Aba); - } - - // the likelyhood of meeting the hashing target is so low, that we're not guarding this - // with atomic writes, locks or similar... - uint64_t *g_good64 = (uint64_t*)g_good; - if (devectorize(Abo) <= ptarget64[3]) { - if (devectorize(Abo) < g_good64[3]) { - g_good64[3] = devectorize(Abo); - g_good64[2] = devectorize(Abi); - g_good64[1] = devectorize(Abe); - g_good64[0] = devectorize(Aba); - g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); - } - } -} - -static std::map context_good[2]; - -bool NV2Kernel::prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) -{ - static bool init[MAX_GPUS] = { 0 }; - - if (!init[thr_id]) - { - checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice)); - - // allocate pinned host memory for good hashes - uint32_t *tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; - - init[thr_id] = true; - } - checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - - return context_good[0][thr_id] && context_good[1][thr_id]; -} - -void NV2Kernel::do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) -{ - checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); - - titan_crypto_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); - - // copy hashes from device memory to host (ALL hashes, lots of data...) 
- if (do_d2h && hash != NULL) { - size_t mem_size = throughput * sizeof(uint32_t) * 8; - checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } - else if (hash != NULL) { - // asynchronous copy of winning nonce (just 4 bytes...) - checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } -} - - -// -// Blakecoin related Keccak implementation (Keccak256) -// - -typedef uint32_t sph_u32; -#define SPH_C32(x) ((sph_u32)(x)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define SPH_ROTL32(a, b) ((a)<<(b))|((a)>>(32-(b))) -#else - // Kepler (Compute 3.5) - #define SPH_ROTL32(a, b) __funnelshift_l( a, a, b ); -#endif -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -__constant__ uint32_t pdata[20]; - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -static __device__ sph_u32 cuda_sph_bswap32(sph_u32 x) -{ - return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) - | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). - * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static __device__ void -cuda_sph_enc32be(void *dst, sph_u32 val) -{ - *(sph_u32 *)dst = cuda_sph_bswap32(val); -} - -#define Z00 0 -#define Z01 1 -#define Z02 2 -#define Z03 3 -#define Z04 4 -#define Z05 5 -#define Z06 6 -#define Z07 7 -#define Z08 8 -#define Z09 9 -#define Z0A A -#define Z0B B -#define Z0C C -#define Z0D D -#define Z0E E -#define Z0F F - -#define Z10 E -#define Z11 A -#define Z12 4 -#define Z13 8 -#define Z14 9 -#define Z15 F -#define Z16 D -#define Z17 6 -#define Z18 1 -#define Z19 C -#define Z1A 0 -#define Z1B 2 -#define Z1C B -#define Z1D 7 -#define Z1E 5 -#define Z1F 3 - -#define Z20 B -#define Z21 8 -#define Z22 C -#define Z23 0 -#define Z24 5 -#define Z25 2 -#define Z26 F -#define Z27 D -#define Z28 A -#define Z29 E -#define Z2A 3 -#define Z2B 6 -#define Z2C 7 -#define Z2D 1 -#define Z2E 9 -#define Z2F 4 - -#define Z30 7 -#define Z31 9 -#define Z32 3 -#define Z33 1 -#define Z34 D -#define Z35 C -#define Z36 B -#define Z37 E -#define Z38 2 -#define Z39 6 -#define Z3A 5 -#define Z3B A -#define Z3C 4 -#define Z3D 0 -#define Z3E F -#define Z3F 8 - -#define Z40 9 -#define Z41 0 -#define Z42 5 -#define Z43 7 -#define Z44 2 -#define Z45 4 -#define Z46 A -#define Z47 F -#define Z48 E -#define Z49 1 -#define Z4A B -#define Z4B C -#define Z4C 6 -#define Z4D 8 -#define Z4E 3 -#define Z4F D - -#define Z50 2 -#define Z51 C -#define Z52 6 -#define Z53 A -#define Z54 0 -#define Z55 B -#define Z56 8 -#define Z57 3 -#define Z58 4 -#define Z59 D -#define Z5A 7 -#define Z5B 5 -#define Z5C F -#define Z5D E -#define Z5E 1 -#define Z5F 9 - -#define Z60 C -#define Z61 5 -#define Z62 1 -#define Z63 F -#define Z64 E -#define Z65 D -#define Z66 4 -#define Z67 A -#define Z68 0 -#define Z69 7 -#define Z6A 6 -#define Z6B 3 -#define Z6C 9 -#define Z6D 2 -#define Z6E 8 -#define Z6F B - -#define Z70 D -#define Z71 B -#define Z72 7 -#define Z73 E -#define Z74 C -#define Z75 1 -#define Z76 3 -#define Z77 9 -#define Z78 5 -#define Z79 0 -#define Z7A F -#define Z7B 4 -#define Z7C 8 -#define Z7D 6 -#define Z7E 2 -#define Z7F A - -#define Z80 6 -#define Z81 F -#define Z82 E -#define Z83 9 -#define Z84 B -#define Z85 3 -#define Z86 0 -#define Z87 8 -#define Z88 C 
-#define Z89 2 -#define Z8A D -#define Z8B 7 -#define Z8C 1 -#define Z8D 4 -#define Z8E A -#define Z8F 5 - -#define Z90 A -#define Z91 2 -#define Z92 8 -#define Z93 4 -#define Z94 7 -#define Z95 6 -#define Z96 1 -#define Z97 5 -#define Z98 F -#define Z99 B -#define Z9A 9 -#define Z9B E -#define Z9C 3 -#define Z9D C -#define Z9E D -#define Z9F 0 - -#define Mx(r, i) Mx_(Z ## r ## i) -#define Mx_(n) Mx__(n) -#define Mx__(n) M ## n - -#define CSx(r, i) CSx_(Z ## r ## i) -#define CSx_(n) CSx__(n) -#define CSx__(n) CS ## n - -#define CS0 SPH_C32(0x243F6A88) -#define CS1 SPH_C32(0x85A308D3) -#define CS2 SPH_C32(0x13198A2E) -#define CS3 SPH_C32(0x03707344) -#define CS4 SPH_C32(0xA4093822) -#define CS5 SPH_C32(0x299F31D0) -#define CS6 SPH_C32(0x082EFA98) -#define CS7 SPH_C32(0xEC4E6C89) -#define CS8 SPH_C32(0x452821E6) -#define CS9 SPH_C32(0x38D01377) -#define CSA SPH_C32(0xBE5466CF) -#define CSB SPH_C32(0x34E90C6C) -#define CSC SPH_C32(0xC0AC29B7) -#define CSD SPH_C32(0xC97C50DD) -#define CSE SPH_C32(0x3F84D5B5) -#define CSF SPH_C32(0xB5470917) - -#define GS(m0, m1, c0, c1, a, b, c, d) do { \ - a = SPH_T32(a + b + (m0 ^ c1)); \ - d = SPH_ROTR32(d ^ a, 16); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 12); \ - a = SPH_T32(a + b + (m1 ^ c0)); \ - d = SPH_ROTR32(d ^ a, 8); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 7); \ - } while (0) - -#define ROUND_S(r) do { \ - GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ - GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ - GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ - GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ - GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ - GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ - GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ - GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ - } while (0) - -#define COMPRESS32 do { \ - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ - sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ - sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = S0 ^ CS0; \ - V9 = S1 ^ CS1; \ - VA = S2 ^ CS2; \ - VB = S3 ^ CS3; \ - VC = T0 ^ CS4; \ - VD = T0 ^ CS5; \ - VE = T1 ^ CS6; \ - VF = T1 ^ CS7; \ - M0 = input[0]; \ - M1 = input[1]; \ - M2 = input[2]; \ - M3 = input[3]; \ - M4 = input[4]; \ - M5 = input[5]; \ - M6 = input[6]; \ - M7 = input[7]; \ - M8 = input[8]; \ - M9 = input[9]; \ - MA = input[10]; \ - MB = input[11]; \ - MC = input[12]; \ - MD = input[13]; \ - ME = input[14]; \ - MF = input[15]; \ - ROUND_S(0); \ - ROUND_S(1); \ - ROUND_S(2); \ - ROUND_S(3); \ - ROUND_S(4); \ - ROUND_S(5); \ - ROUND_S(6); \ - ROUND_S(7); \ - H0 ^= S0 ^ V0 ^ V8; \ - H1 ^= S1 ^ V1 ^ V9; \ - H2 ^= S2 ^ V2 ^ VA; \ - H3 ^= S3 ^ V3 ^ VB; \ - H4 ^= S0 ^ V4 ^ VC; \ - H5 ^= S1 ^ V5 ^ VD; \ - H6 ^= S2 ^ V6 ^ VE; \ - H7 ^= S3 ^ V7 ^ VF; \ - } while (0) - - -__global__ void titan_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate ) -{ - uint32_t input[16]; - uint64_t output[4]; - -#pragma unroll 16 - for (int i=0; i < 16; ++i) input[i] = pdata[i]; - - sph_u32 H0 = 0x6A09E667; - sph_u32 H1 = 0xBB67AE85; - sph_u32 H2 = 0x3C6EF372; - sph_u32 H3 = 0xA54FF53A; - sph_u32 H4 = 0x510E527F; - sph_u32 H5 = 0x9B05688C; - sph_u32 H6 = 0x1F83D9AB; - sph_u32 H7 = 0x5BE0CD19; - sph_u32 S0 = 0; - sph_u32 S1 = 0; - sph_u32 S2 = 0; - sph_u32 S3 = 0; - 
sph_u32 T0 = 0; - sph_u32 T1 = 0; - T0 = SPH_T32(T0 + 512); - COMPRESS32; - -#pragma unroll 3 - for (int i=0; i < 3; ++i) input[i] = pdata[16+i]; - input[3] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); - input[4] = 0x80000000; -#pragma unroll 8 - for (int i=5; i < 13; ++i) input[i] = 0; - input[13] = 0x00000001; - input[14] = T1; - input[15] = T0 + 128; - - T0 = SPH_T32(T0 + 128); - COMPRESS32; - - cuda_sph_enc32be((unsigned char*)output + 4*6, H6); - cuda_sph_enc32be((unsigned char*)output + 4*7, H7); - if (validate || output[3] <= ptarget64[3]) - { - // this data is only needed when we actually need to save the hashes - cuda_sph_enc32be((unsigned char*)output + 4*0, H0); - cuda_sph_enc32be((unsigned char*)output + 4*1, H1); - cuda_sph_enc32be((unsigned char*)output + 4*2, H2); - cuda_sph_enc32be((unsigned char*)output + 4*3, H3); - cuda_sph_enc32be((unsigned char*)output + 4*4, H4); - cuda_sph_enc32be((unsigned char*)output + 4*5, H5); - } - - if (validate) - { - g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); -#pragma unroll 4 - for (int i=0; i < 4; ++i) g_out[i] = output[i]; - } - - if (output[3] <= ptarget64[3]) { - uint64_t *g_good64 = (uint64_t*)g_good; - if (output[3] < g_good64[3]) { - g_good64[3] = output[3]; - g_good64[2] = output[2]; - g_good64[1] = output[1]; - g_good64[0] = output[0]; - g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); - } - } -} - -bool NV2Kernel::prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) -{ - static bool init[MAX_GPUS] = { 0 }; - - if (!init[thr_id]) - { - // allocate pinned host memory for good hashes - uint32_t *tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; - checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; - - init[thr_id] = true; - } - checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - - return context_good[0][thr_id] && context_good[1][thr_id]; -} - -void NV2Kernel::do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) -{ - checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); - - titan_blake256_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); - - // copy hashes from device memory to host (ALL hashes, lots of data...) - if (do_d2h && hash != NULL) { - size_t mem_size = throughput * sizeof(uint32_t) * 8; - checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } - else if (hash != NULL) { - // asynchronous copy of winning nonce (just 4 bytes...) 
- checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), - cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); - } -} diff --git a/scrypt/nv_kernel2.h b/scrypt/nv_kernel2.h index a67c65f..79e1b1c 100644 --- a/scrypt/nv_kernel2.h +++ b/scrypt/nv_kernel2.h @@ -25,12 +25,6 @@ public: virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; } virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } - - virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); - virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false); - - virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]); - virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false); }; #endif // #ifndef NV2_KERNEL_H diff --git a/scrypt/salsa_kernel.cu b/scrypt/salsa_kernel.cu index 869d7f5..ee0de00 100644 --- a/scrypt/salsa_kernel.cu +++ b/scrypt/salsa_kernel.cu @@ -821,44 +821,6 @@ void cuda_scrypt_core(int thr_id, int stream, unsigned int N) ); } -bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) -{ - return context_kernel[thr_id]->prepare_keccak256(thr_id, host_pdata, ptarget); -} -#if 0 -void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) -{ - unsigned int GRID_BLOCKS = context_blocks[thr_id]; - unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; - unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); - - // setup execution parameters - dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); - dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); - - context_kernel[thr_id]->do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); -} -#endif -bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) -{ - return context_kernel[thr_id]->prepare_blake256(thr_id, host_pdata, ptarget); -} - -#if 0 -void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) -{ - unsigned int GRID_BLOCKS = context_blocks[thr_id]; - unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; - unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); - - // setup execution parameters - dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); - dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); - - context_kernel[thr_id]->do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); -} -#endif - void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA) { unsigned int GRID_BLOCKS = context_blocks[thr_id]; diff --git a/scrypt/salsa_kernel.h b/scrypt/salsa_kernel.h index ccb60ab..7b1ab27 100644 --- a/scrypt/salsa_kernel.h +++ b/scrypt/salsa_kernel.h @@ -58,20 +58,6 @@ extern void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA); extern bool cuda_scrypt_sync(int thr_id, int stream); extern void cuda_scrypt_flush(int thr_id, int stream); -extern bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); -extern void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); - -extern bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); -extern 
void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); - -extern bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); -extern bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); - -#ifdef __NVCC__ -extern void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); -extern void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); -#endif - // If we're in C++ mode, we're either compiling .cu files or scrypt.cpp #ifdef __NVCC__ @@ -101,20 +87,6 @@ public: virtual bool support_lookup_gap() { return false; } virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeDefault; } virtual cudaFuncCache cache_config() { return cudaFuncCachePreferNone; } - - virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) { - return default_prepare_keccak256(thr_id, host_pdata, ptarget); - } - virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) { - default_do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); - } - - virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) { - return default_prepare_blake256(thr_id, host_pdata, ptarget); - } - virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) { - default_do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); - } }; // Not performing error checking is actually bad, but...
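For context on what the stripped GS macro computed, below is a minimal, illustrative CUDA sketch of the Blake-256 quarter round and the 32-bit rotate idiom that the deleted titan_blake256_hash kernel relied on. It is not part of the patch: rotl32, rotr32, blake256_g and g_smoke_test are hypothetical names, the message and constant inputs in the smoke test are arbitrary, and the compute 3.5 funnel-shift path simply mirrors the SPH_ROTL32 macro removed above.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__device__ __forceinline__ uint32_t rotl32(uint32_t a, uint32_t b)
{
#if __CUDA_ARCH__ >= 350
	return __funnelshift_l(a, a, b);     // funnel shift of {a:a} acts as a 32-bit rotate left
#else
	return (a << b) | (a >> (32 - b));   // portable fallback, as in the deleted SPH_ROTL32
#endif
}

__device__ __forceinline__ uint32_t rotr32(uint32_t a, uint32_t b)
{
	return rotl32(a, 32 - b);            // rotate right expressed via rotate left
}

// Blake-256 quarter round ("G"): mixes two message words (m0, m1) and two round
// constants (c0, c1) into four state words, using the 16/12/8/7 rotation schedule
// seen in the GS macro deleted above.
__device__ void blake256_g(uint32_t m0, uint32_t m1, uint32_t c0, uint32_t c1,
                           uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d)
{
	a += b + (m0 ^ c1);  d = rotr32(d ^ a, 16);
	c += d;              b = rotr32(b ^ c, 12);
	a += b + (m1 ^ c0);  d = rotr32(d ^ a, 8);
	c += d;              b = rotr32(b ^ c, 7);
}

// Tiny smoke-test kernel: run one quarter round on fixed inputs and write out the state.
__global__ void g_smoke_test(uint32_t *out)
{
	uint32_t a = 0x6A09E667, b = 0xBB67AE85, c = 0x3C6EF372, d = 0xA54FF53A;
	blake256_g(0x00000001, 0x00000002, 0x243F6A88, 0x85A308D3, a, b, c, d);
	out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}

int main()
{
	uint32_t *d_out, h_out[4];
	cudaMalloc(&d_out, 4 * sizeof(uint32_t));
	g_smoke_test<<<1, 1>>>(d_out);
	cudaMemcpy(h_out, d_out, 4 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
	printf("%08x %08x %08x %08x\n", h_out[0], h_out[1], h_out[2], h_out[3]);
	cudaFree(d_out);
	return 0;
}

On sm_32 and newer the funnel-shift intrinsic compiles to a single instruction, which is presumably why the removed code special-cased __CUDA_ARCH__ instead of always using the shift-and-or form.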