From d43dc9a02169a06d22f0b8c0362eb69511dbb059 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Sat, 24 Oct 2015 10:35:38 +0200
Subject: [PATCH] use blake512 sp kernels on SM 5+ (80+64)

import and keep my code for older archs, like skein 64
reduce the gap between our versions...

+150kH x11 GTX 960 / +30kH 750Ti
+900kH quark GTX 960 / +230kH 750Ti
---
 JHA/jackpotcoin.cu               |   2 +
 ccminer.vcxproj                  |   1 +
 ccminer.vcxproj.filters          |   3 +
 cuda_nist5.cu                    |   2 +
 pentablake.cu                    |   9 +-
 quark/cuda_quark_blake512.cu     | 122 ++++--
 quark/cuda_quark_blake512_sp.cuh | 682 +++++++++++++++++++++++++++++++
 quark/quarkcoin.cu               | 192 +++++----
 x11/c11.cu                       |   2 +
 x11/x11.cu                       |   2 +
 x13/x13.cu                       |   2 +
 x15/x14.cu                       |   2 +
 x15/x15.cu                       |   2 +
 x17/x17.cu                       |   2 +
 14 files changed, 898 insertions(+), 127 deletions(-)
 create mode 100644 quark/cuda_quark_blake512_sp.cuh

diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index 642b749..68e2093 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -23,6 +23,7 @@ extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
 extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_free(int thr_id);
 extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
@@ -279,6 +280,7 @@ extern "C" void free_jackpot(int thr_id)
 	cudaFree(d_branch3Nonces[thr_id]);
 	cudaFree(d_jackpotNonces[thr_id]);
 
+	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	jackpot_compactTest_cpu_free(thr_id);
 
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 0ff3400..c0588b1 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -314,6 +314,7 @@
+
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 80fd763..b123622 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -428,6 +428,9 @@
       Source Files\CUDA\lyra2
+
+      Source Files\CUDA\quark
+
       Source Files\CUDA\quark
 
diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index ef875d6..a21079b 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -14,6 +14,7 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_free(int thr_id);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
@@ -175,6 +176,7 @@ extern "C" void free_nist5(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
diff --git a/pentablake.cu b/pentablake.cu
index 062b270..7d4af2b 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -52,7 +52,7 @@ static uint64_t __align__(32) c_data[32];
 static uint32_t *d_hash[MAX_GPUS];
 static uint32_t *d_resNounce[MAX_GPUS];
 static uint32_t *h_resNounce[MAX_GPUS];
-static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };
+static __thread uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };
 
 /* prefer uint32_t to prevent size conversions = speed +5/10 % */
 __constant__
@@ -375,16 +375,13 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x000F;
+		ptarget[7] = 0x000F;
 
 	if (!init[thr_id])
 	{
-		if (active_gpus > 1) {
-			cudaSetDevice(device_map[thr_id]);
-		}
+		cudaSetDevice(device_map[thr_id]);
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
 		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
-
 		init[thr_id] = true;
 	}
 
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index 1cb88ae..a7c4376 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -2,6 +2,7 @@
 #include
 #include // off_t
 
+#include "miner.h"
 #include "cuda_helper.h"
 
 #define ROTR(x,n) ROTR64(x,n)
@@ -14,23 +15,26 @@ static uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
 
 __device__ __constant__
-static const uint8_t c_sigma[16][16] = {
+static const uint8_t c_sigma_big[16][16] = {
 	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
 	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
 	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
 	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+
 	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
 	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
 	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
 	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
+
 	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
 	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
 	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }};
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
+};
 
 __device__ __constant__
 static const uint64_t c_u512[16] =
@@ -59,7 +63,7 @@ static const uint64_t c_u512[16] =
 }
 
 __device__ __forceinline__
-void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
+void quark_blake512_compress(uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
 {
 	uint64_t v[16];
 	uint64_t m[16];
@@ -157,7 +161,7 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
 		buf[i+8] = d_constHashPadding[i];
 
 	// Ending round
-	quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
+	quark_blake512_compress(h, buf, c_sigma_big, c_u512, 512);
 
 #if __CUDA_ARCH__ <= 350
 	uint32_t *outHash = (uint32_t*)&g_hash[hashPosition * 8U];
@@ -201,7 +205,7 @@ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
 		0x5be0cd19137e2179ULL };
 
-	quark_blake512_compress(h, buf, c_sigma, c_u512, 640);
+	quark_blake512_compress(h, buf, c_sigma_big, c_u512, 640);
 
 #if __CUDA_ARCH__ <= 350
 	uint32_t *outHash = (uint32_t*)outputHash + (thread * 16U);
@@ -219,52 +223,106 @@ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
 	}
 }
 
+#define SP_KERNEL
+#ifdef SP_KERNEL
+void quark_blake512_cpu_setBlock_80_sp(uint64_t*);
+void quark_blake512_cpu_init_sp(int thr_id);
+void quark_blake512_cpu_free_sp(int thr_id);
+#endif
 
-// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
+#include "cuda_quark_blake512_sp.cuh"
 __host__
-void quark_blake512_cpu_init(int thr_id, uint32_t threads)
+void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order)
+{
+#ifdef SP_KERNEL
+	int dev_id = device_map[thr_id];
+	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
+		quark_blake512_cpu_hash_64_sp(threads, startNounce, d_nonceVector, d_outputHash);
+	else
+#endif
+	{
+		const uint32_t threadsperblock = 256;
+		dim3 grid((threads + threadsperblock-1)/threadsperblock);
+		dim3 block(threadsperblock);
+		quark_blake512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
+	}
+	MyStreamSynchronize(NULL, order, thr_id);
+}
+
+__host__
+void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
 {
-//	CUDA_SAFE_CALL(cudaGetLastError());
+#ifdef SP_KERNEL
+	int dev_id = device_map[thr_id];
+	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
+		quark_blake512_cpu_hash_80_sp(threads, startNounce, d_outputHash);
+	else
+#endif
+	{
+		const uint32_t threadsperblock = 256;
+		dim3 grid((threads + threadsperblock-1)/threadsperblock);
+		dim3 block(threadsperblock);
+
+		quark_blake512_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash);
+	}
 }
 
+// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
+
 __host__
-void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata)
+void quark_blake512_cpu_init(int thr_id, uint32_t threads)
 {
-	uint64_t message[16];
-
-	memcpy(message, pdata, 80);
-	message[10] = 0x80;
-	message[11] = 0;
-	message[12] = 0;
-	message[13] = 0x0100000000000000ull;
-	message[14] = 0;
-	message[15] = 0x8002000000000000ull; // 0x280
-
-	cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice);
-	CUDA_SAFE_CALL(cudaGetLastError());
+	cuda_get_arch(thr_id);
+#ifdef SP_KERNEL
+	int dev_id = device_map[thr_id];
+	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
+		quark_blake512_cpu_init_sp(thr_id);
+#endif
 }
 
 __host__
-void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order)
+void quark_blake512_cpu_free(int thr_id)
 {
-	const uint32_t threadsperblock = 256;
+#ifdef SP_KERNEL
+	int dev_id = device_map[thr_id];
+	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
+		quark_blake512_cpu_free_sp(thr_id);
+#endif
+}
 
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
+// ----------------------------- Host midstate for 80-bytes input ------------------------------------
 
-	quark_blake512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
+#undef SPH_C32
+#undef SPH_T32
+#undef SPH_C64
+#undef SPH_T64
 
-	//MyStreamSynchronize(NULL, order, thr_id);
+extern "C" {
+#include "sph/sph_blake.h"
 }
 
 __host__
-void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
+void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *endiandata)
 {
-	const uint32_t threadsperblock = 256;
+#ifdef SP_KERNEL
+	int dev_id = device_map[thr_id];
+	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
+		quark_blake512_cpu_setBlock_80_sp((uint64_t*) endiandata);
+	else
+#endif
+	{
+		uint64_t message[16];
 
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
+		memcpy(message, endiandata, 80);
+		message[10] = 0x80;
+		message[11] = 0;
+		message[12] = 0;
+		message[13] = 0x0100000000000000ull;
+		message[14] = 0;
+		message[15] = 0x8002000000000000ull; // 0x280
 
-	quark_blake512_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash);
+		cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice);
+	}
+	CUDA_LOG_ERROR();
 }
diff --git a/quark/cuda_quark_blake512_sp.cuh b/quark/cuda_quark_blake512_sp.cuh
new file mode 100644
index 0000000..19e4fd3
--- /dev/null
+++ b/quark/cuda_quark_blake512_sp.cuh
@@ -0,0 +1,682 @@
+/* sp implementation of blake */
+
+//#include
+//#include
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500
+
+#include "cuda_vector_uint2x4.h"
+
+#undef G
+#define vectorizelow(/* uint32_t*/ v) make_uint2(v,0)
+#define vectorizehigh(/*uint32_t*/ v) make_uint2(0,v)
+
+static __device__ __forceinline__ uint2 cuda_swap(uint2 v) {
+	const uint32_t t = cuda_swab32(v.x);
+	v.x = cuda_swab32(v.y);
+	v.y = t;
+	return v;
+}
+static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) {
+	uint2 result;
+	result.y = u.x ^ v.x;
+	result.x = u.y ^ v.y;
+	return result;
+}
+
+static uint2* d_PaddedMessage80[MAX_GPUS]; // padded message (80 bytes + padding)
+__constant__ uint2 c_PaddedM[16];
+__constant__ uint2x4 Hostprecalc[4];
+
+__constant__ uint2 c_512_u2[16] =
+{
+	{ 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e },
+	{ 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 },
+	{ 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf },
+	{ 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 },
+	{ 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 },
+	{ 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed },
+	{ 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 },
+	{ 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 }
+};
+
+__constant__ uint8_t c_sigma[6][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
+};
+
+// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
+
+#define Gprecalc(a,b,c,d,idx1,idx2) { \
+	v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \
+	v[d] = eorswap32( v[d] , v[a]); \
+	v[c] += v[d]; \
+	v[b] = ROR2(v[b] ^ v[c], 25); \
+	v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \
+	v[d] = ROR16(v[d] ^ v[a]); \
+	v[c] += v[d]; \
+	v[b] = ROR2(v[b] ^ v[c], 11); \
+}
+
+#define GprecalcHost(a,b,c,d,idx1,idx2) { \
+	v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \
+	v[d] = ROTR64( v[d] ^ v[a],32); \
+	v[c] += v[d]; \
+	v[b] = ROTR64(v[b] ^ v[c], 25); \
+	v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \
+	v[d] = ROTR64(v[d] ^ v[a],16); \
+	v[c] += v[d]; \
+	v[b] = ROTR64(v[b] ^ v[c], 11); \
+}
+
+#define G(a,b,c,d,x) { \
+	uint32_t idx1 = c_sigma[i][x]; \
+	uint32_t idx2 = c_sigma[i][x+1]; \
+	v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \
+	v[d] = eorswap32(v[d] , v[a]); \
+	v[c] += v[d]; \
+	v[b] = ROR2( v[b] ^ v[c], 25); \
+	v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \
+	v[d] = ROR16( v[d] ^ v[a]); \
+	v[c] += v[d]; \
+	v[b] = ROR2( v[b] ^ v[c], 11); \
+}
+
+__global__
+#if __CUDA_ARCH__ > 500
+__launch_bounds__(256, 1)
+#endif
+void quark_blake512_gpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2* g_hash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	if (thread < threads)
+	{
+		const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+		const uint32_t hashPosition = nounce - startNounce;
+
+		uint2 msg[16];
+
+		uint2x4 *phash = (uint2x4*)&g_hash[hashPosition * 8U];
+		uint2x4 *outpt = (uint2x4*)msg;
+		outpt[0] = phash[0];
+		outpt[1] = phash[1];
+
+		uint2 block[16];
+		block[0].x = cuda_swab32(msg[0].y);
+		block[0].y = cuda_swab32(msg[0].x);
+		block[1].x = cuda_swab32(msg[1].y);
+		block[1].y = cuda_swab32(msg[1].x);
+		block[2].x = cuda_swab32(msg[2].y);
+		block[2].y = cuda_swab32(msg[2].x);
+		block[3].x = cuda_swab32(msg[3].y);
+		block[3].y = cuda_swab32(msg[3].x);
+		block[4].x = cuda_swab32(msg[4].y);
+		block[4].y = cuda_swab32(msg[4].x);
+		block[5].x = cuda_swab32(msg[5].y);
+		block[5].y = cuda_swab32(msg[5].x);
+		block[6].x = cuda_swab32(msg[6].y);
+		block[6].y = cuda_swab32(msg[6].x);
+		block[7].x = cuda_swab32(msg[7].y);
+		block[7].y = cuda_swab32(msg[7].x);
+
+		block[8] = vectorizehigh(0x80000000);
+		block[9] = vectorizelow(0x0);
+		block[10] = vectorizelow(0x0);
+		block[11] = vectorizelow(0x0);
+		block[12] = vectorizelow(0x0);
+		block[13] = vectorizelow(0x1);
+		block[14] = vectorizelow(0x0);
+		block[15] = vectorizelow(0x200);
+
+		const uint2 h[8] = {
+			{ 0xf3bcc908UL, 0x6a09e667UL },
+			{ 0x84caa73bUL, 0xbb67ae85UL },
+			{ 0xfe94f82bUL, 0x3c6ef372UL },
+			{ 0x5f1d36f1UL, 0xa54ff53aUL },
+			{ 0xade682d1UL, 0x510e527fUL },
+			{ 0x2b3e6c1fUL, 0x9b05688cUL },
+			{ 0xfb41bd6bUL, 0x1f83d9abUL },
+			{ 0x137e2179UL, 0x5be0cd19UL }
+		};
+
+		uint2 v[16] = {
+			h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7],
+			c_512_u2[0], c_512_u2[1], c_512_u2[2], c_512_u2[3],
+			c_512_u2[4], c_512_u2[5], c_512_u2[6], c_512_u2[7]
+		};
+		v[12].x ^= 512U;
+		v[13].x ^= 512U;
+
+		Gprecalc(0, 4, 8, 12, 0x1, 0x0)
+		Gprecalc(1, 5, 9, 13, 0x3, 0x2)
+		Gprecalc(2, 6, 10, 14, 0x5, 0x4)
+		Gprecalc(3, 7, 11, 15, 0x7, 0x6)
+		Gprecalc(0, 5, 10, 15, 0x9, 0x8)
+		Gprecalc(1, 6, 11, 12, 0xb, 0xa)
+		Gprecalc(2, 7, 8, 13, 0xd, 0xc)
+		Gprecalc(3, 4, 9, 14, 0xf, 0xe)
+
+		Gprecalc(0, 4, 8, 12, 0xa, 0xe)
+		Gprecalc(1, 5, 9, 13, 0x8, 0x4)
+		Gprecalc(2, 6, 10, 14, 0xf, 0x9)
+		Gprecalc(3, 7, 11, 15, 0x6, 0xd)
+		Gprecalc(0, 5, 10, 15, 0xc, 0x1)
+		Gprecalc(1, 6, 11, 12, 0x2, 0x0)
+		Gprecalc(2, 7, 8, 13, 0x7, 0xb)
+		Gprecalc(3, 4, 9, 14, 0x3, 0x5)
+
+		Gprecalc(0, 4, 8, 12, 0x8, 0xb)
+		Gprecalc(1, 5, 9, 13, 0x0, 0xc)
+		Gprecalc(2, 6, 10, 14, 0x2, 0x5)
+		Gprecalc(3, 7, 11, 15, 0xd, 0xf)
+		Gprecalc(0, 5, 10, 15, 0xe, 0xa)
+		Gprecalc(1, 6, 11, 12, 0x6, 0x3)
+		Gprecalc(2, 7, 8, 13, 0x1, 0x7)
+		Gprecalc(3, 4, 9, 14, 0x4, 0x9)
+
+		Gprecalc(0, 4, 8, 12, 0x9, 0x7)
+		Gprecalc(1, 5, 9, 13, 0x1, 0x3)
+		Gprecalc(2, 6, 10, 14, 0xc, 0xd)
+		Gprecalc(3, 7, 11, 15, 0xe, 0xb)
+		Gprecalc(0, 5, 10, 15, 0x6, 0x2)
+		Gprecalc(1, 6, 11, 12, 0xa, 0x5)
+		Gprecalc(2, 7, 8, 13, 0x0, 0x4)
+		Gprecalc(3, 4, 9, 14, 0x8, 0xf)
+
+		Gprecalc(0, 4, 8, 12, 0x0, 0x9)
+		Gprecalc(1, 5, 9, 13, 0x7, 0x5)
+		Gprecalc(2, 6, 10, 14, 0x4, 0x2)
+		Gprecalc(3, 7, 11, 15, 0xf, 0xa)
+		Gprecalc(0, 5, 10, 15, 0x1, 0xe)
+		Gprecalc(1, 6, 11, 12, 0xc, 0xb)
+		Gprecalc(2, 7, 8, 13, 0x8, 0x6)
+		Gprecalc(3, 4, 9, 14, 0xd, 0x3)
+
+		Gprecalc(0, 4, 8, 12, 0xc, 0x2)
+		Gprecalc(1, 5, 9, 13, 0xa, 0x6)
+		Gprecalc(2, 6, 10, 14, 0xb, 0x0)
+		Gprecalc(3, 7, 11, 15, 0x3, 0x8)
+		Gprecalc(0, 5, 10, 15, 0xd, 0x4)
+		Gprecalc(1, 6, 11, 12, 0x5, 0x7)
+		Gprecalc(2, 7, 8, 13, 0xe, 0xf)
+		Gprecalc(3, 4, 9, 14, 0x9, 0x1)
+
+		Gprecalc(0, 4, 8, 12, 0x5, 0xc)
+		Gprecalc(1, 5, 9, 13, 0xf, 0x1)
+		Gprecalc(2, 6, 10, 14, 0xd, 0xe)
+		Gprecalc(3, 7, 11, 15, 0xa, 0x4)
+		Gprecalc(0, 5, 10, 15, 0x7, 0x0)
+		Gprecalc(1, 6, 11, 12, 0x3, 0x6)
+		Gprecalc(2, 7, 8, 13, 0x2, 0x9)
+		Gprecalc(3, 4, 9, 14, 0xb, 0x8)
+
+		Gprecalc(0, 4, 8, 12, 0xb, 0xd)
+		Gprecalc(1, 5, 9, 13, 0xe, 0x7)
+		Gprecalc(2, 6, 10, 14, 0x1, 0xc)
+		Gprecalc(3, 7, 11, 15, 0x9, 0x3)
+		Gprecalc(0, 5, 10, 15, 0x0, 0x5)
+		Gprecalc(1, 6, 11, 12, 0x4, 0xf)
+		Gprecalc(2, 7, 8, 13, 0x6, 0x8)
+		Gprecalc(3, 4, 9, 14, 0xa, 0x2)
+
+		Gprecalc(0, 4, 8, 12, 0xf, 0x6)
+		Gprecalc(1, 5, 9, 13, 0x9, 0xe)
+		Gprecalc(2, 6, 10, 14, 0x3, 0xb)
+		Gprecalc(3, 7, 11, 15, 0x8, 0x0)
+		Gprecalc(0, 5, 10, 15, 0x2, 0xc)
+		Gprecalc(1, 6, 11, 12, 0x7, 0xd)
+		Gprecalc(2, 7, 8, 13, 0x4, 0x1)
+		Gprecalc(3, 4, 9, 14, 0x5, 0xa)
+
+		Gprecalc(0, 4, 8, 12, 0x2, 0xa)
+		Gprecalc(1, 5, 9, 13, 0x4, 0x8)
+		Gprecalc(2, 6, 10, 14, 0x6, 0x7)
+		Gprecalc(3, 7, 11, 15, 0x5, 0x1)
+		Gprecalc(0, 5, 10, 15, 0xb, 0xf)
+		Gprecalc(1, 6, 11, 12, 0xe, 0x9)
+		Gprecalc(2, 7, 8, 13, 0xc, 0x3)
+		Gprecalc(3, 4, 9, 14, 0x0, 0xd)
+
+	#if __CUDA_ARCH__ == 500
+
+		Gprecalc(0, 4, 8, 12, 0x1, 0x0)
+		Gprecalc(1, 5, 9, 13, 0x3, 0x2)
+		Gprecalc(2, 6, 10, 14, 0x5, 0x4)
+		Gprecalc(3, 7, 11, 15, 0x7, 0x6)
+		Gprecalc(0, 5, 10, 15, 0x9, 0x8)
+		Gprecalc(1, 6, 11, 12, 0xb, 0xa)
+		Gprecalc(2, 7, 8, 13, 0xd, 0xc)
+		Gprecalc(3, 4, 9, 14, 0xf, 0xe)
+
+		Gprecalc(0, 4, 8, 12, 0xa, 0xe)
+		Gprecalc(1, 5, 9, 13, 0x8, 0x4)
+		Gprecalc(2, 6, 10, 14, 0xf, 0x9)
+		Gprecalc(3, 7, 11, 15, 0x6, 0xd)
+		Gprecalc(0, 5, 10, 15, 0xc, 0x1)
+		Gprecalc(1, 6, 11, 12, 0x2, 0x0)
+		Gprecalc(2, 7, 8, 13, 0x7, 0xb)
+		Gprecalc(3, 4, 9, 14, 0x3, 0x5)
+
+		Gprecalc(0, 4, 8, 12, 0x8, 0xb)
+		Gprecalc(1, 5, 9, 13, 0x0, 0xc)
+		Gprecalc(2, 6, 10, 14, 0x2, 0x5)
+		Gprecalc(3, 7, 11, 15, 0xd, 0xf)
+		Gprecalc(0, 5, 10, 15, 0xe, 0xa)
+		Gprecalc(1, 6, 11, 12, 0x6, 0x3)
+		Gprecalc(2, 7, 8, 13, 0x1, 0x7)
+		Gprecalc(3, 4, 9, 14, 0x4, 0x9)
+
+		Gprecalc(0, 4, 8, 12, 0x9, 0x7)
+		Gprecalc(1, 5, 9, 13, 0x1, 0x3)
+		Gprecalc(2, 6, 10, 14, 0xc, 0xd)
+		Gprecalc(3, 7, 11, 15, 0xe, 0xb)
+		Gprecalc(0, 5, 10, 15, 0x6, 0x2)
+		Gprecalc(1, 6, 11, 12, 0xa, 0x5)
+		Gprecalc(2, 7, 8, 13, 0x0, 0x4)
+		Gprecalc(3, 4, 9, 14, 0x8, 0xf)
+
+		Gprecalc(0, 4, 8, 12, 0x0, 0x9)
+		Gprecalc(1, 5, 9, 13, 0x7, 0x5)
+		Gprecalc(2, 6, 10, 14, 0x4, 0x2)
+		Gprecalc(3, 7, 11, 15, 0xf, 0xa)
+		Gprecalc(0, 5, 10, 15, 0x1, 0xe)
+		Gprecalc(1, 6, 11, 12, 0xc, 0xb)
+		Gprecalc(2, 7, 8, 13, 0x8, 0x6)
+		Gprecalc(3, 4, 9, 14, 0xd, 0x3)
+
+		Gprecalc(0, 4, 8, 12, 0xc, 0x2)
+		Gprecalc(1, 5, 9, 13, 0xa, 0x6)
+		Gprecalc(2, 6, 10, 14, 0xb, 0x0)
+		Gprecalc(3, 7, 11, 15, 0x3, 0x8)
+		Gprecalc(0, 5, 10, 15, 0xd, 0x4)
+		Gprecalc(1, 6, 11, 12, 0x5, 0x7)
+		Gprecalc(2, 7, 8, 13, 0xe, 0xf)
+		Gprecalc(3, 4, 9, 14, 0x9, 0x1)
+
+	#else
+
+		for (int i = 0; i < 6; i++)
+		{
+			/* column step */
+			G(0, 4, 8, 12, 0);
+			G(1, 5, 9, 13, 2);
+			G(2, 6, 10, 14, 4);
+			G(3, 7, 11, 15, 6);
+			/* diagonal step */
+			G(0, 5, 10, 15, 8);
+			G(1, 6, 11, 12, 10);
+			G(2, 7, 8, 13, 12);
+			G(3, 4, 9, 14, 14);
+		}
+	#endif
+
+		v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]);
+		v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]);
+		v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]);
+		v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]);
+		v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]);
+		v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]);
+		v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]);
+		v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]);
+
+		phash = (uint2x4*)v;
+		outpt = (uint2x4*)&g_hash[hashPosition * 8];
+		outpt[0] = phash[0];
+		outpt[1] = phash[1];
+	}
+}
+
+
+__global__
+__launch_bounds__(128, 8)
+void quark_blake512_gpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nounce = startNounce + thread;
+		uint2 block[16];
+
+		block[0] = c_PaddedM[0];
+		block[1] = c_PaddedM[1];
+		block[2] = c_PaddedM[2];
+		block[3] = c_PaddedM[3];
+		block[4] = c_PaddedM[4];
+		block[5] = c_PaddedM[5];
+		block[6] = c_PaddedM[6];
+		block[7] = c_PaddedM[7];
+		block[8] = c_PaddedM[8];
+		block[9].y = c_PaddedM[9].y;
+		block[10] = vectorizehigh(0x80000000);
+		block[11] = vectorizelow(0);
+		block[12] = vectorizelow(0);
+		block[13] = vectorizelow(0x1);
+		block[14] = vectorizelow(0);
+		block[15] = vectorizelow(0x280);
+		block[9].x = nounce;
+
+		const uint2 h[8] = {
+			{ 0xf3bcc908UL, 0x6a09e667UL },
+			{ 0x84caa73bUL, 0xbb67ae85UL },
+			{ 0xfe94f82bUL, 0x3c6ef372UL },
+			{ 0x5f1d36f1UL, 0xa54ff53aUL },
+			{ 0xade682d1UL, 0x510e527fUL },
+			{ 0x2b3e6c1fUL, 0x9b05688cUL },
+			{ 0xfb41bd6bUL, 0x1f83d9abUL },
+			{ 0x137e2179UL, 0x5be0cd19UL }
+		};
+
+		uint2 v[16];
+		uint2x4 *outpt = (uint2x4*)v;
+		outpt[0] = Hostprecalc[0];
+		outpt[1] = Hostprecalc[1];
+		outpt[2] = Hostprecalc[2];
+		outpt[3] = Hostprecalc[3];
+
+		v[0] += (block[9] ^ c_512_u2[8]);
+		v[15] = ROR16(v[15] ^ v[0]);
+		v[10] += v[15];
+		v[5] = ROR2(v[5] ^ v[10], 11);
+
+		Gprecalc(0, 4, 8, 12, 0xa, 0xe)
+
+//		Gprecalc(1, 5, 9, 13, 0x8, 0x4)
+		v[1] += v[5];
+		v[13] = eorswap32(v[13], v[1]);
+		v[9] += v[13];
+
+		v[5] = ROR2(v[5] ^ v[9], 25);
+		v[1] += (block[8] ^ c_512_u2[4]) + v[5];
+		v[13] = ROR16(v[13] ^ v[1]);
+		v[9] += v[13];
+		v[5] = ROR2(v[5] ^ v[9], 11);
+
+//		Gprecalc(2, 6, 10, 14, 0xf, 0x9)
+		v[2] += (block[9] ^ c_512_u2[0xf]);
+		v[14] = eorswap32(v[14], v[2]);
+		v[10] += v[14];
+		v[6] = ROR2(v[6] ^ v[10], 25);
+		v[2] += (block[0xf] ^ c_512_u2[9]) + v[6];
+		v[14] = ROR16(v[14] ^ v[2]);
+		v[10] += v[14];
+		v[6] = ROR2(v[6] ^ v[10], 11);
+
+//		Gprecalc(3, 7, 11, 15, 0x6, 0xd)
+		v[15] = eorswap32( v[15] , v[3]);
+		v[11] += v[15];
+		v[7] = ROR2(v[7] ^ v[11], 25);
+		v[3] += (block[6] ^ c_512_u2[0xd]) + v[7];
+		v[15] = ROR16(v[15] ^ v[3]);
+		v[11] += v[15];
+		v[7] = ROR2(v[7] ^ v[11], 11);
+
+		Gprecalc(0, 5, 10, 15, 0xc, 0x1)
+		Gprecalc(1, 6, 11, 12, 0x2, 0x0)
+		Gprecalc(2, 7, 8, 13, 0x7, 0xb)
+		Gprecalc(3, 4, 9, 14, 0x3, 0x5)
+
+		Gprecalc(0, 4, 8, 12, 0x8, 0xb)
+		Gprecalc(1, 5, 9, 13, 0x0, 0xc)
+		Gprecalc(2, 6, 10, 14, 0x2, 0x5)
+		Gprecalc(3, 7, 11, 15, 0xd, 0xf)
+		Gprecalc(0, 5, 10, 15, 0xe, 0xa)
+		Gprecalc(1, 6, 11, 12, 0x6, 0x3)
+		Gprecalc(2, 7, 8, 13, 0x1, 0x7)
+		Gprecalc(3, 4, 9, 14, 0x4, 0x9)
+
+		Gprecalc(0, 4, 8, 12, 0x9, 0x7)
+		Gprecalc(1, 5, 9, 13, 0x1, 0x3)
+		Gprecalc(2, 6, 10, 14, 0xc, 0xd)
+		Gprecalc(3, 7, 11, 15, 0xe, 0xb)
+		Gprecalc(0, 5, 10, 15, 0x6, 0x2)
+		Gprecalc(1, 6, 11, 12, 0xa, 0x5)
+		Gprecalc(2, 7, 8, 13, 0x0, 0x4)
+		Gprecalc(3, 4, 9, 14, 0x8, 0xf)
+
+		Gprecalc(0, 4, 8, 12, 0x0, 0x9)
+		Gprecalc(1, 5, 9, 13, 0x7, 0x5)
+		Gprecalc(2, 6, 10, 14, 0x4, 0x2)
+		Gprecalc(3, 7, 11, 15, 0xf, 0xa)
+		Gprecalc(0, 5, 10, 15, 0x1, 0xe)
+		Gprecalc(1, 6, 11, 12, 0xc, 0xb)
+		Gprecalc(2, 7, 8, 13, 0x8, 0x6)
+		Gprecalc(3, 4, 9, 14, 0xd, 0x3)
+
+		Gprecalc(0, 4, 8, 12, 0xc, 0x2)
+		Gprecalc(1, 5, 9, 13, 0xa, 0x6)
+		Gprecalc(2, 6, 10, 14, 0xb, 0x0)
+		Gprecalc(3, 7, 11, 15, 0x3, 0x8)
+		Gprecalc(0, 5, 10, 15, 0xd, 0x4)
+		Gprecalc(1, 6, 11, 12, 0x5, 0x7)
+		Gprecalc(2, 7, 8, 13, 0xe, 0xf)
+		Gprecalc(3, 4, 9, 14, 0x9, 0x1)
+
+		Gprecalc(0, 4, 8, 12, 0x5, 0xc)
+		Gprecalc(1, 5, 9, 13, 0xf, 0x1)
+		Gprecalc(2, 6, 10, 14, 0xd, 0xe)
+		Gprecalc(3, 7, 11, 15, 0xa, 0x4)
+		Gprecalc(0, 5, 10, 15, 0x7, 0x0)
+		Gprecalc(1, 6, 11, 12, 0x3, 0x6)
+		Gprecalc(2, 7, 8, 13, 0x2, 0x9)
+		Gprecalc(3, 4, 9, 14, 0xb, 0x8)
+
+		Gprecalc(0, 4, 8, 12, 0xb, 0xd)
+		Gprecalc(1, 5, 9, 13, 0xe, 0x7)
+		Gprecalc(2, 6, 10, 14, 0x1, 0xc)
+		Gprecalc(3, 7, 11, 15, 0x9, 0x3)
+		Gprecalc(0, 5, 10, 15, 0x0, 0x5)
+		Gprecalc(1, 6, 11, 12, 0x4, 0xf)
+		Gprecalc(2, 7, 8, 13, 0x6, 0x8)
+		Gprecalc(3, 4, 9, 14, 0xa, 0x2)
+
+		Gprecalc(0, 4, 8, 12, 0xf, 0x6)
+		Gprecalc(1, 5, 9, 13, 0x9, 0xe)
+		Gprecalc(2, 6, 10, 14, 0x3, 0xb)
+		Gprecalc(3, 7, 11, 15, 0x8, 0x0)
+		Gprecalc(0, 5, 10, 15, 0x2, 0xc)
+		Gprecalc(1, 6, 11, 12, 0x7, 0xd)
+		Gprecalc(2, 7, 8, 13, 0x4, 0x1)
+		Gprecalc(3, 4, 9, 14, 0x5, 0xa)
+
+		Gprecalc(0, 4, 8, 12, 0x2, 0xa)
+		Gprecalc(1, 5, 9, 13, 0x4, 0x8)
+		Gprecalc(2, 6, 10, 14, 0x6, 0x7)
+		Gprecalc(3, 7, 11, 15, 0x5, 0x1)
+		Gprecalc(0, 5, 10, 15, 0xb, 0xf)
+		Gprecalc(1, 6, 11, 12, 0xe, 0x9)
+		Gprecalc(2, 7, 8, 13, 0xc, 0x3)
+		Gprecalc(3, 4, 9, 14, 0x0, 0xd)
+
+		Gprecalc(0, 4, 8, 12, 0x1, 0x0)
+		Gprecalc(1, 5, 9, 13, 0x3, 0x2)
+		Gprecalc(2, 6, 10, 14, 0x5, 0x4)
+		Gprecalc(3, 7, 11, 15, 0x7, 0x6)
+		Gprecalc(0, 5, 10, 15, 0x9, 0x8)
+		Gprecalc(1, 6, 11, 12, 0xb, 0xa)
+		Gprecalc(2, 7, 8, 13, 0xd, 0xc)
+		Gprecalc(3, 4, 9, 14, 0xf, 0xe)
+
+		Gprecalc(0, 4, 8, 12, 0xa, 0xe)
+		Gprecalc(1, 5, 9, 13, 0x8, 0x4)
+		Gprecalc(2, 6, 10, 14, 0xf, 0x9)
+		Gprecalc(3, 7, 11, 15, 0x6, 0xd)
+		Gprecalc(0, 5, 10, 15, 0xc, 0x1)
+		Gprecalc(1, 6, 11, 12, 0x2, 0x0)
+		Gprecalc(2, 7, 8, 13, 0x7, 0xb)
+		Gprecalc(3, 4, 9, 14, 0x3, 0x5)
+
+		Gprecalc(0, 4, 8, 12, 0x8, 0xb)
+		Gprecalc(1, 5, 9, 13, 0x0, 0xc)
+		Gprecalc(2, 6, 10, 14, 0x2, 0x5)
+		Gprecalc(3, 7, 11, 15, 0xd, 0xf)
+		Gprecalc(0, 5, 10, 15, 0xe, 0xa)
+		Gprecalc(1, 6, 11, 12, 0x6, 0x3)
+		Gprecalc(2, 7, 8, 13, 0x1, 0x7)
+		Gprecalc(3, 4, 9, 14, 0x4, 0x9)
+
+		Gprecalc(0, 4, 8, 12, 0x9, 0x7)
+		Gprecalc(1, 5, 9, 13, 0x1, 0x3)
+		Gprecalc(2, 6, 10, 14, 0xc, 0xd)
+		Gprecalc(3, 7, 11, 15, 0xe, 0xb)
+		Gprecalc(0, 5, 10, 15, 0x6, 0x2)
+		Gprecalc(1, 6, 11, 12, 0xa, 0x5)
+		Gprecalc(2, 7, 8, 13, 0x0, 0x4)
+		Gprecalc(3, 4, 9, 14, 0x8, 0xf)
+
+		Gprecalc(0, 4, 8, 12, 0x0, 0x9)
+		Gprecalc(1, 5, 9, 13, 0x7, 0x5)
+		Gprecalc(2, 6, 10, 14, 0x4, 0x2)
+		Gprecalc(3, 7, 11, 15, 0xf, 0xa)
+		Gprecalc(0, 5, 10, 15, 0x1, 0xe)
+		Gprecalc(1, 6, 11, 12, 0xc, 0xb)
+		Gprecalc(2, 7, 8, 13, 0x8, 0x6)
+		Gprecalc(3, 4, 9, 14, 0xd, 0x3)
+
+		Gprecalc(0, 4, 8, 12, 0xc, 0x2)
+		Gprecalc(1, 5, 9, 13, 0xa, 0x6)
+		Gprecalc(2, 6, 10, 14, 0xb, 0x0)
+		Gprecalc(3, 7, 11, 15, 0x3, 0x8)
+		Gprecalc(0, 5, 10, 15, 0xd, 0x4)
+		Gprecalc(1, 6, 11, 12, 0x5, 0x7)
+		Gprecalc(2, 7, 8, 13, 0xe, 0xf)
+		Gprecalc(3, 4, 9, 14, 0x9, 0x1)
+
+		v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]);
+		v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]);
+		v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]);
+		v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]);
+		v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]);
+		v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]);
+		v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]);
+		v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]);
+
+		uint2x4 *phash = (uint2x4*)v;
+		outpt = (uint2x4*) &outputHash[thread * 8U];
+		outpt[0] = phash[0];
+		outpt[1] = phash[1];
+	}
+}
+
+// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
+
+__host__ void quark_blake512_cpu_init_sp(int thr_id)
+{
+	CUDA_SAFE_CALL(cudaMalloc(&d_PaddedMessage80[thr_id], 10 * sizeof(uint2)));
+}
+
+__host__ void quark_blake512_cpu_free_sp(int thr_id)
+{
+	cudaFree(d_PaddedMessage80[thr_id]);
+}
+
+__host__ void quark_blake512_cpu_setBlock_80_sp(uint64_t *pdata)
+{
+	uint64_t PaddedMessage[10];
+	for (int i = 0; i < 10; i++)
+		PaddedMessage[i] = cuda_swab64(pdata[i]);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedM, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
+
+	uint64_t block[16];
+
+	uint64_t *peker = (uint64_t *)&PaddedMessage[0];
+
+	block[0] = peker[0];
+	block[1] = peker[1];
+	block[2] = peker[2];
+	block[3] = peker[3];
+	block[4] = peker[4];
+	block[5] = peker[5];
+	block[6] = peker[6];
+	block[7] = peker[7];
+	block[8] = peker[8];
+	block[9] = peker[9];
+	block[10] = 0x8000000000000000;
+	block[11] = 0;
+	block[12] = 0;
+	block[13] = 1;
+	block[14] = 0;
+	block[15] = 0x280;
+
+	const uint64_t u512[16] = {
+		0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
+		0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
+		0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
+		0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
+		0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
+		0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
+		0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
+		0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
+	};
+
+	uint64_t h[8] = {
+		0x6a09e667f3bcc908ULL,
+		0xbb67ae8584caa73bULL,
+		0x3c6ef372fe94f82bULL,
+		0xa54ff53a5f1d36f1ULL,
+		0x510e527fade682d1ULL,
+		0x9b05688c2b3e6c1fULL,
+		0x1f83d9abfb41bd6bULL,
+		0x5be0cd19137e2179ULL
+	};
+
+	uint64_t v[16] = {
+		h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7],
+		u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640U, u512[5] ^ 640U, u512[6], u512[7]
+	};
+
+	GprecalcHost(0, 4, 8, 12, 0x1, 0x0)
+	GprecalcHost(1, 5, 9, 13, 0x3, 0x2)
+	GprecalcHost(2, 6, 10, 14, 0x5, 0x4)
+	GprecalcHost(3, 7, 11, 15, 0x7, 0x6)
+
+	GprecalcHost(1, 6, 11, 12, 0xb, 0xa)
+	GprecalcHost(2, 7, 8, 13, 0xd, 0xc)
+
+	v[0] += (block[8] ^ u512[9]) + v[5];
+	v[15] = ROTR64(v[15] ^ v[0], 32); \
+	v[10] += v[15];
+	v[5] = ROTR64(v[5] ^ v[10], 25);
+	v[0] += v[5];
+
+	GprecalcHost(3, 4, 9, 14, 0xf, 0xe);
+
+	v[1] += (block[0x4] ^ u512[0x8]);
+	v[2] += v[6];
+
+	v[3] += (block[0xd] ^ u512[6]) + v[7];
+
+	//applog_hash((unsigned char*) &v[0]);
+	//applog_hash((unsigned char*) &v[4]);
+	//applog_hash((unsigned char*) &v[8]);
+	//applog_hash((unsigned char*) &v[12]);
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(Hostprecalc, v, 128, 0, cudaMemcpyHostToDevice));
+}
+#else
+// __CUDA_ARCH__ < 500
+__global__ void quark_blake512_gpu_hash_64_sp(uint32_t, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) {}
+__global__ void quark_blake512_gpu_hash_80_sp(uint32_t, uint32_t startNounce, uint2 *outputHash) {}
+#endif
+
+__host__
+void quark_blake512_cpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash)
+{
+	const uint32_t threadsperblock = 32;
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	quark_blake512_gpu_hash_64_sp <<<grid, block>>>(threads, startNounce, d_nonceVector, (uint2*)d_outputHash);
+}
+
+__host__
+void quark_blake512_cpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
+{
+	const uint32_t threadsperblock = 64;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+	quark_blake512_gpu_hash_80_sp <<<grid, block>>>(threads, startNounce, (uint2 *)d_outputHash);
+}
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index f615444..a95ef92 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -20,6 +20,7 @@ static uint32_t *d_branch2Nonces[MAX_GPUS];
 static uint32_t *d_branch3Nonces[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_free(int thr_id);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -57,80 +58,95 @@ extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t st
 
 // Original quarkhash function from a miner source
 extern "C" void quarkhash(void *state, const void *input)
 {
-	sph_blake512_context ctx_blake;
-	sph_bmw512_context ctx_bmw;
-	sph_groestl512_context ctx_groestl;
-	sph_jh512_context ctx_jh;
-	sph_keccak512_context ctx_keccak;
-	sph_skein512_context ctx_skein;
-
-	unsigned char hash[64];
-
-	sph_blake512_init(&ctx_blake);
-	sph_blake512 (&ctx_blake, input, 80);
-	sph_blake512_close(&ctx_blake, (void*) hash);
-
-	sph_bmw512_init(&ctx_bmw);
-	sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
-	sph_bmw512_close(&ctx_bmw, (void*) hash);
-
-	if (hash[0] & 0x8)
-	{
-		sph_groestl512_init(&ctx_groestl);
-		sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
-		sph_groestl512_close(&ctx_groestl, (void*) hash);
-	}
-	else
-	{
-		sph_skein512_init(&ctx_skein);
-		sph_skein512 (&ctx_skein, (const void*) hash, 64);
-		sph_skein512_close(&ctx_skein, (void*) hash);
-	}
-
-	sph_groestl512_init(&ctx_groestl);
-	sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
-	sph_groestl512_close(&ctx_groestl, (void*) hash);
-
-	sph_jh512_init(&ctx_jh);
-	sph_jh512 (&ctx_jh, (const void*) hash, 64);
-	sph_jh512_close(&ctx_jh, (void*) hash);
-
-	if (hash[0] & 0x8)
-	{
-		sph_blake512_init(&ctx_blake);
-		sph_blake512 (&ctx_blake, (const void*) hash, 64);
-		sph_blake512_close(&ctx_blake, (void*) hash);
-	}
-	else
-	{
-		sph_bmw512_init(&ctx_bmw);
-		sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
-		sph_bmw512_close(&ctx_bmw, (void*) hash);
-	}
-
-	sph_keccak512_init(&ctx_keccak);
-	sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
-	sph_keccak512_close(&ctx_keccak, (void*) hash);
-
-	sph_skein512_init(&ctx_skein);
-	sph_skein512 (&ctx_skein, (const void*) hash, 64);
-	sph_skein512_close(&ctx_skein, (void*) hash);
-
-	if (hash[0] & 0x8)
-	{
-		sph_keccak512_init(&ctx_keccak);
-		sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
-		sph_keccak512_close(&ctx_keccak, (void*) hash);
-	}
-	else
-	{
-		sph_jh512_init(&ctx_jh);
-		sph_jh512 (&ctx_jh, (const void*) hash, 64);
-		sph_jh512_close(&ctx_jh, (void*) hash);
-	}
-
-	memcpy(state, hash, 32);
+	unsigned char _ALIGN(128) hash[64];
+
+	sph_blake512_context ctx_blake;
+	sph_bmw512_context ctx_bmw;
+	sph_groestl512_context ctx_groestl;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+
+	sph_blake512_init(&ctx_blake);
+	sph_blake512 (&ctx_blake, input, 80);
+	sph_blake512_close(&ctx_blake, (void*) hash);
+
+	sph_bmw512_init(&ctx_bmw);
+	sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
+	sph_bmw512_close(&ctx_bmw, (void*) hash);
+
+	if (hash[0] & 0x8)
+	{
+		sph_groestl512_init(&ctx_groestl);
+		sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
+		sph_groestl512_close(&ctx_groestl, (void*) hash);
+	}
+	else
+	{
+		sph_skein512_init(&ctx_skein);
+		sph_skein512 (&ctx_skein, (const void*) hash, 64);
+		sph_skein512_close(&ctx_skein, (void*) hash);
+	}
+
+	sph_groestl512_init(&ctx_groestl);
+	sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
+	sph_groestl512_close(&ctx_groestl, (void*) hash);
+
+	sph_jh512_init(&ctx_jh);
+	sph_jh512 (&ctx_jh, (const void*) hash, 64);
+	sph_jh512_close(&ctx_jh, (void*) hash);
+
+	if (hash[0] & 0x8)
+	{
+		sph_blake512_init(&ctx_blake);
+		sph_blake512 (&ctx_blake, (const void*) hash, 64);
+		sph_blake512_close(&ctx_blake, (void*) hash);
+	}
+	else
+	{
+		sph_bmw512_init(&ctx_bmw);
+		sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
+		sph_bmw512_close(&ctx_bmw, (void*) hash);
+	}
+
+	sph_keccak512_init(&ctx_keccak);
+	sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
+	sph_keccak512_close(&ctx_keccak, (void*) hash);
+
+	sph_skein512_init(&ctx_skein);
+	sph_skein512 (&ctx_skein, (const void*) hash, 64);
+	sph_skein512_close(&ctx_skein, (void*) hash);
+
+	if (hash[0] & 0x8)
+	{
+		sph_keccak512_init(&ctx_keccak);
+		sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
+		sph_keccak512_close(&ctx_keccak, (void*) hash);
+	}
+	else
+	{
+		sph_jh512_init(&ctx_jh);
+		sph_jh512 (&ctx_jh, (const void*) hash, 64);
+		sph_jh512_close(&ctx_jh, (void*) hash);
+	}
+
+	memcpy(state, hash, 32);
+}
+
+#ifdef _DEBUG
+#define TRACE(algo) { \
+	if (max_nonce == 1 && pdata[19] <= 1) { \
+		uint32_t* debugbuf = NULL; \
+		cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
+		cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
+		printf("quark %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
+			swab32(debugbuf[2]), swab32(debugbuf[3])); \
+		cudaFreeHost(debugbuf); \
+	} \
 }
+#else
+#define TRACE(algo) {}
+#endif
 
 static bool init[MAX_GPUS] = { 0 };
 
@@ -141,18 +157,17 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 
-	uint32_t throughput = cuda_default_throughput(thr_id, 1 << 20); // 256*4096
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x00F;
+		ptarget[7] = 0x00F;
 
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
 
-		// copy constants, allocate memory
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
@@ -180,8 +195,8 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 		int order = 0;
 		uint32_t nrm1=0, nrm2=0, nrm3=0;
 
-		// first blake512 hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+		TRACE("blake  :");
 
 		// this is the unconditional branch for BMW512
 		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
 		quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
 			d_branch3Nonces[thr_id], &nrm3, order++);
-		
+
 		// only follow up on the skein branch
 		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 
@@ -223,28 +238,26 @@
 			d_branch2Nonces[thr_id], &nrm2,
 			order++);
 
-		// this is the conditional branch for keccak512
 		quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
-
-		// this is the conditional branch for JH512
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 
 		*hashes_done = pdata[19] - first_nonce + 1;
 
-		// scan for winning hashes on the GPU
 		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
+			uint32_t vhash[8];
 			be32enc(&endiandata[19], foundNonce);
-			quarkhash(vhash64, endiandata);
+			quarkhash(vhash, endiandata);
 
-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-				work_set_target_ratio(work, vhash64);
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work_set_target_ratio(work, vhash);
 				pdata[19] = foundNonce;
 				return 1;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				applog_hash((uchar*) vhash);
+				applog_hash((uchar*) ptarget);
 			}
 		}
 
@@ -270,6 +283,7 @@ extern "C" void free_quark(int thr_id)
 	cudaFree(d_branch2Nonces[thr_id]);
 	cudaFree(d_branch3Nonces[thr_id]);
 
+	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	quark_compactTest_cpu_free(thr_id);
 
diff --git a/x11/c11.cu b/x11/c11.cu
index 5db4cb6..dc2a527 100644
--- a/x11/c11.cu
+++ b/x11/c11.cu
@@ -23,6 +23,7 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_free(int thr_id);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
@@ -257,6 +258,7 @@ extern "C" void free_c11(int thr_id)
 	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 
diff --git a/x11/x11.cu b/x11/x11.cu
index 9040a4b..006fdda 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -23,6 +23,7 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_free(int thr_id);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
@@ -255,6 +256,7 @@ extern "C" void free_x11(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 
diff --git a/x13/x13.cu b/x13/x13.cu
index 5859029..f9c53ca 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -26,6 +26,7 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_free(int thr_id);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
@@ -263,6 +264,7 @@ extern "C" void free_x13(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
 
diff --git a/x15/x14.cu b/x15/x14.cu
index 2696c4d..005674f 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -30,6 +30,7 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS] = { 0 };
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_free(int thr_id);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
@@ -269,6 +270,7 @@ extern "C" void free_x14(int thr_id)
 
 	cudaThreadSynchronize();
 
+	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
 
diff --git a/x15/x15.cu b/x15/x15.cu
index 5f9eab5..7965c96 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -31,6 +31,7 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS] = { 0 };
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_free(int thr_id);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
@@ -277,6 +278,7 @@ extern "C" void free_x15(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
 
diff --git a/x17/x17.cu b/x17/x17.cu
index a5198df..b5bfb83 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -34,6 +34,7 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_free(int thr_id);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
@@ -301,6 +302,7 @@ extern "C" void free_x17(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
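
Note on the dispatch above: the rewritten host wrappers in quark/cuda_quark_blake512.cu only take the sp path when two independent conditions hold. device_sm[dev_id] is the compute capability the card reports at runtime, while cuda_arch[dev_id] (filled in by cuda_get_arch() from quark_blake512_cpu_init) is the architecture the loaded kernels were actually compiled for. Requiring both to be >= 500 keeps a Maxwell card running an sm_30-only build on the generic kernel instead of launching the empty sp stubs from the #else branch of the .cuh. A minimal standalone sketch of that gate; device_sm, cuda_arch and the two hash functions here are illustrative stand-ins, not ccminer's actual globals:

/* sketch of the SM 5.0 dispatch gate; stand-in arrays, not ccminer's */
#include <stdint.h>
#include <stdio.h>

#define MAX_GPUS 16
static int device_sm[MAX_GPUS]; /* runtime compute capability x100, e.g. 500 */
static int cuda_arch[MAX_GPUS]; /* arch the loaded kernels were built for    */

static void hash_64_sp(uint32_t threads)      { printf("sp kernel (%u)\n", threads); }
static void hash_64_generic(uint32_t threads) { printf("generic kernel (%u)\n", threads); }

/* the sp path needs a Maxwell GPU *and* an sm_50 build of the kernels */
static void blake512_hash_64(int dev_id, uint32_t threads)
{
	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
		hash_64_sp(threads);
	else
		hash_64_generic(threads);
}

int main(void)
{
	device_sm[0] = 500; cuda_arch[0] = 500; /* 750 Ti, binary built for sm_50 */
	device_sm[1] = 500; cuda_arch[1] = 300; /* same card, sm_30-only binary   */
	blake512_hash_64(0, 1U << 20); /* -> sp kernel      */
	blake512_hash_64(1, 1U << 20); /* -> generic kernel */
	return 0;
}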
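
The sp kernels keep every 64-bit state word as a uint2 (x = low word, y = high word), since Maxwell handles 32-bit funnel shifts cheaply. BLAKE-512's G function only rotates by 32, 25, 16 and 11: the rotate by 32 is eorswap32 (an XOR whose result simply has its two words swapped), and ROR2/ROR16 cover the rest. A small host-side check of those identities — plain C stand-ins that mirror what the device helpers compute, under the usual low/high packing assumption:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t x, y; } u2;            /* x = low word, y = high word */

static uint64_t pack(u2 v)   { return ((uint64_t)v.y << 32) | v.x; }
static u2 unpack(uint64_t v) { u2 r = { (uint32_t)v, (uint32_t)(v >> 32) }; return r; }
static uint64_t ror64(uint64_t v, int n) { return (v >> n) | (v << (64 - n)); }

/* (u ^ v) rotated by 32: just swap the two 32-bit halves of the XOR */
static u2 eorswap32(u2 u, u2 v) { u2 r = { u.y ^ v.y, u.x ^ v.x }; return r; }

/* 64-bit rotate right by 0 < n < 32, built from 32-bit halves */
static u2 ror2(u2 v, int n)
{
	u2 r;
	r.x = (v.x >> n) | (v.y << (32 - n));
	r.y = (v.y >> n) | (v.x << (32 - n));
	return r;
}

int main(void)
{
	uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
	assert(pack(eorswap32(unpack(a), unpack(b))) == ror64(a ^ b, 32));
	assert(pack(ror2(unpack(a), 25)) == ror64(a, 25)); /* G-function amounts: */
	assert(pack(ror2(unpack(a), 16)) == ror64(a, 16)); /* 32, 25, 16, 11      */
	assert(pack(ror2(unpack(a), 11)) == ror64(a, 11));
	puts("uint2 rotation identities hold");
	return 0;
}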
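
quark_blake512_cpu_setBlock_80_sp() is the midstate trick behind the 80-byte kernel: of the 16 message words only block[9], the word carrying the nonce, differs between threads, so the host runs the nonce-independent part of the first round once per work and uploads the resulting v[16] as Hostprecalc; quark_blake512_gpu_hash_80_sp() then just writes its nonce into block[9].x and resumes mid-round. Callers don't see any of this — the call order is the one scanhash_quark already uses. A sketch of that order against the declarations in this patch (it links against the miner's objects; arguments and error handling are simplified):

/* call-order sketch; the functions are the ones declared in this patch */
#include <stdint.h>

extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *endiandata);
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);

void blake512_scan_once(int thr_id, uint32_t *endiandata, uint32_t first_nonce,
                        uint32_t throughput, uint32_t *d_hash)
{
	quark_blake512_cpu_init(thr_id, throughput);        /* once: arch detection + sp buffers  */
	quark_blake512_cpu_setBlock_80(thr_id, endiandata); /* per work: midstate (sp) or padded
	                                                       message (generic) upload           */
	quark_blake512_cpu_hash_80(thr_id, throughput, first_nonce, d_hash);
	quark_blake512_cpu_free(thr_id);                    /* on miner exit                      */
}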