From 163430daae37163034a1e802560efd6bb614b52f Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Wed, 15 Apr 2015 01:12:47 +0200
Subject: [PATCH] Skein/Skein2 SM 3.0 device support + code cleanup

Signed-off-by: Tanguy Pruvot
---
 README.txt             |   1 +
 quark/cuda_skein512.cu | 311 ++++++++++++++++++++++++++++++-----------
 skein.cu               |  33 +++--
 skein2.cu              |  15 +-
 4 files changed, 261 insertions(+), 99 deletions(-)

diff --git a/README.txt b/README.txt
index f169ee4..8d014e6 100644
--- a/README.txt
+++ b/README.txt
@@ -192,6 +192,7 @@ features.
 
 	Apr. 14th 2015	v1.6.1
 	Add the Double Skein Algo for Woodcoin
+	Skein/Skein2 SM 3.0 device support
 
 	Mar. 27th 2015	v1.6.0
 	Add the ZR5 Algo for Ziftcoin

diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu
index f9f64a0..669ec4a 100644
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@@ -451,7 +451,7 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
 }
 
 __global__
-void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
+void quark_skein512_gpu_hash_64_sm3(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -464,7 +464,7 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint
 
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 		int hashPosition = nounce - startNounce;
-		uint64_t *inpHash = &g_hash[8 * hashPosition];
+		uint64_t *inpHash = &g_hash[hashPosition * 8];
 
 		// Init
 		h0 = 0x4903ADFF749C51CEull;
@@ -542,7 +542,7 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint
 		TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
 
 		// output
-		uint64_t *outpHash = &g_hash[8 * hashPosition];
+		uint64_t *outpHash = &g_hash[hashPosition * 8];
 
 		#pragma unroll 8
 		for(int i=0; i<8; i++)
@@ -550,57 +550,6 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint
 	}
 }
 
-__global__ __launch_bounds__(128,6)
-void skein512_gpu_hash_close(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
-{
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint2 t0 = vectorize(8); // extra
-		uint2 t1 = vectorize(0xFF00000000000000ull); // etype
-		uint2 t2 = vectorize(0xB000000000000050ull);
-
-		uint64_t *state = &g_hash[8 * thread];
-		uint2 h0 = vectorize(state[0]);
-		uint2 h1 = vectorize(state[1]);
-		uint2 h2 = vectorize(state[2]);
-		uint2 h3 = vectorize(state[3]);
-		uint2 h4 = vectorize(state[4]);
-		uint2 h5 = vectorize(state[5]);
-		uint2 h6 = vectorize(state[6]);
-		uint2 h7 = vectorize(state[7]);
-		uint2 h8;
-		TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
-
-		uint2 p[8] = { 0 };
-
-		TFBIG_4e_UI2(0);
-		TFBIG_4o_UI2(1);
-		TFBIG_4e_UI2(2);
-		TFBIG_4o_UI2(3);
-		TFBIG_4e_UI2(4);
-		TFBIG_4o_UI2(5);
-		TFBIG_4e_UI2(6);
-		TFBIG_4o_UI2(7);
-		TFBIG_4e_UI2(8);
-		TFBIG_4o_UI2(9);
-		TFBIG_4e_UI2(10);
-		TFBIG_4o_UI2(11);
-		TFBIG_4e_UI2(12);
-		TFBIG_4o_UI2(13);
-		TFBIG_4e_UI2(14);
-		TFBIG_4o_UI2(15);
-		TFBIG_4e_UI2(16);
-		TFBIG_4o_UI2(17);
-		TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
-
-		uint64_t *outpHash = state;
-		#pragma unroll 8
-		for (int i = 0; i < 8; i++)
-			outpHash[i] = devectorize(p[i]);
-	}
-}
-
 __global__ __launch_bounds__(128,5)
 void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *output64, int swap)
 {
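Why the separate _sm3 kernels: the fast path keeps each 64-bit Skein word in a uint2 and rotates it with the funnel-shift instructions available from SM 3.5 on, which SM 3.0 parts (e.g. GK104) lack; the _sm3 variants therefore stay on plain uint64_t arithmetic that nvcc can lower on any target. A minimal sketch of the two rotation styles, using CUDA's __funnelshift_l intrinsic (illustrative only, not code from this patch; ccminer's own helpers are the ROTL64/ROL2 macros in cuda_helper.h):

__device__ __forceinline__ uint64_t rotl64_plain(const uint64_t x, const int n)
{
	// portable form used on the SM 3.0 path (n in 1..63)
	return (x << n) | (x >> (64 - n));
}

#if __CUDA_ARCH__ >= 350
__device__ __forceinline__ uint2 rotl64_u2(const uint2 x, const int n)
{
	// x.y:x.x holds one 64-bit word; two funnel shifts rotate the pair
	uint2 r;
	if (n < 32) {
		r.y = __funnelshift_l(x.x, x.y, n);      // new high word
		r.x = __funnelshift_l(x.y, x.x, n);      // new low word
	} else {
		r.y = __funnelshift_l(x.y, x.x, n - 32); // a rotate by 32 swaps halves
		r.x = __funnelshift_l(x.x, x.y, n - 32);
	}
	return r;
}
#endif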
@@ -695,25 +644,219 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp
 		TFBIG_4o_UI2(17);
 		TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
 
-		h0 = vectorize(c_PaddedMessage80[8]) ^ p[0];
-		h1 = nounce2 ^ p[1];
-		h2 = p[2];
-		h3 = p[3];
-		h4 = p[4];
-		h5 = p[5];
-		h6 = p[6];
-		h7 = p[7];
+		uint64_t *outpHash = &output64[thread * 8];
+		outpHash[0] = c_PaddedMessage80[8] ^ devectorize(p[0]);
+		outpHash[1] = devectorize(nounce2 ^ p[1]);
+		#pragma unroll
+		for(int i=2; i<8; i++)
+			outpHash[i] = devectorize(p[i]);
+	}
+}
+
+__global__
+void skein512_gpu_hash_80_sm3(uint32_t threads, uint32_t startNounce, uint64_t *output64, int swap)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8;
+		uint64_t t0, t1, t2;
+
+		// Init
+		h0 = 0x4903ADFF749C51CEull;
+		h1 = 0x0D95DE399746DF03ull;
+		h2 = 0x8FD1934127C79BCEull;
+		h3 = 0x9A255629FF352CB1ull;
+		h4 = 0x5DB62599DF6CA7B0ull;
+		h5 = 0xEABE394CA9D5C3F4ull;
+		h6 = 0x991112C71A75B523ull;
+		h7 = 0xAE18A40B660FCC33ull;
+
+		t0 = 64; // ptr
+		//t1 = vectorize(0xE0ull << 55); // etype
+		t1 = 0x7000000000000000ull;
+		TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
+
+		uint64_t p[8];
+		#pragma unroll 8
+		for (int i = 0; i<8; i++)
+			p[i] = c_PaddedMessage80[i];
+
+		TFBIG_4e(0);
+		TFBIG_4o(1);
+		TFBIG_4e(2);
+		TFBIG_4o(3);
+		TFBIG_4e(4);
+		TFBIG_4o(5);
+		TFBIG_4e(6);
+		TFBIG_4o(7);
+		TFBIG_4e(8);
+		TFBIG_4o(9);
+		TFBIG_4e(10);
+		TFBIG_4o(11);
+		TFBIG_4e(12);
+		TFBIG_4o(13);
+		TFBIG_4e(14);
+		TFBIG_4o(15);
+		TFBIG_4e(16);
+		TFBIG_4o(17);
+		TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
+
+		h0 = c_PaddedMessage80[0] ^ p[0];
+		h1 = c_PaddedMessage80[1] ^ p[1];
+		h2 = c_PaddedMessage80[2] ^ p[2];
+		h3 = c_PaddedMessage80[3] ^ p[3];
+		h4 = c_PaddedMessage80[4] ^ p[4];
+		h5 = c_PaddedMessage80[5] ^ p[5];
+		h6 = c_PaddedMessage80[6] ^ p[6];
+		h7 = c_PaddedMessage80[7] ^ p[7];
+
+		uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread;
+		uint64_t nonce64 = MAKE_ULONGLONG(_LOWORD(c_PaddedMessage80[9]), nonce);
+
+		// skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16
+		p[0] = c_PaddedMessage80[8];
+		p[1] = nonce64;
+
+		#pragma unroll
+		for (int i = 2; i < 8; i++)
+			p[i] = 0ull;
+
+		t0 = 0x50ull; // SPH_T64(bcount << 6) + (sph_u64)(extra);
+		t1 = 0xB000000000000000ull; // (bcount >> 58) + ((sph_u64)(etype) << 55);
+
+		TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
+		TFBIG_4e(0);
+		TFBIG_4o(1);
+		TFBIG_4e(2);
+		TFBIG_4o(3);
+		TFBIG_4e(4);
+		TFBIG_4o(5);
+		TFBIG_4e(6);
+		TFBIG_4o(7);
+		TFBIG_4e(8);
+		TFBIG_4o(9);
+		TFBIG_4e(10);
+		TFBIG_4o(11);
+		TFBIG_4e(12);
+		TFBIG_4o(13);
+		TFBIG_4e(14);
+		TFBIG_4o(15);
+		TFBIG_4e(16);
+		TFBIG_4o(17);
+		TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
 
 		// skein_big_close 2nd loop -> etype = 0x1fe, ptr = 8, bcount = 0
+
 		// output
 		uint64_t *outpHash = &output64[thread * 8];
-		outpHash[0] = devectorize(h0);
-		outpHash[1] = devectorize(h1);
-		outpHash[2] = devectorize(h2);
-		outpHash[3] = devectorize(h3);
-		outpHash[4] = devectorize(h4);
-		outpHash[5] = devectorize(h5);
-		outpHash[6] = devectorize(h6);
-		outpHash[7] = devectorize(h7);
+		outpHash[0] = c_PaddedMessage80[8] ^ p[0];
+		outpHash[1] = nonce64 ^ p[1];
+		#pragma unroll
+		for(int i=2; i<8; i++)
+			outpHash[i] = p[i];
+	}
+}
+
+__global__ __launch_bounds__(128,6)
+void skein512_gpu_hash_close(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint2 t0 = vectorize(8); // extra
+		uint2 t1 = vectorize(0xFF00000000000000ull); // etype
+		uint2 t2 = vectorize(0xB000000000000050ull);
+
+		uint64_t *state = &g_hash[thread * 8];
+		uint2 h0 = vectorize(state[0]);
+		uint2 h1 = vectorize(state[1]);
+		uint2 h2 = vectorize(state[2]);
+		uint2 h3 = vectorize(state[3]);
+		uint2 h4 = vectorize(state[4]);
+		uint2 h5 = vectorize(state[5]);
+		uint2 h6 = vectorize(state[6]);
+		uint2 h7 = vectorize(state[7]);
+		uint2 h8;
+		TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
+
+		uint2 p[8] = { 0 };
+
+		TFBIG_4e_UI2(0);
+		TFBIG_4o_UI2(1);
+		TFBIG_4e_UI2(2);
+		TFBIG_4o_UI2(3);
+		TFBIG_4e_UI2(4);
+		TFBIG_4o_UI2(5);
+		TFBIG_4e_UI2(6);
+		TFBIG_4o_UI2(7);
+		TFBIG_4e_UI2(8);
+		TFBIG_4o_UI2(9);
+		TFBIG_4e_UI2(10);
+		TFBIG_4o_UI2(11);
+		TFBIG_4e_UI2(12);
+		TFBIG_4o_UI2(13);
+		TFBIG_4e_UI2(14);
+		TFBIG_4o_UI2(15);
+		TFBIG_4e_UI2(16);
+		TFBIG_4o_UI2(17);
+		TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
+
+		uint64_t *outpHash = state;
+		#pragma unroll 8
+		for (int i = 0; i < 8; i++)
+			outpHash[i] = devectorize(p[i]);
+	}
+}
+
+__global__ __launch_bounds__(128,6)
+void skein512_gpu_hash_close_sm3(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t t0 = 8ull; // extra
+		uint64_t t1 = 0xFF00000000000000ull; // etype
+		uint64_t t2 = 0xB000000000000050ull;
+
+		uint64_t *state = &g_hash[thread * 8];
+
+		uint64_t h0 = state[0];
+		uint64_t h1 = state[1];
+		uint64_t h2 = state[2];
+		uint64_t h3 = state[3];
+		uint64_t h4 = state[4];
+		uint64_t h5 = state[5];
+		uint64_t h6 = state[6];
+		uint64_t h7 = state[7];
+		uint64_t h8;
+		TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
+
+		uint64_t p[8] = { 0 };
+
+		TFBIG_4e(0);
+		TFBIG_4o(1);
+		TFBIG_4e(2);
+		TFBIG_4o(3);
+		TFBIG_4e(4);
+		TFBIG_4o(5);
+		TFBIG_4e(6);
+		TFBIG_4o(7);
+		TFBIG_4e(8);
+		TFBIG_4o(9);
+		TFBIG_4e(10);
+		TFBIG_4o(11);
+		TFBIG_4e(12);
+		TFBIG_4o(13);
+		TFBIG_4e(14);
+		TFBIG_4o(15);
+		TFBIG_4e(16);
+		TFBIG_4o(17);
+		TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
+
+		uint64_t *outpHash = state;
+		#pragma unroll 8
+		for (int i = 0; i < 8; i++)
+			outpHash[i] = p[i];
 	}
 }
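The hard-coded tweak words in hash_80_sm3 can be checked against the sph_skein formulas quoted in its comments (t0 counts bytes processed, t1 carries the block-type field). A small host-side sanity check, illustrative only; the MAKE_ULONGLONG(lo, hi) packing, low word first, is an assumption inferred from its use above:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* first 64-byte block: t0 = ptr = 64, t1 = etype << 55 with etype = 0xE0 */
	assert(((uint64_t)0xE0 << 55) == 0x7000000000000000ull);

	/* final block: bcount = 1, extra = 16 -> t0 = (1 << 6) + 16 = 0x50,
	 * etype = 0x160 -> t1 = (1 >> 58) + (0x160 << 55) = 0xB000000000000000 */
	assert(((1ull << 6) + 16) == 0x50);
	assert(((1ull >> 58) + ((uint64_t)0x160 << 55)) == 0xB000000000000000ull);

	/* nonce64 splices the scanned nonce into the high half of message word 9,
	 * mirroring MAKE_ULONGLONG(_LOWORD(c_PaddedMessage80[9]), nonce) above */
	uint64_t word9 = 0x1122334455667788ull; /* c_PaddedMessage80[9] stand-in */
	uint32_t nonce = 0xA0B0C0D0u;
	uint64_t nonce64 = (word9 & 0xFFFFFFFFull) | ((uint64_t)nonce << 32);
	assert(nonce64 == 0xA0B0C0D055667788ull);
	return 0;
}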
@@ -738,33 +881,39 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
 
 	if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
 		quark_skein512_gpu_hash_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	else
-		quark_skein512_gpu_hash_64_v30 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+		quark_skein512_gpu_hash_64_sm3 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 
-/* skeincoin */
+/* skein / skein2 */
 
 __host__
 void skein512_cpu_setBlock_80(void *pdata)
 {
-	uint32_t PaddedMessage[32] = { 0 };
-	memcpy(&PaddedMessage[0], pdata, 80);
+	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, 80, 0, cudaMemcpyHostToDevice);
 
-	CUDA_SAFE_CALL(
-		cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)
-	);
+	CUDA_SAFE_CALL(cudaStreamSynchronize(NULL));
 }
 
 __host__
-void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap)
+void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *g_hash, int swap)
 {
 	const uint32_t threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// hash function is cut in 2 parts
-	skein512_gpu_hash_80 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, swap);
-	skein512_gpu_hash_close <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash);
+	int dev_id = device_map[thr_id];
+	uint64_t *d_hash = (uint64_t*) g_hash;
+
+	if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) {
+		// hash function is cut in 2 parts to reduce kernel size
+		skein512_gpu_hash_80 <<< grid, block >>> (threads, startNounce, d_hash, swap);
+		skein512_gpu_hash_close <<< grid, block >>> (threads, startNounce, d_hash);
+	} else {
+		// variant without uint2 variables
+		skein512_gpu_hash_80_sm3 <<< grid, block >>> (threads, startNounce, d_hash, swap);
+		skein512_gpu_hash_close_sm3 <<< grid, block >>> (threads, startNounce, d_hash);
+	}
 }
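The dispatch above relies on two tables ccminer fills at startup: device_sm[] (the compute capability the GPU reports at runtime) and cuda_arch[] (the __CUDA_ARCH__ the kernels were actually compiled for); both must exceed 300 before the uint2/funnel-shift path is launched. A standalone sketch of the runtime half (the helper name is hypothetical; the major*100 + minor*10 encoding is assumed from the values compared above):

#include <cuda_runtime.h>

// hypothetical helper: returns e.g. 300 for SM 3.0 (GTX 680), 520 for SM 5.2
static int query_device_sm(int dev_id)
{
	cudaDeviceProp props;
	if (cudaGetDeviceProperties(&props, dev_id) != cudaSuccess)
		return 0;
	return props.major * 100 + props.minor * 10;
}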
diff --git a/skein.cu b/skein.cu
index 76f8da9..5cb5346 100644
--- a/skein.cu
+++ b/skein.cu
@@ -12,6 +12,8 @@
 
 static uint32_t *d_hash[MAX_GPUS];
 
+extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
+
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 
@@ -305,13 +307,15 @@ void sha2_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *hashBuff
 }
 
 __host__
-void sha2_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHashes, int order)
+void sha2_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHashes)
 {
 	uint32_t threadsperblock = 128;
 	dim3 block(threadsperblock);
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 
-	//cudaMemset(d_outputHashes, 0, 64 * threads);
 	sha2_gpu_hash_64 <<< grid, block >>>(threads, startNounce, d_outputHashes);
+
+	// required once per scan loop to prevent cpu 100% usage (linux)
 	MyStreamSynchronize(NULL, 0, thr_id);
 }
 
@@ -339,10 +343,11 @@ static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
 
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata,
-	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done)
+extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
 {
+	uint32_t _ALIGN(64) endiandata[20];
+
 	const uint32_t first_nonce = pdata[19];
 	const int swap = 1;
 
@@ -357,31 +362,33 @@ extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata,
 		cudaDeviceReset();
 		cudaSetDevice(device_map[thr_id]);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
+		cudaMalloc(&d_hash[thr_id], throughput * 64U);
 
+		quark_skein512_cpu_init(thr_id, throughput);
 		cuda_check_cpu_init(thr_id, throughput);
+
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
 		init[thr_id] = true;
 	}
 
-	uint32_t endiandata[20];
-	for (int k=0; k < 20; k++)
+	for (int k=0; k < 19; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
 	skein512_cpu_setBlock_80((void*)endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
-		int order = 0;
-
-		*hashes_done = pdata[19] - first_nonce + throughput;
-
 		// Hash with CUDA
 		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
-		sha2_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		sha2_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
 
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash64[8];
 			endiandata[19] = swab32_if(foundNonce, swap);
 			skeincoinhash(vhash64, endiandata);

diff --git a/skein2.cu b/skein2.cu
index 17fd51e..ed6c787 100644
--- a/skein2.cu
+++ b/skein2.cu
@@ -10,6 +10,8 @@
 
 static uint32_t *d_hash[MAX_GPUS];
 
+extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
+
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 
@@ -33,9 +35,8 @@ extern "C" void skein2hash(void *output, const void *input)
 
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
-	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done)
+extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
 
@@ -50,9 +51,13 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 		cudaDeviceReset();
 		cudaSetDevice(device_map[thr_id]);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64UL * throughput));
+		cudaMalloc(&d_hash[thr_id], throughput * 64U);
+
+		quark_skein512_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
+
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
 
-		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
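For reference, the swab32_if() helper used when re-checking a found nonce is the one-liner visible in the hunk context above; its body sits outside the hunks shown, so this is a sketch (swab32 being ccminer's 32-bit byte swap):

static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
	return iftrue ? swab32(val) : val;
}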