diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu index a644452..418ca07 100644 --- a/Algo256/cuda_blake256.cu +++ b/Algo256/cuda_blake256.cu @@ -1,8 +1,10 @@ /** -* Blake-256 Cuda Kernel (Tested on SM 5.0) -* -* Tanguy Pruvot - Nov. 2014 -*/ + * Blake-256 Cuda Kernel (Tested on SM 5.0) + * + * Tanguy Pruvot - Nov. 2014 + * + * + merged blake+keccak kernel for lyra2v2 + */ extern "C" { #include "sph/sph_blake.h" } @@ -14,20 +16,17 @@ extern "C" { #ifdef __INTELLISENSE__ /* just for vstudio code colors */ __device__ uint32_t __byte_perm(uint32_t a, uint32_t b, uint32_t c); - #endif + #define UINT2(x,y) make_uint2(x,y) -__device__ __inline__ uint2 ROR8(const uint2 a) -{ +__device__ __inline__ uint2 ROR8(const uint2 a) { uint2 result; result.x = __byte_perm(a.y, a.x, 0x0765); result.y = __byte_perm(a.x, a.y, 0x0765); - return result; } - static __device__ uint64_t cuda_swab32ll(uint64_t x) { return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); } @@ -193,12 +192,12 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3 m[2] = block[2]; m[3] = block[3]; -#pragma unroll + #pragma unroll for (int i = 4; i < 16; i++) { m[i] = c_Padding[i]; } -#pragma unroll 8 + #pragma unroll 8 for (int i = 0; i < 8; i++) v[i] = h[i]; @@ -212,7 +211,7 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3 v[14] = u256[6]; v[15] = u256[7]; -#pragma unroll 14 + #pragma unroll 14 for (int r = 0; r < 14; r++) { /* column step */ GS2(0, 4, 0x8, 0xC, 0x0); @@ -226,7 +225,7 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3 GS2(3, 4, 0x9, 0xE, 0xE); } -#pragma unroll 16 + #pragma unroll 16 for (int i = 0; i < 16; i++) { int j = i & 7; h[j] ^= v[i]; @@ -238,10 +237,10 @@ static void __forceinline__ __device__ keccak_block(uint2 *s) uint2 bc[5], tmpxor[5], u, v; // uint2 s[25]; -#pragma unroll 1 + #pragma unroll 1 for (int i = 0; i < 24; i++) { -#pragma unroll + #pragma unroll for (uint32_t x = 0; x < 5; x++) tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; @@ -297,10 +296,10 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc if (thread < threads) { const uint32_t nonce = startNonce + thread; - uint32_t h[8]; - // uint32_t input[4]; const uint32_t T0 = 640; -#pragma unroll 8 + + uint32_t h[8]; + #pragma unroll 8 for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; } uint32_t v[16]; @@ -311,8 +310,7 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc 0, 1, 0, 640 }; - const uint32_t u256[16] = - { + const uint32_t u256[16] = { 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, @@ -323,15 +321,14 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc 0x3F84D5B5, 0xB5470917 }; - uint32_t m[16] = - { + uint32_t m[16] = { c_data[0], c_data[1], c_data[2], nonce, c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] }; -#pragma unroll 8 + #pragma unroll 8 for (int i = 0; i < 8; i++) v[i] = h[i]; @@ -380,7 +377,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc GSPREC(1, 6, 0xB, 0xC, 5, 10); GSPREC(2, 7, 0x8, 0xD, 4, 0); GSPREC(3, 4, 0x9, 0xE, 15, 8); - // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, GSPREC(0, 4, 0x8, 0xC, 9, 0); GSPREC(1, 5, 0x9, 0xD, 5, 7); @@ -399,7 +395,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc GSPREC(1, 6, 0xB, 0xC, 7, 5); GSPREC(2, 7, 0x8, 0xD, 15, 14); GSPREC(3, 4, 0x9, 0xE, 1, 9); - // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, GSPREC(0, 4, 0x8, 0xC, 12, 5); GSPREC(1, 5, 0x9, 0xD, 1, 15); @@ -409,7 +404,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc GSPREC(1, 6, 0xB, 0xC, 6, 3); GSPREC(2, 7, 0x8, 0xD, 9, 2); GSPREC(3, 4, 0x9, 0xE, 8, 11); - // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, GSPREC(0, 4, 0x8, 0xC, 13, 11); GSPREC(1, 5, 0x9, 0xD, 7, 14); @@ -446,7 +440,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc GSPREC(1, 6, 0xB, 0xC, 10, 11); GSPREC(2, 7, 0x8, 0xD, 12, 13); GSPREC(3, 4, 0x9, 0xE, 14, 15); - // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, GSPREC(0, 4, 0x8, 0xC, 14, 10); GSPREC(1, 5, 0x9, 0xD, 4, 8); @@ -456,7 +449,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc GSPREC(1, 6, 0xB, 0xC, 0, 2); GSPREC(2, 7, 0x8, 0xD, 11, 7); GSPREC(3, 4, 0x9, 0xE, 5, 3); - // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, GSPREC(0, 4, 0x8, 0xC, 11, 8); GSPREC(1, 5, 0x9, 0xD, 12, 0); @@ -476,9 +468,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc GSPREC(2, 7, 0x8, 0xD, 4, 0); GSPREC(3, 4, 0x9, 0xE, 15, 8); - - - h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); @@ -501,14 +490,12 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc keccak_gpu_state[16] = UINT2(0, 0x80000000); keccak_block(keccak_gpu_state); + uint64_t *outputHash = (uint64_t *)Hash; -#pragma unroll 4 + #pragma unroll 4 for (int i = 0; i<4; i++) outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]); } - - - } __global__ __launch_bounds__(256, 3) @@ -520,16 +507,16 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin uint32_t h[8]; uint32_t input[4]; -#pragma unroll + #pragma unroll for (int i = 0; i < 8; i++) h[i] = cpu_h[i]; -#pragma unroll + #pragma unroll for (int i = 0; i < 3; ++i) input[i] = c_data[i]; input[3] = startNonce + thread; blake256_compress2nd(h, input, 640); -#pragma unroll + #pragma unroll for (int i = 0; i<4; i++) { Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2 * i + 1])); } @@ -568,6 +555,8 @@ void blake256_cpu_init(int thr_id, uint32_t threads) cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice); } +/** for lyra2v2 **/ + __host__ void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) { @@ -576,7 +565,7 @@ void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - blakeKeccak256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); + blakeKeccak256_gpu_hash_80 <<>> (threads, startNonce, (uint32_t *)Hash); } __host__ @@ -587,5 +576,5 @@ void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - blakeKeccak256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); -} \ No newline at end of file + blakeKeccak256_gpu_hash_80 <<>> (threads, startNonce, (uint32_t *)Hash); +} diff --git a/Algo256/cuda_cubehash256.cu b/Algo256/cuda_cubehash256.cu index b067ab5..153e87a 100644 --- a/Algo256/cuda_cubehash256.cu +++ b/Algo256/cuda_cubehash256.cu @@ -267,9 +267,9 @@ void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) } #if __CUDA_ARCH__ >= 500 -__global__ __launch_bounds__(TPB50, 1) +__global__ __launch_bounds__(TPB50, 1) #else -__global__ __launch_bounds__(TPB35, 1) +__global__ __launch_bounds__(TPB35, 1) #endif void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) { @@ -354,8 +354,9 @@ void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, dim3 grid((threads + tpb - 1) / tpb); dim3 block(tpb); - cubehash256_gpu_hash_32 << > > (threads, startNounce, (uint2*)d_hash); + cubehash256_gpu_hash_32 <<>> (threads, startNounce, (uint2*)d_hash); } + __host__ void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order, cudaStream_t stream) { @@ -365,5 +366,5 @@ void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, dim3 grid((threads + tpb - 1) / tpb); dim3 block(tpb); - cubehash256_gpu_hash_32 << > > (threads, startNounce, (uint2*)d_hash); -} \ No newline at end of file + cubehash256_gpu_hash_32 <<>> (threads, startNounce, (uint2*)d_hash); +} diff --git a/Algo256/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu index 9a3874a..7e87bb2 100644 --- a/Algo256/cuda_keccak256.cu +++ b/Algo256/cuda_keccak256.cu @@ -212,6 +212,7 @@ void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, ui memcpy(resNonces, h_nonces[thr_id], NBN*sizeof(uint32_t)); } +#if 0 #if __CUDA_ARCH__ <= 500 __global__ __launch_bounds__(TPB50, 2) #else @@ -306,6 +307,7 @@ void keccak256_cpu_hash_32(const int thr_id,const uint32_t threads, uint2* d_has keccak256_gpu_hash_32 <<>> (threads, d_hash); } +#endif __host__ void keccak256_setBlock_80(uint64_t *endiandata) diff --git a/Algo256/cuda_keccak256_sm3.cu b/Algo256/cuda_keccak256_sm3.cu index e8bb42c..ff46932 100644 --- a/Algo256/cuda_keccak256_sm3.cu +++ b/Algo256/cuda_keccak256_sm3.cu @@ -231,6 +231,7 @@ uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounc return result; } +#if 0 __global__ __launch_bounds__(256,3) void keccak256_sm3_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) { @@ -282,6 +283,7 @@ void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, u keccak256_sm3_gpu_hash_32 <<>> (threads, startNounce, d_outputHash); MyStreamSynchronize(NULL, order, thr_id); } +#endif __host__ void keccak256_sm3_setBlock_80(void *pdata,const void *pTargetIn) diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 69847c8..d3aeabc 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.2.3" +#define PACKAGE_VERSION "2.2.4" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/configure.ac b/configure.ac index d284cf6..906f31d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2.3], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2.4], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index 96641b3..b3ad49f 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -13,12 +13,14 @@ static uint64_t* d_hash[MAX_GPUS]; static uint64_t* d_matrix[MAX_GPUS]; extern void blake256_cpu_init(int thr_id, uint32_t threads); -extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); extern void blake256_cpu_setBlock_80(uint32_t *pdata); +//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); -extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); -extern void keccak256_sm3_init(int thr_id, uint32_t threads); -extern void keccak256_sm3_free(int thr_id); +//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +//extern void keccak256_sm3_init(int thr_id, uint32_t threads); +//extern void keccak256_sm3_free(int thr_id); + +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void skein256_cpu_init(int thr_id, uint32_t threads); @@ -98,10 +100,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); blake256_cpu_init(thr_id, throughput); - keccak256_sm3_init(thr_id, throughput); + //keccak256_sm3_init(thr_id, throughput); skein256_cpu_init(thr_id, throughput); groestl256_cpu_init(thr_id, throughput); + //cuda_get_arch(thr_id); if (device_sm[dev_id] >= 500) { size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; @@ -124,8 +127,9 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, do { int order = 0; - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); @@ -187,7 +191,7 @@ extern "C" void free_lyra2(int thr_id) cudaFree(d_hash[thr_id]); cudaFree(d_matrix[thr_id]); - keccak256_sm3_free(thr_id); + //keccak256_sm3_free(thr_id); groestl256_cpu_free(thr_id); init[thr_id] = false; diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index a7298a2..715f311 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -14,12 +14,14 @@ static uint64_t *d_hash[MAX_GPUS]; static uint64_t* d_matrix[MAX_GPUS]; extern void blake256_cpu_init(int thr_id, uint32_t threads); -extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); extern void blake256_cpu_setBlock_80(uint32_t *pdata); +//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); -extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); -extern void keccak256_sm3_init(int thr_id, uint32_t threads); -extern void keccak256_sm3_free(int thr_id); +//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +//extern void keccak256_sm3_init(int thr_id, uint32_t threads); +//extern void keccak256_sm3_free(int thr_id); + +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void skein256_cpu_init(int thr_id, uint32_t threads); @@ -103,10 +105,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); blake256_cpu_init(thr_id, throughput); - keccak256_sm3_init(thr_id,throughput); + //keccak256_sm3_init(thr_id,throughput); skein256_cpu_init(thr_id, throughput); bmw256_cpu_init(thr_id, throughput); + cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256 + // SM 3 implentation requires a bit more memory if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) matrix_sz = 16 * sizeof(uint64_t) * 4 * 4; @@ -130,8 +134,9 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc do { int order = 0; - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); @@ -196,7 +201,7 @@ extern "C" void free_lyra2v2(int thr_id) cudaFree(d_matrix[thr_id]); bmw256_cpu_free(thr_id); - keccak256_sm3_free(thr_id); + //keccak256_sm3_free(thr_id); init[thr_id] = false; diff --git a/lyra2/lyra2Z.cu b/lyra2/lyra2Z.cu index eb01d58..7d84b3c 100644 --- a/lyra2/lyra2Z.cu +++ b/lyra2/lyra2Z.cu @@ -60,6 +60,7 @@ extern "C" int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce CUDA_LOG_ERROR(); } + cuda_get_arch(thr_id); int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16; if (device_sm[dev_id] <= 500) intensity = 15; throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;