From 940c1b3a2f5c26848d116c029275f82036434bc1 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 16 Jul 2016 12:24:50 +0200 Subject: [PATCH] lbry: small changes for second build sha512/ripemd swab this was preview 2 --- lbry/cuda_ripemd160.cu | 67 +++++++++------------------------------- lbry/cuda_sha256_lbry.cu | 28 +++++++---------- lbry/cuda_sha512_lbry.cu | 36 ++++++++++----------- lbry/lbry.cu | 2 +- 4 files changed, 44 insertions(+), 89 deletions(-) diff --git a/lbry/cuda_ripemd160.cu b/lbry/cuda_ripemd160.cu index 1ae7f35..bc4406e 100644 --- a/lbry/cuda_ripemd160.cu +++ b/lbry/cuda_ripemd160.cu @@ -37,6 +37,18 @@ static __constant__ uint32_t c_IV[5] = { 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u }; +__device__ __forceinline__ +uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) { + uint32_t result; +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b" //0x96 = 0xF0 ^ 0xCC ^ 0xAA + : "=r"(result) : "r"(a), "r"(b),"r"(c)); +#else + result = a^b^c; +#endif + return result; +} + //__host__ //uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) { // return c ^ (a | !b); @@ -83,7 +95,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) #define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) #define F5(x, y, z) ((x) ^ ((y) | ~(z))) #else -#define F1(x, y, z) xor3(x,y,z) +#define F1(x, y, z) xor3b(x,y,z) #define F2(x, y, z) xandx(x,y,z) #define F3(x, y, z) xornot64(x,y,z) #define F4(x, y, z) xandx(z,x,y) @@ -305,59 +317,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) h[0] = tmp; \ } -#if 0 -__global__ -void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset) -{ - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U + byteOffset/8]); - - uint32_t in[16]; - for (int i=0; i<8; i++) - in[i] = (hash[i]); - in[8] = 0x80; - - #pragma unroll - for (int i=9;i<16;i++) in[i] = 0; - - in[14] = 0x100; // size in bits - - uint32_t h[5]; - #pragma unroll - for (int i=0; i<5; i++) - h[i] = c_IV[i]; - - RIPEMD160_ROUND_BODY(in, h); - - #pragma unroll - for (int i=0; i<5; i++) - hash[i] = h[i]; - -#ifdef PAD_ZEROS - // 20 bytes hash on 32 or 64 bytes output space - hash[5] = 0; - hash[6] = 0; - hash[7] = 0; -#endif - } -} - -__host__ -void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream) -{ - const uint32_t threadsperblock = 128; - - dim3 grid(threads/threadsperblock); - dim3 block(threadsperblock); - - lbry_ripemd160_gpu_hash_32 <<>> (threads, (uint64_t*) g_Hash, byteOffset); -} -#endif - __global__ -//__launch_bounds__(256,6) void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -366,6 +326,7 @@ void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash) uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]); uint32_t in[16]; + #pragma unroll for (int i=0; i<8; i++) in[i] = (hash[i]); in[8] = 0x80; diff --git a/lbry/cuda_sha256_lbry.cu b/lbry/cuda_sha256_lbry.cu index db0c41c..c75325e 100644 --- a/lbry/cuda_sha256_lbry.cu +++ b/lbry/cuda_sha256_lbry.cu @@ -160,20 +160,6 @@ uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) { return result; } -/* -__device__ __forceinline__ -uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) { - uint32_t result; - asm("{ .reg .u32 t1; // xor3b \n\t" - "xor.b32 t1, %2, %3;\n\t" - "xor.b32 %0, %1, t1;" - "}" - : "=r"(result) : "r"(a) ,"r"(b),"r"(c)); - return result; -} -#define xor3b(a,b,c) (a ^ b ^ c) -*/ - __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) { uint32_t r1 = ROTR32(x,2); @@ -220,6 +206,13 @@ __device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, return result; } +__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.y), "=r"(result.x) : "l"(v)); + return result; +} + __device__ static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, uint32_t in, const uint32_t Kshared) @@ -393,7 +386,8 @@ void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512) uint2* output = (uint2*) input; #pragma unroll for (int i=0;i<4;i++) { - output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); + //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); + output[i] = vectorizeswap(((uint64_t*)buf)[i]); } #ifdef PAD_ZEROS #pragma unroll @@ -447,8 +441,8 @@ void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce uint2* output = (uint2*) (&outputHash[thread * 8U]); #pragma unroll for (int i=0;i<4;i++) { - output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); - //output[i] = vectorize(((uint64_t*)buf)[i]); + // //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); + output[i] = vectorizeswap(((uint64_t*)buf)[i]); } } } diff --git a/lbry/cuda_sha512_lbry.cu b/lbry/cuda_sha512_lbry.cu index 39f549b..79c0906 100644 --- a/lbry/cuda_sha512_lbry.cu +++ b/lbry/cuda_sha512_lbry.cu @@ -6,6 +6,7 @@ #include #include +//#define USE_ROT_ASM_OPT 0 #include static __constant__ uint64_t K_512[80]; @@ -36,8 +37,6 @@ static const uint64_t K512[80] = { //#undef xor3 //#define xor3(a,b,c) (a^b^c) -//#undef - static __device__ __forceinline__ uint64_t bsg5_0(const uint64_t x) { @@ -111,26 +110,17 @@ __global__ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + //if (thread < threads) { uint64_t *pHash = &g_hash[thread * 8U]; uint64_t W[80]; - uint64_t r[8]; - - uint64_t IV512[8] = { - 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, - 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 - }; - - #pragma unroll - for (int i = 0; i < 8; i++) - r[i] = IV512[i]; #pragma unroll for (int i = 0; i < 4; i++) { // 32 bytes input - W[i] = cuda_swab64(pHash[i]); + W[i] = pHash[i]; + //W[i] = cuda_swab64(pHash[i]); // made in sha256 } W[4] = 0x8000000000000000; // end tag @@ -140,13 +130,23 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash) W[15] = 0x100; // 256 bits - #pragma unroll - for (int i = 16; i < 80; i++) W[i] = 0; + //#pragma unroll + //for (int i = 16; i < 78; i++) W[i] = 0; - #pragma unroll 64 + #pragma unroll for (int i = 16; i < 80; i++) W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16]; + const uint64_t IV512[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 + }; + + uint64_t r[8]; + #pragma unroll + for (int i = 0; i < 8; i++) + r[i] = IV512[i]; + #pragma unroll 10 for (int i = 0; i < 10; i++) { #pragma unroll 8 @@ -168,7 +168,7 @@ void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStr dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - size_t shared_size = 80*8; + size_t shared_size = 0; lbry_sha512_gpu_hash_32 <<>> (threads, (uint64_t*)d_hash); } diff --git a/lbry/lbry.cu b/lbry/lbry.cu index f5da268..b81cc79 100644 --- a/lbry/lbry.cu +++ b/lbry/lbry.cu @@ -106,7 +106,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) { - ptarget[7] = 0xff; + ptarget[7] = 0xf; } if (!init[thr_id]){