From c8590419936c065e33d8c8523fc8a75d7f1333c2 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 17 Nov 2014 00:00:15 +0100 Subject: [PATCH] quark/blake512 opt. pointed by sp without asm indeed, the pragma unroll doesn't always make things faster asm part... to check later --- quark/cuda_quark_blake512.cu | 26 ++++++++++++++++---------- x11/x11.cu | 1 - x13/x13.cu | 1 - x15/x14.cu | 1 - x15/x15.cu | 1 - x17/x17.cu | 1 - 6 files changed, 16 insertions(+), 15 deletions(-) diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index 1231f7a..01363ee 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -59,18 +59,19 @@ const uint64_t c_u512[16] = v[b] = ROTR( v[b] ^ v[c], 11); \ } -__device__ static +__device__ __forceinline__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0) { - uint64_t v[16], m[16], i; + uint64_t v[16]; + uint64_t m[16]; - #pragma unroll 16 - for( i = 0; i < 16; i++) { + #pragma unroll + for(int i=0; i < 16; i++) { m[i] = cuda_swab64(block[i]); } - #pragma unroll 8 - for (i = 0; i < 8; i++) + //#pragma unroll 8 + for(int i=0; i < 8; i++) v[i] = h[i]; v[ 8] = u512[0]; @@ -83,7 +84,7 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t v[15] = u512[7]; //#pragma unroll 16 - for( i = 0; i < 16; ++i ) + for(int i=0; i < 16; i++) { /* column step */ G( 0, 4, 8, 12, 0 ); @@ -97,9 +98,14 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t G( 3, 4, 9, 14, 14 ); } - #pragma unroll 16 - for( i = 0; i < 16; ++i ) - h[i % 8] ^= v[i]; + h[0] ^= v[0] ^ v[8]; + h[1] ^= v[1] ^ v[9]; + h[2] ^= v[2] ^ v[10]; + h[3] ^= v[3] ^ v[11]; + h[4] ^= v[4] ^ v[12]; + h[5] ^= v[5] ^ v[13]; + h[6] ^= v[6] ^ v[14]; + h[7] ^= v[7] ^ v[15]; } // Hash-Padding diff --git a/x11/x11.cu b/x11/x11.cu index 9baed10..45afb52 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -25,7 +25,6 @@ static uint32_t 
*d_hash[8]; extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); diff --git a/x13/x13.cu b/x13/x13.cu index 31a55a5..cfb8d93 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -29,7 +29,6 @@ static uint32_t *d_hash[8]; extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); diff --git a/x15/x14.cu b/x15/x14.cu index 99c62fd..7b8121a 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -32,7 +32,6 @@ static uint32_t *d_hash[8]; extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t 
*d_nonceVector, uint32_t *d_hash, int order); diff --git a/x15/x15.cu b/x15/x15.cu index 23f6aa8..75e83f7 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -33,7 +33,6 @@ static uint32_t *d_hash[8]; extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); diff --git a/x17/x17.cu b/x17/x17.cu index 28a382f..3fbc604 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -36,7 +36,6 @@ static uint32_t *d_hash[8]; extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);