diff --git a/Algo256/decred.cu b/Algo256/decred.cu index fe6092c..8381f08 100644 --- a/Algo256/decred.cu +++ b/Algo256/decred.cu @@ -144,7 +144,7 @@ uint32_t blake256_compress_14(uint32_t *m, uint32_t *v_init, uint32_t d_data6, u // round 7 GSPREC4(0, 4, 0x8, 0xC, 12, 5, 1, 5, 0x9, 0xD, 1, 15, 2, 6, 0xA, 0xE, 14,13, 3, 7, 0xB, 0xF, 4, 10); GSPREC4(0, 5, 0xA, 0xF, 0, 7, 1, 6, 0xB, 0xC, 6, 3, 2, 7, 0x8, 0xD, 9, 2, 3, 4, 0x9, 0xE, 8, 11); - /* +#ifdef FULL_4WAY // round 8 GSPREC4(0, 4, 0x8, 0xC, 13,11, 1, 5, 0x9, 0xD, 7, 14, 2, 6, 0xA, 0xE, 12, 1, 3, 7, 0xB, 0xF, 3, 9); GSPREC4(0, 5, 0xA, 0xF, 5, 0, 1, 6, 0xB, 0xC, 15, 4, 2, 7, 0x8, 0xD, 8, 6, 3, 4, 0x9, 0xE, 2, 10); @@ -163,7 +163,7 @@ uint32_t blake256_compress_14(uint32_t *m, uint32_t *v_init, uint32_t d_data6, u // round 13 GSPREC4(0, 4, 0x8, 0xC, 11, 8, 1, 5, 0x9, 0xD, 12, 0, 2, 6, 0xA, 0xE, 5, 2, 3, 7, 0xB, 0xF, 15,13); GSPREC4(0, 5, 0xA, 0xF, 10,14, 1, 6, 0xB, 0xC, 3, 6, 2, 7, 0x8, 0xD, 7, 1, 3, 4, 0x9, 0xE, 9, 4); - */ +#else // round 8 GSPREC(0, 4, 0x8, 0xC, 13,11); GSPREC(1, 5, 0x9, 0xD, 7, 14); @@ -218,6 +218,7 @@ uint32_t blake256_compress_14(uint32_t *m, uint32_t *v_init, uint32_t d_data6, u GSPREC(1, 6, 0xB, 0xC, 3, 6); GSPREC(2, 7, 0x8, 0xD, 7, 1); GSPREC(3, 4, 0x9, 0xE, 9, 4); +#endif // round 14 GSPREC(0, 4, 0x8, 0xC, 7, 9); GSPREC(1, 5, 0x9, 0xD, 3, 1); @@ -279,12 +280,17 @@ void blake256_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, #else resNonce[0] = m[3]; #endif + // from alexis78: + // return statement allows CUDA7.5 to : + // 1. Store the values fetched from constant memory in registers. + // 2. Perform more precomputations on the outside of the for loop. + // 3. Stop the continuous fetches from the constant memory while iterating + return; } } } } - __host__ static uint32_t decred_cpu_hash_nonce(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget) {