diff --git a/gost/cuda_gosthash.cu b/gost/cuda_gosthash.cu index becdddf..3846493 100644 --- a/gost/cuda_gosthash.cu +++ b/gost/cuda_gosthash.cu @@ -577,6 +577,14 @@ void GOST_Copy512(uint64_t* dst, uint64_t* const __restrict__ src) dst[i] = src[i]; } +__device__ __forceinline__ +void GOST_Copy256(uint64_t* dst, uint64_t* const __restrict__ src) +{ + #pragma unroll + for (int i=0; i<4; i++) + dst[i] = src[i]; +} + __device__ __forceinline__ void GOST_Xor512(uint64_t* C, uint64_t* const A, const uint64_t* B) { @@ -1058,21 +1066,21 @@ void GOST_hash_X(uint64_t *hash, uchar * const message, uint64_t len) __global__ __launch_bounds__(128, 3) -void streebog_gpu_hash_64(uint32_t threads, uint64_t *g_hash) +void streebog_gpu_hash_64(uint32_t threads, uint64_t *g_hash) // 80 bytes input { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint64_t* inout = (&g_hash[thread * 8U]); uint64_t hash[8] = { 0 }; //iv - GOST_hash_X(hash, (uchar*) inout, 512); + GOST_hash_X(hash, (uchar*) inout, 640); GOST_Copy512(inout, hash); } } __global__ __launch_bounds__(128, 3) -void streebog_gpu_hash_32(uint32_t threads, uint64_t *g_hash) +void streebog_gpu_hash_32(uint32_t threads, uint64_t *g_hash) // 64 bytes input { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -1081,12 +1089,12 @@ void streebog_gpu_hash_32(uint32_t threads, uint64_t *g_hash) uint64_t hash[8]; memset (&hash, 1, 64); // iv GOST_hash_X(hash, (uchar*) inout, 512); - GOST_Copy512(inout, hash); + GOST_Copy256(inout, hash); } } __host__ -void gost_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) +void gost_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { const int threadsperblock = 128; dim3 grid((threads + threadsperblock-1) / threadsperblock); @@ -1096,7 +1104,7 @@ void gost_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t * } __host__ -void gost_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) +void gost_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash) { const int threadsperblock = 128; dim3 grid((threads + threadsperblock-1) / threadsperblock); diff --git a/gost/gost.cu b/gost/gost.cu index f145e3d..273ff6c 100644 --- a/gost/gost.cu +++ b/gost/gost.cu @@ -32,8 +32,8 @@ extern "C" void gosthash(void *output, const void *input) memcpy(output, hash, 32); } -extern void gost_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void gost_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void gost_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void gost_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash); //#define _DEBUG #define _DEBUG_PREFIX "sib" @@ -66,24 +66,23 @@ extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce, } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 80 * throughput), -1); cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; } - uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], pdata[k]); + be32enc(&d_hash[thr_id][k], pdata[k]); do { int order = 0; // Hash with CUDA - gost_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + gost_hash_64(thr_id, throughput, d_hash[thr_id]); TRACE("gost64 :"); - gost_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); + gost_hash_32(thr_id, throughput, d_hash[thr_id]); TRACE("gost32 :"); work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); @@ -91,8 +90,8 @@ extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce, { const uint32_t Htarg = ptarget[7]; uint32_t _ALIGN(64) vhash[8]; - be32enc(&endiandata[19], work->nonces[0]); - gosthash(vhash, endiandata); + be32enc(&d_hash[thr_id][19], work->nonces[0]); + gosthash(vhash, d_hash[thr_id]); if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { work->valid_nonces = 1; @@ -100,8 +99,8 @@ extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce, work->nonces[1] =cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); *hashes_done = pdata[19] - first_nonce + throughput; if (work->nonces[1] != 0) { - be32enc(&endiandata[19], work->nonces[1]); - sibhash(vhash, endiandata); + be32enc(&d_hash[thr_id][19], work->nonces[1]); + sibhash(vhash, d_hash[thr_id]); bn_set_target_ratio(work, vhash, 1); work->valid_nonces++; pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;