diff --git a/gost/cuda_gosthash.cu b/gost/cuda_gosthash.cu
index 3846493..8afacd9 100644
--- a/gost/cuda_gosthash.cu
+++ b/gost/cuda_gosthash.cu
@@ -14,6 +14,10 @@
 
 typedef unsigned char uchar;
 
+static uint32_t* d_resNonces[MAX_GPUS] = { 0 }; // per-GPU 2-slot result buffer: [0]=latest hit, [1]=previous hit
+__constant__ static uint32_t __align__(8) c_header[19]; // first 76 bytes of the 80-byte block header (nonce excluded)
+__constant__ static uint32_t __align__(8) c_target[2]; // up to 64 bits
+
 //#define FULL_UNROLL
 
 #define memcpy(dst, src, len) { \
@@ -1093,6 +1097,37 @@ void streebog_gpu_hash_32(uint32_t threads, uint64_t *g_hash) // 64 bytes input
 	}
 }
 
+__global__
+/*__launch_bounds__(256,3)*/
+void gostd_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads) // tail guard: grid may be rounded up past `threads`
+	{
+		const uint32_t nonce = startNonce + thread;
+		uint32_t dat[20]; // 80-byte input: 76 header bytes + this thread's nonce
+		#pragma unroll
+		for (int i = 0; i < 19; i++) dat[i] = c_header[i];
+		dat[19] = nonce;
+		uint64_t hash1[8] = { 0 }; // iv for GOST R 34.11-2012 512-bit
+		GOST_hash_X(hash1, (uchar*)dat, 640); // 80 bytes
+		uint64_t hash[8];
+		#pragma unroll
+		for (int i=0; i<64; i++) ((uchar *)hash)[i] = 1; // iv for 256-bit variant (all bytes 0x01)
+		GOST_hash_X(hash, (uchar *)hash1, 512); // 64 bytes
+		// result is first 32 bytes of hash
+
+		// check nonce against the 64 high target bits
+		uint64_t high = MAKE_ULONGLONG(cuda_swab32(_LODWORD(hash[3])), cuda_swab32(_HIDWORD(hash[3]))); // swab uint64_t
+		if (high <= c_target[0])
+		{
+			//printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
+			resNonces[1] = atomicExch(resNonces, nonce); // keep latest in [0], demote old hit to [1]
+			//d_target[0] = high;
+		}
+	}
+}
+
 __host__
 void gost_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
 {
@@ -1112,3 +1147,46 @@ void gost_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash)
 
 	streebog_gpu_hash_32<<<grid, block>>>(threads, (uint64_t*)d_hash);
 }
+
+// Allocate the per-GPU 2-slot result buffer (call once per thread/GPU).
+__host__
+void gostd_init(int thr_id)
+{
+	cuda_get_arch(thr_id);
+	CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t)));
+}
+
+// Release the result buffer; safe to call when never initialized.
+__host__
+void gostd_free(int thr_id)
+{
+	if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]);
+	d_resNonces[thr_id] = NULL;
+}
+
+// Upload the byte-swapped 76-byte header and the 64 high target bits
+// (ptarget[6..7]) to constant memory before launching gostd_hash_80.
+__host__
+void gostd_setBlock_80(uint32_t *pdata, uint32_t *ptarget)
+{
+	uint32_t buf[19];
+	for (int i=0;i<19;i++) buf[i] = cuda_swab32(pdata[i]);
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_header, buf, 76, 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
+}
+
+// Scan `threads` nonces from startNonce; resNonces[0]/[1] receive up to two
+// solutions, UINT32_MAX when empty.
+__host__
+void gostd_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces)
+{
+	const uint32_t threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock); // ceil-div: cover the tail when threads % 256 != 0
+	dim3 block(threadsperblock);
+
+	CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t)));
+	cudaDeviceSynchronize(); // cudaThreadSynchronize is deprecated/removed
+	gostd_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id]);
+	cudaDeviceSynchronize();
+
+	CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+	if (resNonces[0] == resNonces[1]) { // no second (distinct) solution
+		resNonces[1] = UINT32_MAX;
+	}
+}