diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu index 5ce26f2..afa9b3b 100644 --- a/Algo256/cuda_blake256.cu +++ b/Algo256/cuda_blake256.cu @@ -15,7 +15,7 @@ static __device__ uint64_t cuda_swab32ll(uint64_t x) { return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); } -__constant__ static uint32_t c_data[20]; +__constant__ static uint32_t c_data[3+1]; __constant__ static uint32_t sigma[16][16]; static uint32_t c_sigma[16][16] = { @@ -195,17 +195,16 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - const uint32_t nonce = startNonce + thread; uint32_t h[8]; uint32_t input[4]; - #pragma unroll 8 - for (int i = 0; i<8; i++) { h[i] = cpu_h[i];} + #pragma unroll + for (int i = 0; i < 8; i++) h[i] = cpu_h[i]; - #pragma unroll 3 - for (int i = 0; i < 3; ++i) input[i] = c_data[16 + i]; + #pragma unroll + for (int i = 0; i < 3; ++i) input[i] = c_data[i]; - input[3] = nonce; + input[3] = startNonce + thread; blake256_compress2nd(h, input, 640); #pragma unroll @@ -230,16 +229,14 @@ void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32 __host__ void blake256_cpu_setBlock_80(uint32_t *pdata) { - uint32_t h[8]; - uint32_t data[20]; + uint32_t h[8], data[20]; + memcpy(data, pdata, 80); - for (int i = 0; i<8; i++) { - h[i] = c_IV256[i]; - } + memcpy(h, c_IV256, sizeof(c_IV256)); blake256_compress1st(h, pdata, 512); cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_data, &data[16], sizeof(c_data), 0, cudaMemcpyHostToDevice); } __host__