lyra2: reduce blake message len.

10 years ago · e95712a2ea
1 changed files with 10 additions and 13 deletions
--- a/Algo256/cuda_blake256.cu
+++ b/Algo256/cuda_blake256.cu
@ -15,7 +15,7 @@ static __device__ uint64_t cuda_swab32ll(uint64_t x) {
				@@ -15,7 +15,7 @@ static __device__ uint64_t cuda_swab32ll(uint64_t x) {
 	return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
 }

-__constant__ static uint32_t  c_data[20];
+__constant__ static uint32_t c_data[3+1];

 __constant__ static uint32_t sigma[16][16];
 static uint32_t  c_sigma[16][16] = {
@ -195,17 +195,16 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
				@@ -195,17 +195,16 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		const uint32_t nonce = startNonce + thread;
 		uint32_t h[8];
 		uint32_t input[4];

-		#pragma unroll 8
-		for (int i = 0; i<8; i++) { h[i] = cpu_h[i];}
+		#pragma unroll
+		for (int i = 0; i < 8; i++) h[i] = cpu_h[i];

-		#pragma unroll 3
-		for (int i = 0; i < 3; ++i) input[i] = c_data[16 + i];
+		#pragma unroll
+		for (int i = 0; i < 3; ++i) input[i] = c_data[i];

-		input[3] = nonce;
+		input[3] = startNonce + thread;
 		blake256_compress2nd(h, input, 640);

        #pragma unroll
@ -230,16 +229,14 @@ void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32
				@@ -230,16 +229,14 @@ void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32
 __host__
 void blake256_cpu_setBlock_80(uint32_t *pdata)
 {
-	uint32_t h[8];
-	uint32_t data[20];
+	uint32_t h[8], data[20];
+
 	memcpy(data, pdata, 80);
-	for (int i = 0; i<8; i++) {
-		h[i] = c_IV256[i];
-	}
+	memcpy(h, c_IV256, sizeof(c_IV256));
 	blake256_compress1st(h, pdata, 512);

 	cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice);
-	cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(c_data, &data[16], sizeof(c_data), 0, cudaMemcpyHostToDevice);
 }

 __host__