Faster Simd

On maxwell compress1 and compress2 can be run in one run instead of two.(750TI + 20KHASH)
2014-11-20 23:01:03 +01:00 · 2014-11-20 23:01:03 +01:00 · 7d88e5cca1
commit 7d88e5cca1
parent 7ce3fed611
2 changed files with 116 additions and 135 deletions
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@ -8,16 +8,16 @@
 #define TPB 64

 #include "cuda_helper.h"
-#include <stdio.h>
+//#include <stdio.h>

-int *d_state[8];
+uint32_t *d_state[8];
 uint4 *d_temp4[8];

 // texture bound to d_temp4[thr_id], for read access in Compaction kernel
 texture<uint4, 1, cudaReadModeElementType> texRef1D_128;

-__constant__ uint32_t c_perm[8][8];
-const uint32_t h_perm[8][8] = {
+__constant__ uint8_t c_perm[8][8];
+const uint8_t h_perm[8][8] = {
 	{ 2, 3, 6, 7, 0, 1, 4, 5 },
 	{ 6, 7, 2, 3, 4, 5, 0, 1 },
 	{ 7, 6, 5, 4, 3, 2, 1, 0 },
@ -36,8 +36,8 @@ const uint32_t h_IV_512[32] = {
 	0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
 };

-__constant__ int c_FFT128_8_16_Twiddle[128];
-static const int h_FFT128_8_16_Twiddle[128] = {
+__constant__ short c_FFT128_8_16_Twiddle[128];
+static const short h_FFT128_8_16_Twiddle[128] = {
 	1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
 	1,  60,   2, 120,   4,  -17,   8, -34,  16, -68,  32, 121,  64, -15, 128, -30,
 	1,  46,  60, -67,   2,   92, 120, 123,   4, -73, -17, -11,   8, 111, -34, -22,
@ -48,8 +48,8 @@ static const int h_FFT128_8_16_Twiddle[128] = {
 	1, -61, 123, -50, -34,   18, -70, -99, 128, -98,  67,  25,  17,  -9,  35, -79
 };

-__constant__ int c_FFT256_2_128_Twiddle[128];
-static const int h_FFT256_2_128_Twiddle[128] = {
+__constant__ short c_FFT256_2_128_Twiddle[128];
+static const short h_FFT256_2_128_Twiddle[128] = {
 	  1,  41,-118,  45,  46,  87, -31,  14,
 	 60,-110, 116,-127, -67,  80, -61,  69,
 	  2,  82,  21,  90,  92, -83, -62,  28,
@ -71,9 +71,10 @@ static const int h_FFT256_2_128_Twiddle[128] = {

 /************* the round function ****************/

-#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))

-#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
+#define IF(x, y, z) (((y ^ z) & x) ^ z)
+#define MAJ(x, y, z) ((z &y) | ((z|y) & x))
+

 #include "x11/simd_functions.cu"

@ -549,8 +550,7 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)
 }

 /***************************************************/
-
-__global__ void __launch_bounds__(TPB,4)
+__global__ void __launch_bounds__(TPB, 8)
 x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x)/8;
@ -576,8 +576,9 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u
 	}
 }

-__global__ void __launch_bounds__(TPB,4)
-x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
+/*
+__global__ void __launch_bounds__(TPB, 4)
+x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -590,9 +591,8 @@ x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash
 		Compression1(Hash, hashPosition, g_fft4, g_state);
 	}
 }
-
-__global__ void __launch_bounds__(TPB,4)
-x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
+__global__ void __launch_bounds__(TPB, 4)
+x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -604,9 +604,27 @@ x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash
 		Compression2(hashPosition, g_fft4, g_state);
 	}
 }
+*/

-__global__ void __launch_bounds__(TPB,4)
-x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
+__global__ void __launch_bounds__(TPB, 4)
+x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+
+		int hashPosition = nounce - startNounce;
+		uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
+
+		Compression1(Hash, hashPosition, g_fft4, g_state);
+		Compression2(hashPosition, g_fft4, g_state);
+	}
+}
+
+
+__global__ void  __launch_bounds__(TPB, 4)
+x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -658,9 +676,7 @@ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint

 	dim3 grid((threads + threadsperblock-1)/threadsperblock);

-	x11_simd512_gpu_compress1_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
-	x11_simd512_gpu_compress2_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
-
+	x11_simd512_gpu_compress_64_maxwell << <grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
 	x11_simd512_gpu_final_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);

 	MyStreamSynchronize(NULL, order, thr_id);
--- a/x11/simd_functions.cu
+++ b/x11/simd_functions.cu
@ -1,10 +1,9 @@
 __device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for(int j=0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -24,17 +23,16 @@ __device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[6];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for(int j=0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -54,17 +52,16 @@ __device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[1];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -84,17 +81,16 @@ __device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[5];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int  j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -114,17 +110,16 @@ __device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[4];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int  j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -144,17 +139,16 @@ __device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[2];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -174,17 +168,16 @@ __device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[0];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -204,17 +197,16 @@ __device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[3];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -234,17 +226,16 @@ __device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[6];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -264,17 +255,16 @@ __device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[1];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -294,17 +284,17 @@ __device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[5];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
+
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -324,17 +314,16 @@ __device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[4];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -354,17 +343,16 @@ __device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[2];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -384,17 +372,16 @@ __device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[0];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -414,17 +401,16 @@ __device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[3];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -444,17 +430,16 @@ __device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[6];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -474,17 +459,16 @@ __device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[1];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -504,17 +488,16 @@ __device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[5];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -534,17 +517,16 @@ __device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[4];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -564,17 +546,16 @@ __device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[2];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -594,17 +575,16 @@ __device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[0];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -624,17 +604,16 @@ __device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[3];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -654,17 +633,16 @@ __device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[6];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -684,17 +662,16 @@ __device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[1];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -714,17 +691,16 @@ __device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[5];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -744,17 +720,16 @@ __device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[4];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -774,17 +749,16 @@ __device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[2];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -804,17 +778,16 @@ __device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[0];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -834,17 +807,16 @@ __device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[3];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -864,17 +836,16 @@ __device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[6];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -894,17 +865,16 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[1];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -924,17 +894,16 @@ __device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[5];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -954,17 +923,16 @@ __device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, con
 	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[4];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -984,17 +952,16 @@ __device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[2];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -1014,17 +981,16 @@ __device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[0];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -1044,17 +1010,16 @@ __device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[3];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
 __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
-	int j;
 	uint32_t temp;
 	uint32_t R[8];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		R[j] = ROTL32(A[j], r);
 	}
 	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -1074,7 +1039,7 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
 	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
 	D[7] = ROTL32(temp, s) + R[6];
 #pragma unroll 8
-	for(j=0; j<8; j++) {
+	for (int j = 0; j<8; j++) {
 		A[j] = R[j];
 	}
 }
@ -1327,7 +1292,7 @@ __device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, co
 	Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4);
 }

-__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) {
+__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
 	uint32_t A[32];
 	int i;
 #pragma unroll 32
@ -1360,10 +1325,10 @@ __device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, ui
 	STEP8_IF_35(IV[3], 25,  4, &A[8], &A[16], &A[24], A);
 }

-__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, int *g_state) {
+__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
 	uint32_t A[32];
 	int i;
-	uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)];
+	uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)];
 #pragma unroll 32
 	for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
 	SIMD_Compress2(A, texture_id, g_fft4);
@ -1396,10 +1361,10 @@ __device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t
 	STEP8_IF_35(IV[3], 25,  4, &A[8], &A[16], &A[24], A);
 }

-__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) {
+__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
 	uint32_t A[32];
 	int i;
-	uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)];
+	uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)];
 #pragma unroll 32
 	for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
 	uint32_t buffer[16];