x10 funcs cleanup, we dont need host constant tables

11 years ago · cf7351d138
6 changed files with 85 additions and 118 deletions
--- a/x11/cuda_x11_aes.cu
+++ b/x11/cuda_x11_aes.cu
@ -2,7 +2,8 @@
				@@ -2,7 +2,8 @@
 /* AES Helper for inline-usage from SPH */
 #define AESx(x) SPH_C32(x)

-static const uint32_t h_AES0[256] = {
+__device__ __constant__
+static const uint32_t d_AES0[256] = {
 	AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
 	AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
 	AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
@ -69,7 +70,8 @@ static const uint32_t h_AES0[256] = {
				@@ -69,7 +70,8 @@ static const uint32_t h_AES0[256] = {
 	AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
 };

-static const uint32_t h_AES1[256] = {
+__device__ __constant__
+static const uint32_t d_AES1[256] = {
 	AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
 	AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
 	AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
@ -136,7 +138,8 @@ static const uint32_t h_AES1[256] = {
				@@ -136,7 +138,8 @@ static const uint32_t h_AES1[256] = {
 	AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
 };

-static const uint32_t h_AES2[256] = {
+__device__ __constant__
+static const uint32_t d_AES2[256] = {
 	AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
 	AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
 	AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
@ -203,7 +206,8 @@ static const uint32_t h_AES2[256] = {
				@@ -203,7 +206,8 @@ static const uint32_t h_AES2[256] = {
 	AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
 };

-static const uint32_t h_AES3[256] = {
+__device__ __constant__
+static const uint32_t d_AES3[256] = {
 	AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
 	AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
 	AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
@ -270,35 +274,12 @@ static const uint32_t h_AES3[256] = {
				@@ -270,35 +274,12 @@ static const uint32_t h_AES3[256] = {
 	AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
 };

-static __constant__ uint32_t d_AES0[256];
-static __constant__ uint32_t d_AES1[256];
-static __constant__ uint32_t d_AES2[256];
-static __constant__ uint32_t d_AES3[256];
-
 static void aes_cpu_init()
 {
-	cudaMemcpyToSymbol( d_AES0,
-                        h_AES0,
-                        sizeof(h_AES0),
-                        0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_AES1,
-                        h_AES1,
-                        sizeof(h_AES1),
-                        0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_AES2,
-                        h_AES2,
-                        sizeof(h_AES2),
-                        0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_AES3,
-                        h_AES3,
-                        sizeof(h_AES3),
-                        0, cudaMemcpyHostToDevice);
 }

-static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
+__device__ __forceinline__
+static void aes_gpu_init(uint32_t *sharedMemory)
 {
 	if(threadIdx.x < 256)
 	{
@ -309,7 +290,8 @@ static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
				@@ -309,7 +290,8 @@ static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
 	}
 }

-static __device__ __forceinline__ void aes_round(
+__device__ __forceinline__
+static void aes_round(
 	const uint32_t *sharedMemory,
 	uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 
 	uint32_t k0, 
@ -356,7 +338,8 @@ static __device__ __forceinline__ void aes_round(
				@@ -356,7 +338,8 @@ static __device__ __forceinline__ void aes_round(
 		sharedMemory[idx3]; // ^k3
 }

-static __device__ __forceinline__ void aes_round(
+__device__ __forceinline__
+static void aes_round(
 	const uint32_t *sharedMemory,
 	uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 
 	uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
--- a/x11/cuda_x11_cubehash512.cu
+++ b/x11/cuda_x11_cubehash512.cu
@ -29,8 +29,8 @@ typedef unsigned int uint32_t; /* must be exactly 32 bits */
				@@ -29,8 +29,8 @@ typedef unsigned int uint32_t; /* must be exactly 32 bits */
 #define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
 #define SWAP(a,b) { uint32_t u = a; a = b; b = u; }

-__constant__ uint32_t c_IV_512[32];
-static const uint32_t h_IV_512[32] = {
+__device__ __constant__
+static const uint32_t c_IV_512[32] = {
 	0x2AEA2A61, 0x50F494D4, 0x2D538B8B,
 	0x4167D83E, 0x3FEE2313, 0xC701CF8C,
 	0xCC39968E, 0x50AC5695, 0x4D42C787,
@ -221,7 +221,7 @@ void __device__ Init(uint32_t x[2][2][2][2][2])
				@@ -221,7 +221,7 @@ void __device__ Init(uint32_t x[2][2][2][2][2])
    /* "the state is then transformed invertibly through 10r identical rounds */
    for (i = 0;i < 10;++i) rrounds(x);
 #else
-    uint32_t *iv = c_IV_512;
+    const uint32_t *iv = c_IV_512;

 #pragma unroll 2
    for (i = 0;i < 2;++i)
@ -297,7 +297,6 @@ __global__ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, u
				@@ -297,7 +297,6 @@ __global__ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, u
 // Setup-Funktionen
 __host__ void x11_cubehash512_cpu_init(int thr_id, int threads)
 {
-    cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
 }

 __host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
--- a/x11/cuda_x11_luffa512.cu
+++ b/x11/cuda_x11_luffa512.cu
@ -34,8 +34,8 @@ typedef struct {
				@@ -34,8 +34,8 @@ typedef struct {
    uint32_t chainv[40];   /* Chaining values */
 } hashState;

-
-static __device__ __forceinline__ uint32_t BYTES_SWAP32(uint32_t x)
+ __device__ __forceinline__
+static uint32_t BYTES_SWAP32(uint32_t x)
 {
 	return __byte_perm(x, x, 0x0123);
 }
@ -100,8 +100,8 @@ static __device__ __forceinline__ uint32_t BYTES_SWAP32(uint32_t x)
				@@ -100,8 +100,8 @@ static __device__ __forceinline__ uint32_t BYTES_SWAP32(uint32_t x)
    b0 ^= c1;

 /* initial values of chaining variables */
-__constant__ uint32_t c_IV[40];
-const uint32_t h_IV[40] = {
+__device__ __constant__
+const uint32_t c_IV[40] = {
    0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465,
    0x6e292011,0x90152df4,0xee058139,0xdef610bb,
    0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3,
@ -113,8 +113,8 @@ const uint32_t h_IV[40] = {
				@@ -113,8 +113,8 @@ const uint32_t h_IV[40] = {
    0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363,
    0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea};

-__constant__ uint32_t c_CNS[80];
-uint32_t h_CNS[80] = {
+__device__ __constant__
+uint32_t c_CNS[80] = {
    0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
    0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
    0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
@ -138,7 +138,8 @@ uint32_t h_CNS[80] = {
				@@ -138,7 +138,8 @@ uint32_t h_CNS[80] = {


 /***************************************************/
-__device__ __forceinline__ void rnd512(hashState *state)
+__device__ __forceinline__
+void rnd512(hashState *state)
 {
    int i,j;
    uint32_t t[40];
@ -284,7 +285,8 @@ __device__ __forceinline__ void rnd512(hashState *state)
				@@ -284,7 +285,8 @@ __device__ __forceinline__ void rnd512(hashState *state)
 }


-__device__ __forceinline__ void Update512(hashState *state, const BitSequence *data) 
+__device__ __forceinline__
+void Update512(hashState *state, const BitSequence *data)
 {
 #pragma unroll 8
    for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
@ -297,7 +299,8 @@ __device__ __forceinline__ void Update512(hashState *state, const BitSequence *d
				@@ -297,7 +299,8 @@ __device__ __forceinline__ void Update512(hashState *state, const BitSequence *d


 /***************************************************/
-__device__ __forceinline__ void finalization512(hashState *state, uint32_t *b)
+__device__ __forceinline__
+void finalization512(hashState *state, uint32_t *b)
 {
    int i,j;

@ -363,8 +366,6 @@ __global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint
				@@ -363,8 +366,6 @@ __global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint
 // Setup-Funktionen
 __host__ void x11_luffa512_cpu_init(int thr_id, int threads)
 {
-    cudaMemcpyToSymbol( c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice );
-    cudaMemcpyToSymbol( c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice );
 }

 __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@ -30,16 +30,16 @@ texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
				@@ -30,16 +30,16 @@ texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
    #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
 #endif

-__constant__  uint32_t c_IV_512[32];
-const uint32_t h_IV_512[32] = {
+__device__ __constant__
+const uint32_t c_IV_512[32] = {
  0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
  0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
  0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
  0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
 };

- __constant__ int c_FFT128_8_16_Twiddle[128];
- static const int h_FFT128_8_16_Twiddle[128] = {
+__device__ __constant__
+static const int c_FFT128_8_16_Twiddle[128] = {
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
@ -49,9 +49,8 @@ const uint32_t h_IV_512[32] = {
				@@ -49,9 +49,8 @@ const uint32_t h_IV_512[32] = {
 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114,
 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79};

-
-__constant__ int c_FFT256_2_128_Twiddle[128];
-static const int h_FFT256_2_128_Twiddle[128] = {
+__device__ __constant__
+static const int c_FFT256_2_128_Twiddle[128] = {
   1, 41, -118, 45, 46, 87, -31, 14,
  60, -110, 116, -127, -67, 80, -61, 69,
   2, 82, 21, 90, 92, -83, -62, 28,
@ -658,28 +657,12 @@ __host__ void x11_simd512_cpu_init(int thr_id, int threads)
				@@ -658,28 +657,12 @@ __host__ void x11_simd512_cpu_init(int thr_id, int threads)
    cudaMalloc( &d_state[thr_id], 32*sizeof(int)*threads );
    cudaMalloc( &d_temp4[thr_id], 64*sizeof(uint4)*threads );

-#if 1
    // Textur für 128 Bit Zugriffe
    cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc<uint4>();
    texRef1D_128.normalized = 0;
    texRef1D_128.filterMode = cudaFilterModePoint;
    texRef1D_128.addressMode[0] = cudaAddressModeClamp;
    cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads);
-#endif
-
-    cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
-    cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
-    cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
-
-
-	// CH
-	cudaMemcpyToSymbol( d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
-	cudaMemcpyToSymbol( d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);	
-	cudaMemcpyToSymbol( d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);	
-	cudaMemcpyToSymbol( d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
-
-//    cudaFuncSetCacheConfig(x11_simd512_gpu_compress1_64, cudaFuncCachePreferL1);
-//    cudaFuncSetCacheConfig(x11_simd512_gpu_compress2_64, cudaFuncCachePreferL1);
 }

 __host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
--- a/x11/simd_functions.cu
+++ b/x11/simd_functions.cu
@ -898,7 +898,9 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con
				@@ -898,7 +898,9 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con
 		A[j] = R[j];
 	}
 }
-__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
+
+__device__ __forceinline__
+void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
 	int j;
 	uint32_t temp;
@ -928,7 +930,9 @@ __device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, con
				@@ -928,7 +930,9 @@ __device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, con
 		A[j] = R[j];
 	}
 }
-__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
+
+__device__ __forceinline__
+void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
 	int j;
 	uint32_t temp;
@ -958,7 +962,9 @@ __device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, con
				@@ -958,7 +962,9 @@ __device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, con
 		A[j] = R[j];
 	}
 }
-__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
+
+__device__ __forceinline__
+void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
 	int j;
 	uint32_t temp;
@ -988,7 +994,9 @@ __device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, cons
				@@ -988,7 +994,9 @@ __device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, cons
 		A[j] = R[j];
 	}
 }
-__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
+
+__device__ __forceinline__
+void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
 	int j;
 	uint32_t temp;
@ -1018,7 +1026,9 @@ __device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, cons
				@@ -1018,7 +1026,9 @@ __device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, cons
 		A[j] = R[j];
 	}
 }
-__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
+
+__device__ __forceinline__
+void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
 	int j;
 	uint32_t temp;
@ -1048,7 +1058,9 @@ __device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, cons
				@@ -1048,7 +1058,9 @@ __device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, cons
 		A[j] = R[j];
 	}
 }
-__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
+
+__device__ __forceinline__
+void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
 	int j;
 	uint32_t temp;
@ -1078,8 +1090,9 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
				@@ -1078,8 +1090,9 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
 		A[j] = R[j];
 	}
 }
-static __constant__ uint32_t d_cw0[8][8];
-static const uint32_t h_cw0[8][8] = {
+
+__device__ __constant__
+static const uint32_t d_cw0[8][8] = {
 	0x531B1720, 	0xAC2CDE09, 	0x0B902D87, 	0x2369B1F4, 	0x2931AA01, 	0x02E4B082, 	0xC914C914, 	0xC1DAE1A6, 
 	0xF18C2B5C, 	0x08AC306B, 	0x27BFC914, 	0xCEDC548D, 	0xC630C4BE, 	0xF18C4335, 	0xF0D3427C, 	0xBE3DA380, 
 	0x143C02E4, 	0xA948C630, 	0xA4F2DE09, 	0xA71D2085, 	0xA439BD84, 	0x109FCD6A, 	0xEEA8EF61, 	0xA5AB1CE8, 
@ -1089,10 +1102,8 @@ static const uint32_t h_cw0[8][8] = {
				@@ -1089,10 +1102,8 @@ static const uint32_t h_cw0[8][8] = {
 	0x213E50F0, 	0x39173EDF, 	0xA9485B0E, 	0xEEA82EF9, 	0x14F55771, 	0xFAF15546, 	0x3D6DD9B3, 	0xAB73B92E, 
 	0x582A48FD, 	0xEEA81892, 	0x4F7EAA01, 	0xAF10A88F, 	0x11581720, 	0x34C124DB, 	0xD1C0AB73, 	0x1E5AF0D3  
 };
-__device__ __forceinline__ void Round8_0_final(uint32_t *A,
-		int r, int s, int t, int u) {
-
-
+__device__ __forceinline__
+void Round8_0_final(uint32_t *A, int r, int s, int t, int u) {
 	STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]);
 	STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]);
 	STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1102,8 +1113,9 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A,
				@@ -1102,8 +1113,9 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A,
 	STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]);
 	STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
 }
-static __constant__ uint32_t d_cw1[8][8];
-static const uint32_t h_cw1[8][8] = {
+
+__device__ __constant__
+static const uint32_t d_cw1[8][8] = {
 	0xC34C07F3, 	0xC914143C, 	0x599CBC12, 	0xBCCBE543, 	0x385EF3B7, 	0x14F54C9A, 	0x0AD7C068, 	0xB64A21F7, 
 	0xDEC2AF10, 	0xC6E9C121, 	0x56B8A4F2, 	0x1158D107, 	0xEB0BA88F, 	0x050FAABA, 	0xC293264D, 	0x548D46D2, 
 	0xACE5E8E0, 	0x53D421F7, 	0xF470D279, 	0xDC974E0C, 	0xD6CF55FF, 	0xFD1C4F7E, 	0x36EC36EC, 	0x3E261E5A, 
@ -1113,10 +1125,8 @@ static const uint32_t h_cw1[8][8] = {
				@@ -1113,10 +1125,8 @@ static const uint32_t h_cw1[8][8] = {
 	0xF4702B5C, 	0xC293FC63, 	0xDA6CB2AD, 	0x45601FCC, 	0xA439E1A6, 	0x4E0C0D02, 	0xED3621F7, 	0xAB73BE3D, 
 	0x0E74D4A4, 	0xF754CF95, 	0xD84136EC, 	0x3124AB73, 	0x39D03B42, 	0x0E74BCCB, 	0x0F2DBD84, 	0x41C35C80  
 };
-__device__ __forceinline__ void Round8_1_final(uint32_t *A,
-		int r, int s, int t, int u) {
-
-
+__device__ __forceinline__
+void Round8_1_final(uint32_t *A, int r, int s, int t, int u) {
 	STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]);
 	STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]);
 	STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1126,8 +1136,9 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A,
				@@ -1126,8 +1136,9 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A,
 	STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]);
 	STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
 }
-static __constant__ uint32_t d_cw2[8][8];
-static const uint32_t h_cw2[8][8] = {
+
+__device__ __constant__
+static const uint32_t d_cw2[8][8] = {
 	0xA4135BED, 	0xE10E1EF2, 	0x6C4F93B1, 	0x6E2191DF, 	0xE2E01D20, 	0xD1952E6B, 	0x6A7D9583, 	0x131DECE3, 
 	0x369CC964, 	0xFB73048D, 	0x9E9D6163, 	0x280CD7F4, 	0xD9C6263A, 	0x1062EF9E, 	0x2AC7D539, 	0xAD2D52D3, 
 	0x0A03F5FD, 	0x197CE684, 	0xAA72558E, 	0xDE5321AD, 	0xF0870F79, 	0x607A9F86, 	0xAFE85018, 	0x2AC7D539, 
@ -1137,10 +1148,8 @@ static const uint32_t h_cw2[8][8] = {
				@@ -1137,10 +1148,8 @@ static const uint32_t h_cw2[8][8] = {
 	0xFC5C03A4, 	0x48D0B730, 	0x2AC7D539, 	0xD70B28F5, 	0x53BCAC44, 	0x3FB6C04A, 	0x14EFEB11, 	0xDB982468, 
 	0x9A1065F0, 	0xB0D14F2F, 	0x8D5272AE, 	0xC4D73B29, 	0x91DF6E21, 	0x949A6B66, 	0x303DCFC3, 	0x5932A6CE  
 };
-__device__ __forceinline__ void Round8_2_final(uint32_t *A,
-		int r, int s, int t, int u) {
-
-
+__device__ __forceinline__
+void Round8_2_final(uint32_t *A, int r, int s, int t, int u) {
 	STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]);
 	STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]);
 	STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1150,8 +1159,9 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A,
				@@ -1150,8 +1159,9 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A,
 	STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]);
 	STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
 }
-static __constant__ uint32_t d_cw3[8][8];
-static const uint32_t h_cw3[8][8] = {
+
+__device__ __constant__
+static const uint32_t d_cw3[8][8] = {
 	0x1234EDCC, 	0xF5140AEC, 	0xCDF1320F, 	0x3DE4C21C, 	0x48D0B730, 	0x1234EDCC, 	0x131DECE3, 	0x52D3AD2D, 
 	0xE684197C, 	0x6D3892C8, 	0x72AE8D52, 	0x6FF3900D, 	0x73978C69, 	0xEB1114EF, 	0x15D8EA28, 	0x71C58E3B, 
 	0x90F66F0A, 	0x15D8EA28, 	0x9BE2641E, 	0x65F09A10, 	0xEA2815D8, 	0xBD8F4271, 	0x3A40C5C0, 	0xD9C6263A, 
@ -1161,10 +1171,8 @@ static const uint32_t h_cw3[8][8] = {
				@@ -1161,10 +1171,8 @@ static const uint32_t h_cw3[8][8] = {
 	0x975568AB, 	0x6994966C, 	0xF1700E90, 	0xD3672C99, 	0xCC1F33E1, 	0xFC5C03A4, 	0x452CBAD4, 	0x4E46B1BA, 
 	0xF1700E90, 	0xB2A34D5D, 	0xD0AC2F54, 	0x5760A8A0, 	0x8C697397, 	0x624C9DB4, 	0xE85617AA, 	0x95836A7D  
 };
-__device__ __forceinline__ void Round8_3_final(uint32_t *A,
-		int r, int s, int t, int u) {
-
-
+__device__ __forceinline__
+void Round8_3_final(uint32_t *A, int r, int s, int t, int u) {
 	STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]);
 	STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]);
 	STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1182,8 +1190,8 @@ __device__ __forceinline__ void Round8_3_final(uint32_t *A,
				@@ -1182,8 +1190,8 @@ __device__ __forceinline__ void Round8_3_final(uint32_t *A,
 #define expanded_vector(x) __ldg(&g_fft4[x])
 #endif

-__device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset,
-		int r, int s, int t, int u, uint4 *g_fft4) {
+__device__ __forceinline__
+void Round8_0(uint32_t *A, const int thr_offset, int r, int s, int t, int u, uint4 *g_fft4) {
 	uint32_t w[8];
    uint4 hv1, hv2;

--- a/x13/cuda_x13_hamsi512.cu
+++ b/x13/cuda_x13_hamsi512.cu
@ -37,21 +37,19 @@
				@@ -37,21 +37,19 @@
 * @author   phm <phm@inbox.com>
 */

+#include <stdint.h>
+#include <cuda_runtime.h>
+
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
 #define SPH_C64(x)    ((uint64_t)(x ## ULL))
 #define SPH_C32(x)    ((uint32_t)(x ## U))
 #define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))

 #define SWAB32(x) ( __byte_perm(x, x, 0x0123) )

-#if __CUDA_ARCH__ < 350 
+#if __CUDA_ARCH__ < 350
    // Kepler (Compute 3.0)
    #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
 #else
@ -59,11 +57,8 @@ typedef unsigned long long uint64_t;
				@@ -59,11 +57,8 @@ typedef unsigned long long uint64_t;
    #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
 #endif

-static __constant__ uint32_t d_alpha_n[32];
-static __constant__ uint32_t d_alpha_f[32];
-static __constant__ uint32_t d_T512[64][16];
-
-static const uint32_t alpha_n[] = {
+__device__ __constant__
+static const uint32_t d_alpha_n[] = {
 	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
 	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
 	SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
@ -77,7 +72,8 @@ static const uint32_t alpha_n[] = {
				@@ -77,7 +72,8 @@ static const uint32_t alpha_n[] = {
 	SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
 };

-static const uint32_t alpha_f[] = {
+__device__ __constant__
+static const uint32_t d_alpha_f[] = {
 	SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
 	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
 	SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
@ -260,8 +256,8 @@ static const uint32_t alpha_f[] = {
				@@ -260,8 +256,8 @@ static const uint32_t alpha_f[] = {
 		c0 = (h[0x0] ^= hamsi_s00); \
 	}

-
-static const uint32_t T512[64][16] = {
+__device__ __constant__
+static const uint32_t d_T512[64][16] = {
 	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
 	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
 	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
@ -740,9 +736,6 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint
				@@ -740,9 +736,6 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint

 __host__ void x13_hamsi512_cpu_init(int thr_id, int threads)
 {
-	cudaMemcpyToSymbol( d_alpha_n, alpha_n, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice);
-	cudaMemcpyToSymbol( d_alpha_f, alpha_f, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice);
-	cudaMemcpyToSymbol( d_T512, T512, sizeof(uint32_t)*64*16, 0, cudaMemcpyHostToDevice);
 }

 __host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)