simd: add support for SM 2.1 devices

Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
9 years ago · 0d9d3520ac
13 changed files with 696 additions and 61 deletions
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -89,7 +89,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
				@@ -89,7 +89,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x000f;
+		ptarget[7] = 0x000f;

 	if (!init[thr_id])
 	{
@ -100,7 +100,6 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
				@@ -100,7 +100,6 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 			proper_exit(EXIT_CODE_CUDA_ERROR);
 		}

-
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));

 		jackpot_keccak512_cpu_init(thr_id, throughput);
--- a/bench.cpp
+++ b/bench.cpp
@ -100,16 +100,12 @@ bool bench_algo_switch_next(int thr_id)
				@@ -100,16 +100,12 @@ bool bench_algo_switch_next(int thr_id)

 	if (device_sm[dev_id] && device_sm[dev_id] < 300) {
 		// incompatible SM 2.1 kernels...
-		if (algo == ALGO_FRESH) algo++;
 		if (algo == ALGO_GROESTL) algo++;
 		if (algo == ALGO_MYR_GR) algo++;
-		if (algo == ALGO_JACKPOT) algo++;
+		if (algo == ALGO_JACKPOT) algo++; // compact shuffle
 		if (algo == ALGO_LYRA2v2) algo++;
 		if (algo == ALGO_NEOSCRYPT) algo++;
-		if (algo == ALGO_QUARK) algo++;
-		if (algo == ALGO_QUBIT) algo++;
-		if (algo == ALGO_S3) algo++; // to check...
-		while (algo >= ALGO_X11 && algo <= ALGO_X17) algo++;
+		if (algo == ALGO_QUARK) algo++; // todo
 		if (algo == ALGO_WHIRLPOOLX) algo++;
 	}
 	// and unwanted ones...
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -251,6 +251,7 @@
				@@ -251,6 +251,7 @@
    <ClCompile Include="neoscrypt\neoscrypt.cpp" />
    <ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
    <ClInclude Include="neoscrypt\cuda_vectors.h" />
+    <ClInclude Include="x11\cuda_x11_simd512_sm2.cuh" />
    <CudaCompile Include="Algo256\bmw.cu" />
    <CudaCompile Include="Algo256\cuda_bmw.cu">
      <MaxRegCount>76</MaxRegCount>
@ -463,9 +464,7 @@
				@@ -463,9 +464,7 @@
    <CudaCompile Include="cuda_skeincoin.cu">
      <MaxRegCount>48</MaxRegCount>
    </CudaCompile>
-    <CudaCompile Include="x11\cuda_x11_aes.cu">
-      <ExcludedFromBuild>true</ExcludedFromBuild>
-    </CudaCompile>
+    <ClInclude Include="x11\cuda_x11_aes.cuh" />
    <CudaCompile Include="x11\cuda_x11_cubehash512.cu" />
    <CudaCompile Include="x11\cuda_x11_echo.cu">
    </CudaCompile>
@ -476,15 +475,13 @@
				@@ -476,15 +475,13 @@
    <CudaCompile Include="x11\cuda_x11_shavite512.cu">
      <MaxRegCount>128</MaxRegCount>
    </CudaCompile>
+    <ClInclude Include="x11\cuda_x11_simd512_func.cuh" />
    <CudaCompile Include="x11\cuda_x11_simd512.cu">
      <MaxRegCount>64</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="x11\c11.cu" />
    <CudaCompile Include="x11\fresh.cu" />
    <CudaCompile Include="x11\s3.cu" />
-    <CudaCompile Include="x11\simd_functions.cu">
-      <ExcludedFromBuild>true</ExcludedFromBuild>
-    </CudaCompile>
    <CudaCompile Include="x11\x11.cu">
    </CudaCompile>
    <CudaCompile Include="x13\cuda_x13_hamsi512.cu">
@ -530,4 +527,4 @@
				@@ -530,4 +527,4 @@
  <Target Name="AfterClean">
    <Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
  </Target>
-</Project>
+</Project>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -443,6 +443,9 @@
				@@ -443,6 +443,9 @@
    <ClInclude Include="quark\cuda_bmw512_sm3.cuh">
      <Filter>Source Files\CUDA\quark</Filter>
    </ClInclude>
+    <ClInclude Include="x11\cuda_x11_simd512_sm2.cuh">
+      <Filter>Source Files\CUDA\x11</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="cuda.cpp">
@ -523,9 +526,9 @@
				@@ -523,9 +526,9 @@
    <CudaCompile Include="cuda_nist5.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
-    <CudaCompile Include="x11\cuda_x11_aes.cu">
+    <ClInclude Include="x11\cuda_x11_aes.cuh">
      <Filter>Source Files\CUDA\x11</Filter>
-    </CudaCompile>
+    </ClInclude>
    <CudaCompile Include="x11\cuda_x11_cubehash512.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
@ -544,6 +547,9 @@
				@@ -544,6 +547,9 @@
    <CudaCompile Include="x11\cuda_x11_simd512.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
+    <ClInclude Include="x11\cuda_x11_simd512_func.cuh">
+      <Filter>Source Files\CUDA\x11</Filter>
+    </ClInclude>
    <CudaCompile Include="x11\c11.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
@ -553,9 +559,6 @@
				@@ -553,9 +559,6 @@
    <CudaCompile Include="x11\x11.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
-    <CudaCompile Include="x11\simd_functions.cu">
-      <Filter>Source Files\CUDA\x11</Filter>
-    </CudaCompile>
    <CudaCompile Include="x13\cuda_x13_fugue512.cu">
      <Filter>Source Files\CUDA\x13</Filter>
    </CudaCompile>
@ -707,4 +710,4 @@
				@@ -707,4 +710,4 @@
      <Filter>Ressources</Filter>
    </Text>
  </ItemGroup>
-</Project>
+</Project>
--- a/pools.cpp
+++ b/pools.cpp
@ -360,6 +360,7 @@ bool parse_pool_array(json_t *obj)
				@@ -360,6 +360,7 @@ bool parse_pool_array(json_t *obj)
 void pool_dump_infos()
 {
 	struct pool_infos *p;
+	if (opt_benchmark) return;
 	for (int i=0; i<num_pools; i++) {
 		p = &pools[i];
 		applog(LOG_DEBUG, "POOL %01d: %s USER %s -s %d", i,
--- a/x11/cuda_x11_aes.cuh
+++ b/x11/cuda_x11_aes.cuh
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@ -3,7 +3,7 @@
				@@ -3,7 +3,7 @@

 #include "cuda_helper.h"

-#include "cuda_x11_aes.cu"
+#include "cuda_x11_aes.cuh"

 __device__ __forceinline__ void AES_2ROUND(
 	const uint32_t* __restrict__ sharedMemory,
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@ -6,7 +6,7 @@
				@@ -6,7 +6,7 @@

 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)

-#include "cuda_x11_aes.cu"
+#include "cuda_x11_aes.cuh"

 __device__ __forceinline__
 static void AES_ROUND_NOKEY(
@ -1403,7 +1403,7 @@ void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
				@@ -1403,7 +1403,7 @@ void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
 	{
 		const uint32_t nounce = startNounce + thread;

-		// kopiere init-state
+		// initial state
 		uint32_t state[16] = {
 			SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
 			SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
@ -1441,6 +1441,7 @@ void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
				@@ -1441,6 +1441,7 @@ void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);

+	// note: 128 threads minimum are required to init the shared memory array
 	x11_shavite512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	//MyStreamSynchronize(NULL, order, thr_id);
 }
@ -1465,8 +1466,8 @@ void x11_shavite512_cpu_init(int thr_id, uint32_t threads)
				@@ -1465,8 +1466,8 @@ void x11_shavite512_cpu_init(int thr_id, uint32_t threads)
 __host__
 void x11_shavite512_setBlock_80(void *pdata)
 {
-	// Message mit Padding bereitstellen
-	// lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen.
+	// Message with Padding
+	// The nonce is at Byte 76.
 	unsigned char PaddedMessage[128];
 	memcpy(PaddedMessage, pdata, 80);
 	memset(PaddedMessage+80, 0, 48);
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@ -5,11 +5,11 @@
				@@ -5,11 +5,11 @@
 //
 // STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations

-#define TPB 128
-
 #include "miner.h"
 #include "cuda_helper.h"

+#define TPB 128
+
 uint32_t *d_state[MAX_GPUS];
 uint4 *d_temp4[MAX_GPUS];

@ -34,7 +34,7 @@ const uint8_t h_perm[8][8] = {
				@@ -34,7 +34,7 @@ const uint8_t h_perm[8][8] = {
 	{ 4, 5, 2, 3, 6, 7, 0, 1 }
 };

-/* for simd_functions.cu */
+/* for simd_functions.cuh */
 #ifdef DEVICE_DIRECT_CONSTANTS
 __constant__ uint32_t c_IV_512[32] = {
 #else
@ -87,15 +87,23 @@ static const short h_FFT256_2_128_Twiddle[128] = {
				@@ -87,15 +87,23 @@ static const short h_FFT256_2_128_Twiddle[128] = {
 	-30,  55, -58, -65, -95, -40, -98,  94
 };

+#include "cuda_x11_simd512_sm2.cuh"

-/************* the round function ****************/
+#ifdef __INTELLISENSE__
+/* just for vstudio code colors */
+#define __CUDA_ARCH__ 500
+#endif

+/************* the round function ****************/

+#undef IF
+#undef MAJ
 #define IF(x, y, z) (((y ^ z) & x) ^ z)
 #define MAJ(x, y, z) ((z &y) | ((z|y) & x))

+#include "x11/cuda_x11_simd512_func.cuh"

-#include "x11/simd_functions.cu"
+#if __CUDA_ARCH__ >= 300

 /********************* Message expansion ************************/

@ -347,7 +355,6 @@ void FFT_256_halfzero(int y[256])
				@@ -347,7 +355,6 @@ void FFT_256_halfzero(int y[256])
 	FFT_128_full(y+16);
 }

-
 /***************************************************/

 __device__ __forceinline__
@ -643,10 +650,23 @@ void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4,
				@@ -643,10 +650,23 @@ void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4,
 	}
 }

+#else
+__global__ void x11_simd512_gpu_expand_64(uint32_t threads, uint32_t *g_hash, uint4 *g_temp4) {}
+__global__ void x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {}
+__global__ void x11_simd512_gpu_compress2_64(uint32_t threads, uint4 *g_fft4, uint32_t *g_state) {}
+__global__ void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {}
+__global__ void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {}
+#endif /* SM3+ */
+
 __host__
 int x11_simd512_cpu_init(int thr_id, uint32_t threads)
 {
+	int dev_id = device_map[thr_id];
 	cuda_get_arch(thr_id);
+	if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
+		x11_simd512_cpu_init_sm2(thr_id);
+		return 0;
+	}

 	CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */
 	CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err);
@ -656,10 +676,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads)
				@@ -656,10 +676,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads)
 	cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
 #endif
+
+#if 0
 	cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
+#endif

 	// Texture for 128-Bit Zugriffe
 	cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc<uint4>();
@ -675,27 +698,31 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads)
				@@ -675,27 +698,31 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads)
 __host__
 void x11_simd512_cpu_free(int thr_id)
 {
-	cudaFree(d_temp4[thr_id]);
-	cudaFree(d_state[thr_id]);
+	int dev_id = device_map[thr_id];
+	if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) {
+		cudaFree(d_temp4[thr_id]);
+		cudaFree(d_state[thr_id]);
+	}
 }

 __host__
 void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
 	const uint32_t threadsperblock = TPB;
+	int dev_id = device_map[thr_id];

 	dim3 block(threadsperblock);
 	dim3 grid((threads + threadsperblock-1) / threadsperblock);
 	dim3 gridX8(grid.x * 8);

-	if (d_nonceVector != NULL) {
-		applog(LOG_ERR, "Sorry, nonce Vector param was removed!");
+	if (d_nonceVector != NULL || device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
+		x11_simd512_cpu_hash_64_sm2(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
 		return;
 	}

 	x11_simd512_gpu_expand_64 <<<gridX8, block>>> (threads, d_hash, d_temp4[thr_id]);

-	if (device_sm[device_map[thr_id]] >= 500 && cuda_arch[device_map[thr_id]] >= 500) {
+	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) {
 		x11_simd512_gpu_compress_64_maxwell <<< grid, block >>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]);
 	} else {
 		x11_simd512_gpu_compress1_64 <<< grid, block >>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]);
--- a/x11/cuda_x11_simd512_func.cuh
+++ b/x11/cuda_x11_simd512_func.cuh
@ -1,3 +1,5 @@
				@@ -1,3 +1,5 @@
+#define SIMD_FUNCTIONS_CUH
+
 __device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
 {
 	uint32_t temp;
@ -1043,8 +1045,9 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
				@@ -1043,8 +1045,9 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
 		A[j] = R[j];
 	}
 }
-static __constant__ uint32_t d_cw0[8][8];
-static const uint32_t h_cw0[8][8] = {
+
+static __constant__ uint32_t d_cw0[8][8] = {
+//static const uint32_t h_cw0[8][8] = {
 	0x531B1720, 	0xAC2CDE09, 	0x0B902D87, 	0x2369B1F4, 	0x2931AA01, 	0x02E4B082, 	0xC914C914, 	0xC1DAE1A6, 
 	0xF18C2B5C, 	0x08AC306B, 	0x27BFC914, 	0xCEDC548D, 	0xC630C4BE, 	0xF18C4335, 	0xF0D3427C, 	0xBE3DA380, 
 	0x143C02E4, 	0xA948C630, 	0xA4F2DE09, 	0xA71D2085, 	0xA439BD84, 	0x109FCD6A, 	0xEEA8EF61, 	0xA5AB1CE8, 
@ -1054,10 +1057,9 @@ static const uint32_t h_cw0[8][8] = {
				@@ -1054,10 +1057,9 @@ static const uint32_t h_cw0[8][8] = {
 	0x213E50F0, 	0x39173EDF, 	0xA9485B0E, 	0xEEA82EF9, 	0x14F55771, 	0xFAF15546, 	0x3D6DD9B3, 	0xAB73B92E, 
 	0x582A48FD, 	0xEEA81892, 	0x4F7EAA01, 	0xAF10A88F, 	0x11581720, 	0x34C124DB, 	0xD1C0AB73, 	0x1E5AF0D3  
 };
-__device__ __forceinline__ void Round8_0_final(uint32_t *A,
-		int r, int s, int t, int u) {
-

+__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u)
+{
 	STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]);
 	STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]);
 	STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1067,8 +1069,9 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A,
				@@ -1067,8 +1069,9 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A,
 	STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]);
 	STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
 }
-static __constant__ uint32_t d_cw1[8][8];
-static const uint32_t h_cw1[8][8] = {
+
+static __constant__ uint32_t d_cw1[8][8] = {
+//static const uint32_t h_cw1[8][8] = {
 	0xC34C07F3, 	0xC914143C, 	0x599CBC12, 	0xBCCBE543, 	0x385EF3B7, 	0x14F54C9A, 	0x0AD7C068, 	0xB64A21F7, 
 	0xDEC2AF10, 	0xC6E9C121, 	0x56B8A4F2, 	0x1158D107, 	0xEB0BA88F, 	0x050FAABA, 	0xC293264D, 	0x548D46D2, 
 	0xACE5E8E0, 	0x53D421F7, 	0xF470D279, 	0xDC974E0C, 	0xD6CF55FF, 	0xFD1C4F7E, 	0x36EC36EC, 	0x3E261E5A, 
@ -1078,10 +1081,9 @@ static const uint32_t h_cw1[8][8] = {
				@@ -1078,10 +1081,9 @@ static const uint32_t h_cw1[8][8] = {
 	0xF4702B5C, 	0xC293FC63, 	0xDA6CB2AD, 	0x45601FCC, 	0xA439E1A6, 	0x4E0C0D02, 	0xED3621F7, 	0xAB73BE3D, 
 	0x0E74D4A4, 	0xF754CF95, 	0xD84136EC, 	0x3124AB73, 	0x39D03B42, 	0x0E74BCCB, 	0x0F2DBD84, 	0x41C35C80  
 };
-__device__ __forceinline__ void Round8_1_final(uint32_t *A,
-		int r, int s, int t, int u) {
-

+__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u)
+{
 	STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]);
 	STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]);
 	STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1091,8 +1093,9 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A,
				@@ -1091,8 +1093,9 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A,
 	STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]);
 	STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
 }
-static __constant__ uint32_t d_cw2[8][8];
-static const uint32_t h_cw2[8][8] = {
+
+static __constant__ uint32_t d_cw2[8][8] = {
+//static const uint32_t h_cw2[8][8] = {
 	0xA4135BED, 	0xE10E1EF2, 	0x6C4F93B1, 	0x6E2191DF, 	0xE2E01D20, 	0xD1952E6B, 	0x6A7D9583, 	0x131DECE3, 
 	0x369CC964, 	0xFB73048D, 	0x9E9D6163, 	0x280CD7F4, 	0xD9C6263A, 	0x1062EF9E, 	0x2AC7D539, 	0xAD2D52D3, 
 	0x0A03F5FD, 	0x197CE684, 	0xAA72558E, 	0xDE5321AD, 	0xF0870F79, 	0x607A9F86, 	0xAFE85018, 	0x2AC7D539, 
@ -1102,10 +1105,9 @@ static const uint32_t h_cw2[8][8] = {
				@@ -1102,10 +1105,9 @@ static const uint32_t h_cw2[8][8] = {
 	0xFC5C03A4, 	0x48D0B730, 	0x2AC7D539, 	0xD70B28F5, 	0x53BCAC44, 	0x3FB6C04A, 	0x14EFEB11, 	0xDB982468, 
 	0x9A1065F0, 	0xB0D14F2F, 	0x8D5272AE, 	0xC4D73B29, 	0x91DF6E21, 	0x949A6B66, 	0x303DCFC3, 	0x5932A6CE  
 };
-__device__ __forceinline__ void Round8_2_final(uint32_t *A,
-		int r, int s, int t, int u) {
-

+__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u)
+{
 	STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]);
 	STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]);
 	STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1115,8 +1117,9 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A,
				@@ -1115,8 +1117,9 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A,
 	STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]);
 	STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
 }
-static __constant__ uint32_t d_cw3[8][8];
-static const uint32_t h_cw3[8][8] = {
+
+static __constant__ uint32_t d_cw3[8][8] = {
+//static const uint32_t h_cw3[8][8] = {
 	0x1234EDCC, 	0xF5140AEC, 	0xCDF1320F, 	0x3DE4C21C, 	0x48D0B730, 	0x1234EDCC, 	0x131DECE3, 	0x52D3AD2D, 
 	0xE684197C, 	0x6D3892C8, 	0x72AE8D52, 	0x6FF3900D, 	0x73978C69, 	0xEB1114EF, 	0x15D8EA28, 	0x71C58E3B, 
 	0x90F66F0A, 	0x15D8EA28, 	0x9BE2641E, 	0x65F09A10, 	0xEA2815D8, 	0xBD8F4271, 	0x3A40C5C0, 	0xD9C6263A, 
@ -1126,10 +1129,9 @@ static const uint32_t h_cw3[8][8] = {
				@@ -1126,10 +1129,9 @@ static const uint32_t h_cw3[8][8] = {
 	0x975568AB, 	0x6994966C, 	0xF1700E90, 	0xD3672C99, 	0xCC1F33E1, 	0xFC5C03A4, 	0x452CBAD4, 	0x4E46B1BA, 
 	0xF1700E90, 	0xB2A34D5D, 	0xD0AC2F54, 	0x5760A8A0, 	0x8C697397, 	0x624C9DB4, 	0xE85617AA, 	0x95836A7D  
 };
-__device__ __forceinline__ void Round8_3_final(uint32_t *A,
-		int r, int s, int t, int u) {
-

+__device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u)
+{
 	STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]);
 	STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]);
 	STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]);
--- a/x11/cuda_x11_simd512_sm2.cuh
+++ b/x11/cuda_x11_simd512_sm2.cuh
@ -0,0 +1,579 @@
				@@ -0,0 +1,579 @@
+#include "cuda_helper.h"
+
+#ifdef __INTELLISENSE__
+/* just for vstudio code colors */
+#define __CUDA_ARCH__ 210
+#endif
+
+#if __CUDA_ARCH__ < 300
+
+#define T32(x) (x)
+
+#ifndef DEVICE_DIRECT_CONSTANTS /* already made in SM 3+ implementation */
+__constant__  uint32_t c_IV_512[32];
+const uint32_t h_IV_512[32] = {
+	0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
+	0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
+	0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
+	0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
+};
+
+__constant__ int c_FFT128_8_16_Twiddle[128];
+static const int h_FFT128_8_16_Twiddle[128] = {
+	1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+	1,  60,   2, 120,   4,  -17,   8, -34,  16, -68,  32, 121,  64, -15, 128, -30,
+	1,  46,  60, -67,   2,   92, 120, 123,   4, -73, -17, -11,   8, 111, -34, -22,
+	1, -67, 120, -73,   8,  -22, -68, -70,  64,  81, -30, -46,  -2,-123,  17,-111,
+	1,-118,  46, -31,  60,  116, -67, -61,   2,  21,  92, -62, 120, -25, 123,-122,
+	1, 116,  92,-122, -17,   84, -22,  18,  32, 114, 117, -49, -30, 118,  67,  62,
+	1, -31, -67,  21, 120, -122, -73, -50,   8,   9, -22, -89, -68,  52, -70, 114,
+	1, -61, 123, -50, -34,   18, -70, -99, 128, -98,  67,  25,  17,  -9,  35, -79
+};
+
+__constant__ int c_FFT256_2_128_Twiddle[128];
+static const int h_FFT256_2_128_Twiddle[128] = {
+	  1,  41,-118,  45,  46,  87, -31,  14,
+	 60,-110, 116,-127, -67,  80, -61,  69,
+	  2,  82,  21,  90,  92, -83, -62,  28,
+	120,  37, -25,   3, 123, -97,-122,-119,
+	  4, -93,  42, -77, -73,  91,-124,  56,
+	-17,  74, -50,   6, -11,  63,  13,  19,
+	  8,  71,  84, 103, 111, -75,   9, 112,
+	-34,-109,-100,  12, -22, 126,  26,  38,
+	 16,-115, -89, -51, -35, 107,  18, -33,
+	-68,  39,  57,  24, -44,  -5,  52,  76,
+	 32,  27,  79,-102, -70, -43,  36, -66,
+	121,  78, 114,  48, -88, -10, 104,-105,
+	 64,  54, -99,  53, 117, -86,  72, 125,
+	-15,-101, -29,  96,  81, -20, -49,  47,
+	128, 108,  59, 106, -23,  85,-113,  -7,
+	-30,  55, -58, -65, -95, -40, -98,  94
+};
+#endif
+
+__constant__ int c_FFT[256] = 
+//const int h_FFT[256] =
+{
+	// this is the FFT result in revbin permuted order
+	4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92,
+	-119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63,
+	-63, 107, -107, -76, 76, -119, 119, -83, 83, 126, -126, 94, -94, -23, 23, -76,
+	76, -47, 47, 92, -92, -117, 117, 73, -73, -53, 53, 88, -88, -80, 80, -47, 47,
+	5, -5, 67, -67, 34, -34, 4, -4, 87, -87, -28, 28, -70, 70, -110, 110, -18, 18, 93,
+	-93, 51, -51, 36, -36, 118, -118, -106, 106, 45, -45, -108, 108, -44, 44, 117,
+	-117, -121, 121, -37, 37, 65, -65, 37, -37, 40, -40, -42, 42, 91, -91, -128, 128,
+	-21, 21, 94, -94, -98, 98, -47, 47, 28, -28, 115, -115, 16, -16, -20, 20, 122,
+	-122, 115, -115, 46, -46, 84, -84, -127, 127, 57, -57, 127, -127, -80, 80, 24,
+	-24, 15, -15, 29, -29, -78, 78, -126, 126, 16, -16, 52, -52, 55, -55, 110, -110,
+	-51, 51, -120, 120, -124, 124, -24, 24, -76, 76, 26, -26, -21, 21, -64, 64, -99,
+	99, 85, -85, -15, 15, -120, 120, -116, 116, 85, -85, 12, -12, -24, 24, 4, -4,
+	79, -79, 76, -76, 23, -23, 4, -4, -108, 108, -20, 20, 73, -73, -42, 42, -7, 7,
+	-29, 29, -123, 123, 49, -49, -96, 96, -68, 68, -112, 112, 116, -116, -24, 24, 93,
+	-93, -125, 125, -86, 86, 117, -117, -91, 91, 42, -42, 87, -87, -117, 117, 102, -102
+};
+
+__constant__ int c_P8[32][8] = {
+//static const int h_P8[32][8] = {
+	{ 2, 66, 34, 98, 18, 82, 50, 114 },
+	{ 6, 70, 38, 102, 22, 86, 54, 118 },
+	{ 0, 64, 32, 96, 16, 80, 48, 112 },
+	{ 4, 68, 36, 100, 20, 84, 52, 116 },
+	{ 14, 78, 46, 110, 30, 94, 62, 126 },
+	{ 10, 74, 42, 106, 26, 90, 58, 122 },
+	{ 12, 76, 44, 108, 28, 92, 60, 124 },
+	{ 8, 72, 40, 104, 24, 88, 56, 120 },
+	{ 15, 79, 47, 111, 31, 95, 63, 127 },
+	{ 13, 77, 45, 109, 29, 93, 61, 125 },
+	{ 3, 67, 35, 99, 19, 83, 51, 115 },
+	{ 1, 65, 33, 97, 17, 81, 49, 113 },
+	{ 9, 73, 41, 105, 25, 89, 57, 121 },
+	{ 11, 75, 43, 107, 27, 91, 59, 123 },
+	{ 5, 69, 37, 101, 21, 85, 53, 117 },
+	{ 7, 71, 39, 103, 23, 87, 55, 119 },
+	{ 8, 72, 40, 104, 24, 88, 56, 120 },
+	{ 4, 68, 36, 100, 20, 84, 52, 116 },
+	{ 14, 78, 46, 110, 30, 94, 62, 126 },
+	{ 2, 66, 34, 98, 18, 82, 50, 114 },
+	{ 6, 70, 38, 102, 22, 86, 54, 118 },
+	{ 10, 74, 42, 106, 26, 90, 58, 122 },
+	{ 0, 64, 32, 96, 16, 80, 48, 112 },
+	{ 12, 76, 44, 108, 28, 92, 60, 124 },
+	{ 134, 198, 166, 230, 150, 214, 182, 246 },
+	{ 128, 192, 160, 224, 144, 208, 176, 240 },
+	{ 136, 200, 168, 232, 152, 216, 184, 248 },
+	{ 142, 206, 174, 238, 158, 222, 190, 254 },
+	{ 140, 204, 172, 236, 156, 220, 188, 252 },
+	{ 138, 202, 170, 234, 154, 218, 186, 250 },
+	{ 130, 194, 162, 226, 146, 210, 178, 242 },
+	{ 132, 196, 164, 228, 148, 212, 180, 244 },
+};
+
+__constant__ int c_Q8[32][8] = {
+//static const int h_Q8[32][8] = {
+	{ 130, 194, 162, 226, 146, 210, 178, 242 },
+	{ 134, 198, 166, 230, 150, 214, 182, 246 },
+	{ 128, 192, 160, 224, 144, 208, 176, 240 },
+	{ 132, 196, 164, 228, 148, 212, 180, 244 },
+	{ 142, 206, 174, 238, 158, 222, 190, 254 },
+	{ 138, 202, 170, 234, 154, 218, 186, 250 },
+	{ 140, 204, 172, 236, 156, 220, 188, 252 },
+	{ 136, 200, 168, 232, 152, 216, 184, 248 },
+	{ 143, 207, 175, 239, 159, 223, 191, 255 },
+	{ 141, 205, 173, 237, 157, 221, 189, 253 },
+	{ 131, 195, 163, 227, 147, 211, 179, 243 },
+	{ 129, 193, 161, 225, 145, 209, 177, 241 },
+	{ 137, 201, 169, 233, 153, 217, 185, 249 },
+	{ 139, 203, 171, 235, 155, 219, 187, 251 },
+	{ 133, 197, 165, 229, 149, 213, 181, 245 },
+	{ 135, 199, 167, 231, 151, 215, 183, 247 },
+	{ 9, 73, 41, 105, 25, 89, 57, 121 },
+	{ 5, 69, 37, 101, 21, 85, 53, 117 },
+	{ 15, 79, 47, 111, 31, 95, 63, 127 },
+	{ 3, 67, 35, 99, 19, 83, 51, 115 },
+	{ 7, 71, 39, 103, 23, 87, 55, 119 },
+	{ 11, 75, 43, 107, 27, 91, 59, 123 },
+	{ 1, 65, 33, 97, 17, 81, 49, 113 },
+	{ 13, 77, 45, 109, 29, 93, 61, 125 },
+	{ 135, 199, 167, 231, 151, 215, 183, 247 },
+	{ 129, 193, 161, 225, 145, 209, 177, 241 },
+	{ 137, 201, 169, 233, 153, 217, 185, 249 },
+	{ 143, 207, 175, 239, 159, 223, 191, 255 },
+	{ 141, 205, 173, 237, 157, 221, 189, 253 },
+	{ 139, 203, 171, 235, 155, 219, 187, 251 },
+	{ 131, 195, 163, 227, 147, 211, 179, 243 },
+	{ 133, 197, 165, 229, 149, 213, 181, 245 },
+};
+
+#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \
+	((x)%7) == 1 ? 6 : \
+	((x)%7) == 2 ? 2 : \
+	((x)%7) == 3 ? 3 : \
+	((x)%7) == 4 ? 5 : \
+	((x)%7) == 5 ? 7 : 4 )
+
+/************* the round function ****************/
+
+#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
+#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
+
+__device__ __forceinline__
+void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
+{
+	uint32_t R[8];
+	#pragma unroll 8
+	for(int j=0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	#pragma unroll 8
+	for(int j=0; j<8; j++) {
+		D[j] = D[j] + w[j] + IF(A[j], B[j], C[j]);
+		D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]);
+		A[j] = R[j];
+	}
+}
+
+__device__ __forceinline__
+void STEP8_MAJ(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
+{
+	uint32_t R[8];
+	#pragma unroll 8
+	for(int j=0; j<8; j++) {
+		R[j] = ROTL32(A[j], r);
+	}
+	#pragma unroll 8
+	for(int j=0; j<8; j++) {
+		D[j] = D[j] + w[j] + MAJ(A[j], B[j], C[j]);
+		D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]);
+		A[j] = R[j];
+	}
+}
+
+__device__ __forceinline__
+void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
+{
+	uint32_t w[8][8];
+	int code = i<2? 185: 233;
+	int a, b;
+
+	/*
+	 * The FFT output y is in revbin permuted order,
+	 * but this is included in the tables P and Q
+	 */
+
+	#pragma unroll 8
+	for(a=0; a<8; a++) {
+		#pragma unroll 8
+		for(b=0; b<8; b++) {
+			w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410);
+		}
+	}
+
+	STEP8_IF(w[0], 8*i+0, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_IF(w[1], 8*i+1, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_IF(w[2], 8*i+2, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_IF(w[3], 8*i+3, u, r, &A[8], &A[16], &A[24], A);
+
+	STEP8_MAJ(w[4], 8*i+4, r, s, A, &A[8], &A[16], &A[24]);
+	STEP8_MAJ(w[5], 8*i+5, s, t, &A[24], A, &A[8], &A[16]);
+	STEP8_MAJ(w[6], 8*i+6, t, u, &A[16], &A[24], A, &A[8]);
+	STEP8_MAJ(w[7], 8*i+7, u, r, &A[8], &A[16], &A[24], A);
+}
+
+
+/********************* Message expansion ************************/
+
+/*
+ * Reduce modulo 257; result is in [-127; 383]
+ * REDUCE(x) := (x&255) - (x>>8)
+ */
+#define REDUCE(x) (((x)&255) - ((x)>>8))
+
+/*
+ * Reduce from [-127; 383] to [-128; 128]
+ * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
+ */
+#define EXTRA_REDUCE_S(x) \
+	((x)<=128 ? (x) : (x)-257)
+
+/*
+ * Reduce modulo 257; result is in [-128; 128]
+ */
+#define REDUCE_FULL_S(x) \
+	EXTRA_REDUCE_S(REDUCE(x))
+
+__device__ __forceinline__
+void FFT_8(int *y, int stripe)
+{
+ /*
+	* FFT_8 using w=4 as 8th root of unity
+	* Unrolled decimation in frequency (DIF) radix-2 NTT.
+	* Output data is in revbin_permuted order.
+	*/
+#define X(i) y[stripe*i]
+
+#define DO_REDUCE(i) \
+	X(i) = REDUCE(X(i))
+
+#define DO_REDUCE_FULL_S(i) do { \
+	X(i) = REDUCE(X(i)); \
+	X(i) = EXTRA_REDUCE_S(X(i)); \
+} while(0)
+
+#define BUTTERFLY(i,j,n) do { \
+	int u= X(i); \
+	int v= X(j); \
+	X(i) = u+v; \
+	X(j) = (u-v) << (2*n); \
+} while(0)
+
+	BUTTERFLY(0, 4, 0);
+	BUTTERFLY(1, 5, 1);
+	BUTTERFLY(2, 6, 2);
+	BUTTERFLY(3, 7, 3);
+
+	DO_REDUCE(6);
+	DO_REDUCE(7);
+
+	BUTTERFLY(0, 2, 0);
+	BUTTERFLY(4, 6, 0);
+	BUTTERFLY(1, 3, 2);
+	BUTTERFLY(5, 7, 2);
+
+	DO_REDUCE(7);
+
+	BUTTERFLY(0, 1, 0);
+	BUTTERFLY(2, 3, 0);
+	BUTTERFLY(4, 5, 0);
+	BUTTERFLY(6, 7, 0);
+
+	DO_REDUCE_FULL_S(0);
+	DO_REDUCE_FULL_S(1);
+	DO_REDUCE_FULL_S(2);
+	DO_REDUCE_FULL_S(3);
+	DO_REDUCE_FULL_S(4);
+	DO_REDUCE_FULL_S(5);
+	DO_REDUCE_FULL_S(6);
+	DO_REDUCE_FULL_S(7);
+
+#undef X
+#undef DO_REDUCE
+#undef DO_REDUCE_FULL_S
+#undef BUTTERFLY
+}
+
+__device__ __forceinline__
+void FFT_16(int *y, int stripe)
+{
+	/*
+	 * FFT_16 using w=2 as 16th root of unity
+	 * Unrolled decimation in frequency (DIF) radix-2 NTT.
+	 * Output data is in revbin_permuted order.
+	 */
+
+	#define X(i) y[stripe*i]
+
+	#define DO_REDUCE(i) \
+		X(i) = REDUCE(X(i))
+
+	#define DO_REDUCE_FULL_S(i) \
+		do { \
+		X(i) = REDUCE(X(i)); \
+		X(i) = EXTRA_REDUCE_S(X(i)); \
+	} while(0)
+
+	#define BUTTERFLY(i,j,n) \
+		do { \
+		int u= X(i); \
+		int v= X(j); \
+		X(i) = u+v; \
+		X(j) = (u-v) << n; \
+	} while(0)
+
+	BUTTERFLY(0, 8, 0);
+	BUTTERFLY(1, 9, 1);
+	BUTTERFLY(2, 10, 2);
+	BUTTERFLY(3, 11, 3);
+	BUTTERFLY(4, 12, 4);
+	BUTTERFLY(5, 13, 5);
+	BUTTERFLY(6, 14, 6);
+	BUTTERFLY(7, 15, 7);
+
+	DO_REDUCE(11);
+	DO_REDUCE(12);
+	DO_REDUCE(13);
+	DO_REDUCE(14);
+	DO_REDUCE(15);
+
+	BUTTERFLY( 0, 4, 0);
+	BUTTERFLY( 1, 5, 2);
+	BUTTERFLY( 2, 6, 4);
+	BUTTERFLY( 3, 7, 6);
+
+	BUTTERFLY( 8, 12, 0);
+	BUTTERFLY( 9, 13, 2);
+	BUTTERFLY(10, 14, 4);
+	BUTTERFLY(11, 15, 6);
+
+	DO_REDUCE(5);
+	DO_REDUCE(7);
+	DO_REDUCE(13);
+	DO_REDUCE(15);
+
+	BUTTERFLY( 0, 2, 0);
+	BUTTERFLY( 1, 3, 4);
+	BUTTERFLY( 4, 6, 0);
+	BUTTERFLY( 5, 7, 4);
+
+	BUTTERFLY( 8, 10, 0);
+	BUTTERFLY(12, 14, 0);
+	BUTTERFLY( 9, 11, 4);
+	BUTTERFLY(13, 15, 4);
+
+	BUTTERFLY( 0, 1, 0);
+	BUTTERFLY( 2, 3, 0);
+	BUTTERFLY( 4, 5, 0);
+	BUTTERFLY( 6, 7, 0);
+
+	BUTTERFLY( 8, 9, 0);
+	BUTTERFLY(10, 11, 0);
+	BUTTERFLY(12, 13, 0);
+	BUTTERFLY(14, 15, 0);
+
+	DO_REDUCE_FULL_S( 0);
+	DO_REDUCE_FULL_S( 1);
+	DO_REDUCE_FULL_S( 2);
+	DO_REDUCE_FULL_S( 3);
+	DO_REDUCE_FULL_S( 4);
+	DO_REDUCE_FULL_S( 5);
+	DO_REDUCE_FULL_S( 6);
+	DO_REDUCE_FULL_S( 7);
+	DO_REDUCE_FULL_S( 8);
+	DO_REDUCE_FULL_S( 9);
+	DO_REDUCE_FULL_S(10);
+	DO_REDUCE_FULL_S(11);
+	DO_REDUCE_FULL_S(12);
+	DO_REDUCE_FULL_S(13);
+	DO_REDUCE_FULL_S(14);
+	DO_REDUCE_FULL_S(15);
+
+#undef X
+#undef DO_REDUCE
+#undef DO_REDUCE_FULL_S
+#undef BUTTERFLY
+}
+
+__device__ __forceinline__
+void FFT_128_full(int *y)
+{
+	#pragma unroll 16
+	for (int i=0; i<16; i++) {
+		FFT_8(y+i,16);
+	}
+
+	#pragma unroll 128
+	for (int i=0; i<128; i++)
+		/*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i]);
+
+	#pragma unroll 8
+	for (int i=0; i<8; i++) {
+		FFT_16(y+16*i,1);
+	}
+}
+
+__device__ __forceinline__
+void FFT_256_halfzero(int y[256])
+{
+	/*
+	* FFT_256 using w=41 as 256th root of unity.
+	* Decimation in frequency (DIF) NTT.
+	* Output data is in revbin_permuted order.
+	* In place.
+	*/
+	const int tmp = y[127];
+
+	#pragma unroll 127
+	for (int i=0; i<127; i++)
+		y[128+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[i]);
+
+	/* handle X^255 with an additionnal butterfly */
+	y[127] = REDUCE(tmp + 1);
+	y[255] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]);
+
+	FFT_128_full(y);
+	FFT_128_full(y+128);
+}
+
+__device__ __forceinline__
+void SIMD_Compress(uint32_t A[32], const int *expanded, const uint32_t *M)
+{
+	uint32_t IV[4][8];
+
+	/* Save the chaining value for the feed-forward */
+
+	#pragma unroll 8
+	for(int i=0; i<8; i++) {
+		IV[0][i] = A[i];
+		IV[1][i] = (&A[8])[i];
+		IV[2][i] = (&A[16])[i];
+		IV[3][i] = (&A[24])[i];
+	}
+
+	/* XOR the message to the chaining value */
+	/* we can XOR word-by-word */
+	#pragma unroll 8
+	for(int i=0; i<8; i++) {
+		A[i] ^= M[i];
+		(&A[8])[i] ^= M[8+i];
+	}
+
+	/* Run the feistel ladders with the expanded message */
+	Round8(A, expanded, 0, 3, 23, 17, 27);
+	Round8(A, expanded, 1, 28, 19, 22, 7);
+	Round8(A, expanded, 2, 29, 9, 15, 5);
+	Round8(A, expanded, 3, 4, 13, 10, 25);
+
+	STEP8_IF(IV[0], 32,  4, 13, A, &A[8], &A[16], &A[24]);
+	STEP8_IF(IV[1], 33, 13, 10, &A[24], A, &A[8], &A[16]);
+	STEP8_IF(IV[2], 34, 10, 25, &A[16], &A[24], A, &A[8]);
+	STEP8_IF(IV[3], 35, 25,  4, &A[8], &A[16], &A[24], A);
+}
+
+
+/***************************************************/
+
+__device__ __forceinline__
+void SIMDHash(const uint32_t *data, uint32_t *hashval)
+{
+	uint32_t A[32];
+	uint32_t buffer[16];
+
+	#pragma unroll 32
+	for (int i=0; i < 32; i++) A[i] = c_IV_512[i];
+
+	#pragma unroll 16
+	for (int i=0; i < 16; i++) buffer[i] = data[i];
+
+	/* Message Expansion using Number Theoretical Transform similar to FFT */
+	int expanded[256];
+	{
+		#pragma unroll 16
+		for(int i=0; i<64; i+=4) {
+			expanded[i+0] = __byte_perm(buffer[i/4],0,0x4440);
+			expanded[i+1] = __byte_perm(buffer[i/4],0,0x4441);
+			expanded[i+2] = __byte_perm(buffer[i/4],0,0x4442);
+			expanded[i+3] = __byte_perm(buffer[i/4],0,0x4443);
+		}
+
+		#pragma unroll 16
+		for(int i=64; i<128; i+=4) {
+			expanded[i+0] = 0;
+			expanded[i+1] = 0;
+			expanded[i+2] = 0;
+			expanded[i+3] = 0;
+		}
+
+		FFT_256_halfzero(expanded);
+	}
+
+	/* Compression Function */
+	 SIMD_Compress(A, expanded, buffer);
+
+	/* Padding Round with known input (hence the FFT can be precomputed) */
+	buffer[0] = 512;
+
+	#pragma unroll 15
+	for (int i=1; i < 16; i++) buffer[i] = 0;
+
+	SIMD_Compress(A, c_FFT, buffer);
+
+	#pragma unroll 16
+	for (int i=0; i < 16; i++)
+		hashval[i] = A[i];
+}
+
+/***************************************************/
+__global__
+void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+
+		const int hashPosition = nounce - startNounce;
+		uint32_t *Hash = (uint32_t*) &g_hash[8 * hashPosition];
+
+		SIMDHash(Hash, Hash);
+	}
+}
+
+#else
+__global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {}
+#endif /* __CUDA_ARCH__ */
+
+__host__
+static  void x11_simd512_cpu_init_sm2(int thr_id)
+{
+#ifndef DEVICE_DIRECT_CONSTANTS
+	cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
+#endif
+//	cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice);
+//	cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice);
+//	cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice);
+}
+
+__host__
+static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+{
+	const int threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	size_t shared_size = 0;
+
+	x11_simd512_gpu_hash_64_sm2<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	MyStreamSynchronize(NULL, order, thr_id);
+}
--- a/x11/s3.cu
+++ b/x11/s3.cu
@ -43,6 +43,21 @@ extern "C" void s3hash(void *output, const void *input)
				@@ -43,6 +43,21 @@ extern "C" void s3hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

+#ifdef _DEBUG
+#define TRACE(algo) { \
+	if (max_nonce == 1 && pdata[19] <= 1) { \
+		uint32_t* debugbuf = NULL; \
+		cudaMallocHost(&debugbuf, 32); \
+		cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \
+		printf("S3 %s %08x %08x %08x %08x...%08x\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
+			swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \
+		cudaFreeHost(debugbuf); \
+	} \
+}
+#else
+#define TRACE(algo) {}
+#endif
+
 static bool init[MAX_GPUS] = { 0 };

 /* Main S3 entry point */
@ -60,18 +75,24 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un
				@@ -60,18 +75,24 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0xF;
+		ptarget[7] = 0xF;

 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));

 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);

-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
-
 		cuda_check_cpu_init(thr_id, throughput);

 		init[thr_id] = true;
@ -90,8 +111,11 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un
				@@ -90,8 +111,11 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un
 		int order = 0;

 		x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		TRACE("shavite:");
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		TRACE("simd   :");
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		TRACE("skein  :");

 		*hashes_done = pdata[19] - first_nonce + throughput;

--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -114,11 +114,17 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u
				@@ -114,11 +114,17 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x5;
+		ptarget[7] = 0x5;

 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}

 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);