diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu
index cda5f1c..dcea61f 100644
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@@ -15,6 +15,7 @@ typedef uint48 uint4x2;
 #ifdef __INTELLISENSE__
 #define __CUDA_ARCH__ 500
 #define __byte_perm(x,y,c) x
+#define __shfl(x,y,c) x
 #define atomicExch(p,x) x
 #endif
 
@@ -80,26 +81,6 @@ __constant__ uint32_t BLAKE2S_SIGMA[10][16] = {
 #define shf_r_clamp32(out,a,b,shift) \
 	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(out) : "r"(a), "r"(b), "r"(shift));
 
-__device__ __forceinline__
-static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2)
-{
-#if __CUDA_ARCH__ >= 320
-	uint32_t shift = 32U - shift2;
-	asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift));
-	asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift));
-#else
-	// to check
-	shift256R(ret, vec4, shift2);
-#endif
-}
-
 #if __CUDA_ARCH__ >= 300
 __device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
 {
@@ -165,6 +146,26 @@ __device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_
 
 #if __CUDA_ARCH__ < 500
 
+__device__ __forceinline__
+static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2)
+{
+#if __CUDA_ARCH__ >= 320
+	uint32_t shift = 32U - shift2;
+	asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift));
+	asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift));
+#else
+	// to check
+	shift256R(ret, vec4, shift2);
+#endif
+}
+
 #define BLAKE(a, b, c, d, key1, key2) { \
 	a += key1; \
 	a += b; d = rotateL(d^a, 16); \
@@ -721,7 +722,6 @@ static __forceinline__ __device__
 void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data)
 {
 	uint2x4 output[8];
-	uchar4 bufhelper;
 	uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U];
 	uint32_t qbuf, rbuf, bitbuf;
 	uint32_t input[BLAKE2S_BLOCK_SIZE / 4];
@@ -787,6 +787,9 @@ void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const
 		for (int k = 0; k<8; k++)
 			asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[k]) : "r"(temp[k]), "r"(temp[k + 1]), "r"(bitbuf));
 
+#else
+		//#error SM 3.0 code missing here
+		printf("", data18, data20);
 #endif
 		Blake2S(input, input, key);
 	}
@@ -1095,6 +1098,7 @@ uint32_t fastkdf32_v1(uint32_t thread, const uint32_t nonce, uint32_t* const sal
 	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf));
 #else
 	//#error SM 3.0 code missing here
+	printf("", data18, data20);
 #endif
 	for (int k = 0; k < 9; k++) {
 		B0[(k + qbuf) & 0x3f] = temp[k];
@@ -1465,7 +1469,7 @@ static __thread uint32_t *Trans2 = NULL; // 2 streams
 static __thread uint32_t *Trans3 = NULL; // 2 streams
 
 __host__
-void neoscrypt_init_2stream(int thr_id, uint32_t threads)
+void neoscrypt_init(int thr_id, uint32_t threads)
 {
 	CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t)));
 	CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads)));
@@ -1480,7 +1484,7 @@ void neoscrypt_init_2stream(int thr_id, uint32_t threads)
 }
 
 __host__
-void neoscrypt_free_2stream(int thr_id)
+void neoscrypt_free(int thr_id)
 {
 	cudaFree(d_NNonce[thr_id]);
 
@@ -1491,18 +1495,15 @@ void neoscrypt_free_2stream(int thr_id)
 }
 
 __host__
-void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum)
+void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum)
 {
 	CUDA_SAFE_CALL(cudaMemset(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t)));
 
-	const int threadsperblock = TPB;
-	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
-	dim3 block(threadsperblock);
-
 	const int threadsperblock2 = TPB2;
 	dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2);
 	dim3 block2(threadsperblock2);
 
+	const int threadsperblock = TPB;
 	dim3 grid3((threads * 4 + threadsperblock - 1) / threadsperblock);
 	dim3 block3(4, threadsperblock >> 2);
 
diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp
index ad094b6..fd38f23 100644
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@@ -6,9 +6,9 @@
 
 extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);
 
-extern void neoscrypt_init_2stream(int thr_id, uint32_t threads);
-extern void neoscrypt_free_2stream(int thr_id);
-extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);
+extern void neoscrypt_init(int thr_id, uint32_t threads);
+extern void neoscrypt_free(int thr_id);
+extern void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);
 
 static bool init[MAX_GPUS] = { 0 };
 
@@ -21,14 +21,12 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	int dev_id = device_map[thr_id];
 
 	int intensity = is_windows() ? 18 : 19;
-	if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; // also need more than 2GB
+	if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB
 
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	throughput = throughput / 32; /* set for max intensity ~= 20 */
 	api_set_throughput(thr_id, throughput);
 
-	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce + 1);
-
 	if (opt_benchmark)
 		ptarget[7] = 0x00ff;
 
@@ -49,7 +47,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	}
 
 	gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput);
-	neoscrypt_init_2stream(thr_id, throughput);
+	neoscrypt_init(thr_id, throughput);
 	init[thr_id] = true;
 	}
 
@@ -66,7 +64,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 
 	do {
 		uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };
-		neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum);
+		neoscrypt_hash_k4(thr_id, throughput, pdata[19], foundNonces, have_stratum);
 
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
@@ -111,7 +109,7 @@ void free_neoscrypt(int thr_id)
 
 	cudaThreadSynchronize();
 
-	neoscrypt_free_2stream(thr_id);
+	neoscrypt_free(thr_id);
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
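
Note on the two "//#error SM 3.0 code missing here" stubs above: the fast paths in fastkdf256_v1 and fastkdf32_v1 rely on the SM 3.2+ shf.r.clamp.b32 funnel shift, and the older-architecture #else branch stays unimplemented; the added printf("", data18, data20) apparently only keeps those variables referenced so the compiler does not warn in that branch. If the branch were ever filled in, a portable equivalent could look like the sketch below. funnel_shift_r is a hypothetical helper, not part of this patch, and it assumes the shift count stays in [0,32] (which holds for bitbuf here):

// Hypothetical SM < 3.2 fallback for shf.r.clamp.b32 (a sketch, not in the patch).
// Returns the low 32 bits of the 64-bit value {hi:lo} shifted right by n, 0 <= n <= 32.
__device__ __forceinline__
static uint32_t funnel_shift_r(uint32_t lo, uint32_t hi, uint32_t n)
{
	return (uint32_t)((((uint64_t)hi << 32) | lo) >> n);
}

// Usage mirroring the inline asm loop in fastkdf256_v1:
//   for (int k = 0; k < 8; k++)
//       key[k] = funnel_shift_r(temp[k], temp[k + 1], bitbuf);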
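
Note on the intensity change in scanhash_neoscrypt: the arithmetic behind the new GTX 10xx default is easy to sanity-check from the values visible in the patch (the "more than 2GB" remark is the patch's own memory note, not derived here):

// Worked numbers for the new default, as a sketch:
uint32_t intensity  = 21;              // raised from 20
uint32_t requested  = 1U << intensity; // 2,097,152 nonces requested
uint32_t throughput = requested / 32;  // 65,536 cuda threads reported by gpulog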