diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
index 191db70..c3a3ea2 100644
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@@ -409,9 +409,9 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
 	}
 
 	if (!init[thr_id]) {
-		if (active_gpus > 1)
-			cudaSetDevice(device_map[thr_id]);
+		cudaSetDevice(device_map[thr_id]);
 		CUDA_LOG_ERROR();
+
 		cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t));
 		cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t));
 		CUDA_LOG_ERROR();
diff --git a/README.txt b/README.txt
index 5c30709..4c1c0b7 100644
--- a/README.txt
+++ b/README.txt
@@ -229,7 +229,8 @@ features.
 >>> RELEASE HISTORY <<<
 
   Under Dev...    v1.7
-                  Improve lyra2 (v1) cuda implementation
+                  Add Windows support for SM 2.1 and drop SM 3.5 (x86)
+                  Improve lyra2 (v1/v2) cuda implementation
                   Restore whirlpool algo (and whirlcoin variant)
                   Prepare algo switch ability
                   Add --benchmark alone to run a benchmark for all algos
diff --git a/bench.cpp b/bench.cpp
index 0a45a41..31944ae 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -96,6 +96,7 @@ bool bench_algo_switch_next(int thr_id)
 	// skip some duplicated algos
 	if (algo == ALGO_C11) algo++; // same as x11
 	if (algo == ALGO_DMD_GR) algo++; // same as groestl
+	if (algo == ALGO_MJOLLNIR) algo++; // same as heavy
 	if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool
 
 	if (device_sm[dev_id] && device_sm[dev_id] < 300) {
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index f97035c..f6465b3 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -158,7 +158,7 @@
       <MaxRegCount>80</MaxRegCount>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>true</Keep>
-      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_35,sm_35</CodeGeneration>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21</CodeGeneration>
       <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
       <Optimization>O2</Optimization>
     </CudaCompile>
diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh
index e928e75..c85b862 100644
--- a/lyra2/cuda_lyra2v2_sm3.cuh
+++ b/lyra2/cuda_lyra2v2_sm3.cuh
@@ -17,7 +17,6 @@
 #define Ncol 4
 
 #define vectype ulonglong4
-#define u64type uint64_t
 #define memshift 4
 
 __device__ vectype *DMatrix;
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index 54b8303..4553b5a 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -82,7 +82,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 	const uint32_t first_nonce = pdata[19];
 	int dev_id = device_map[thr_id];
 	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18;
-	uint32_t throughput = cuda_default_throughput(dev_id, 1U << intensity);
+	uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
@@ -100,6 +100,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		keccak256_cpu_init(thr_id,throughput);
 		skein256_cpu_init(thr_id, throughput);
 		bmw256_cpu_init(thr_id, throughput);
+		CUDA_LOG_ERROR();
 
 		// DMatrix (780Ti may prefer 16 instead of 12, cf djm34)
 		CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], (size_t)12 * sizeof(uint64_t) * 4 * 4 * throughput));
@@ -118,7 +119,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 
 	uint32_t endiandata[20];
 	for (int k=0; k < 20; k++)
-		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+		be32enc(&endiandata[k], pdata[k]);
 
 	blake256_cpu_setBlock_80(pdata);
 	bmw256_setTarget(ptarget);