diff --git a/configure.ac b/configure.ac
index c369201..2bd9d2e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.2.1], [], [ccminer], [http://github.com/tpruvot/ccminer])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/skunk/cuda_skunk_streebog.cu b/skunk/cuda_skunk_streebog.cu
index c38de11..d44d77b 100644
--- a/skunk/cuda_skunk_streebog.cu
+++ b/skunk/cuda_skunk_streebog.cu
@@ -204,7 +204,7 @@ static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uin
 __constant__ uint64_t target64[4];
 
 __host__
-void skunk_set_target(uint32_t* ptarget)
+void skunk_streebog_set_target(uint32_t* ptarget)
 {
 	cudaMemcpyToSymbol(target64, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 }
diff --git a/skunk/skunk.cu b/skunk/skunk.cu
index f89c5fb..c1add50 100644
--- a/skunk/skunk.cu
+++ b/skunk/skunk.cu
@@ -23,12 +23,12 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x13_fugue512_cpu_free(int thr_id);
-extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
-extern void streebog_set_target(const uint32_t* ptarget);
+extern void streebog_sm3_set_target(uint32_t* ptarget);
+extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
 
 // krnlx merged kernel (for high-end cards only)
 extern void skunk_cpu_init(int thr_id, uint32_t threads);
-extern void skunk_set_target(uint32_t* ptarget);
+extern void skunk_streebog_set_target(uint32_t* ptarget);
 extern void skunk_setBlock_80(int thr_id, void *pdata);
 extern void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
@@ -117,10 +117,10 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
 	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
 	if (use_compat_kernels[thr_id]) {
 		skein512_cpu_setBlock_80(endiandata);
-		streebog_set_target(ptarget);
+		streebog_sm3_set_target(ptarget);
 	} else {
 		skunk_setBlock_80(thr_id, endiandata);
-		skunk_set_target(ptarget);
+		skunk_streebog_set_target(ptarget);
 	}
 
 	do {
@@ -129,7 +129,7 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
 			skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 			x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 			x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-			streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+			streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
 		} else {
 			skunk_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
 			skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
diff --git a/x11/cuda_streebog.cu b/x11/cuda_streebog.cu
index 228c691..d6e3685 100644
--- a/x11/cuda_streebog.cu
+++ b/x11/cuda_streebog.cu
@@ -806,10 +806,11 @@ void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
 #define T6(x) shared[6][x]
 #define T7(x) shared[7][x]
 
+// Streebog final for Veltor and skunk on SM 3.x
 __constant__ uint64_t target64[4];
 
 __host__
-void streebog_set_target(const uint32_t* ptarget)
+void streebog_sm3_set_target(uint32_t* ptarget)
 {
 	cudaMemcpyToSymbol(target64,ptarget,4*sizeof(uint64_t),0,cudaMemcpyHostToDevice);
 }
@@ -995,7 +996,7 @@ void streebog_gpu_hash_64_final(uint64_t *g_hash, uint32_t* resNonce)
 }
 
 __host__
-void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce)
+void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce)
 {
 	dim3 grid((threads + TPB-1) / TPB);
 	dim3 block(TPB);
diff --git a/x11/veltor.cu b/x11/veltor.cu
index be05f5a..7bc1e18 100644
--- a/x11/veltor.cu
+++ b/x11/veltor.cu
@@ -10,11 +10,17 @@ extern "C" {
 #include "cuda_x11.h"
 
 extern void skein512_cpu_setBlock_80(void *pdata);
-extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
-extern void streebog_set_target(const uint32_t* ptarget);
+
+// for SM3.x
+extern void streebog_sm3_set_target(uint32_t* ptarget);
+extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
+
+// for latest cards only
+extern void skunk_cpu_init(int thr_id, uint32_t threads);
+extern void skunk_streebog_set_target(uint32_t* ptarget);
+extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
 
 #include <stdio.h>
 #include <memory.h>
@@ -23,7 +29,7 @@ extern void streebog_set_target(const uint32_t* ptarget);
 static uint32_t *d_hash[MAX_GPUS];
 static uint32_t *d_resNonce[MAX_GPUS];
 
-// veltorcoin CPU Hash
+// veltor CPU Hash
 extern "C" void veltorhash(void *output, const void *input)
 {
 	unsigned char _ALIGN(128) hash[128] = { 0 };
@@ -53,6 +59,7 @@ extern "C" void veltorhash(void *output, const void *input)
 }
 
 static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };
 
 extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
@@ -80,7 +87,9 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
-		quark_skein512_cpu_init(thr_id, throughput);
+		skunk_cpu_init(thr_id, throughput);
+		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+
 		x11_shavite512_cpu_init(thr_id, throughput);
 
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
@@ -97,14 +106,20 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
 	skein512_cpu_setBlock_80(endiandata);
 
 	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
-	streebog_set_target(ptarget);
+	if(use_compat_kernels[thr_id])
+		streebog_sm3_set_target(ptarget);
+	else
+		skunk_streebog_set_target(ptarget);
 
 	do {
 		int order = 0;
 		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
 		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+		if(use_compat_kernels[thr_id])
+			streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+		else
+			skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
 
 		cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);