diff --git a/blake32.cu b/blake32.cu
index 5f4043b..025a2af 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -178,7 +178,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co
 
 __global__
 void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNounce,
-	const uint8_t nClzTarget, const int crcsum, const int rounds)
+	const uint64_t highTarget, const int crcsum, const int rounds)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -223,12 +223,11 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
 
 		// compare count of leading zeros h[6] + h[7]
 		uint64_t high64 = ((uint64_t*)h)[3];
-		uint32_t clz = cuda_clz64(high64);
-
-		if (clz >= nClzTarget)
+		if (high64 <= highTarget)
 #if NBN == 2
 		/* keep the smallest nounce, + extra one if found */
 		if (resNounce[0] > nounce) {
+			// printf("%llx %llx \n", high64, highTarget);
 			resNounce[1] = resNounce[0];
 			resNounce[0] = nounce;
 		}
@@ -241,7 +240,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
 }
 
 __host__
-uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint8_t clzTarget,
+uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget,
 	const uint32_t crcsum, const int8_t rounds)
 {
 	const int threadsperblock = TPB;
@@ -255,7 +254,7 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui
 	if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
 		return result;
 
-	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], clzTarget, crcsum, (int) rounds);
+	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds);
 	cudaDeviceSynchronize();
 	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		//cudaThreadSynchronize(); /* seems no more required */
@@ -282,7 +281,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 	uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce);
 	uint64_t targetHigh = ((uint64_t*)ptarget)[3];
-	uint32_t clzTarget = cuda_clz64(targetHigh);
 	uint32_t crcsum = MAXU;
 	int rc = 0;
 
@@ -318,7 +316,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 
 	do {
 		// GPU HASH
-		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], (uint8_t) clzTarget, crcsum, blakerounds);
+		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], targetHigh, crcsum, blakerounds);
 		if (foundNonce != MAXU)
 		{
 			uint32_t endiandata[20];
diff --git a/cuda_helper.h b/cuda_helper.h
index 56b80a7..4755d8a 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
 #endif
 
 // 64-bit ROTATE LEFT
-#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT
+#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
 __device__ __forceinline__
 uint64_t ROTL64(const uint64_t value, const int offset) {
 	uint2 result;
@@ -302,7 +302,7 @@ uint64_t ROTL64(const uint64_t value, const int offset) {
 	}
 	return  __double_as_longlong(__hiloint2double(result.y, result.x));
 }
-#elif __CUDA_ARCH__ >= 120
+#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
 __device__ __forceinline__
 uint64_t ROTL64(const uint64_t x, const int offset)
 {
@@ -323,61 +323,4 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 #define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
 #endif
 
-#ifdef WIN32
-#include <intrin.h>
-static uint32_t __inline __builtin_clz(uint32_t x) {
-	unsigned long r = 0;
-	_BitScanReverse(&r, x);
-	return (31-r);
-}
-static uint32_t __inline __builtin_ctz(uint32_t x) {
-	unsigned long r = 0;
-	_BitScanForward(&r, x);
-	return r;
-}
-#endif
-
-/* count leading zeros of a 64bit int */
-#if __CUDA_ARCH__ >= 200
-__device__
-static uint32_t cuda_clz64(const uint64_t x)
-{
-	uint32_t result;
-	asm("clz.b64 %0, %1;\n"
-		: "=r"(result) : "l"(x));
-	return result;
-}
-#else
-/* host */
-static uint32_t cuda_clz64(const uint64_t x)
-{
-	uint32_t u32 = (x >> 32);
-	uint32_t result = u32 ? __builtin_clz(u32) : 32;
-	if (result == 32) {
-		u32 = (uint32_t) x;
-		result += (u32 ? __builtin_clz(u32) : 32);
-	}
-	return result;
-}
-#endif
-
-/* count trailing zeros of a 32bit int */
-#if __CUDA_ARCH__ >= 200
-__device__
-static uint32_t cuda_ctz32(const uint32_t x)
-{
-	uint32_t result;
-	asm("brev.b32 %1, %1;\n\t"
-		"clz.b32 %0, %1;\n"
-		: "=r"(result) : "r"(x));
-	return result;
-}
-#else
-/* host */
-static uint32_t cuda_ctz32(const uint32_t x)
-{
-	return x ? __builtin_ctz(x) : 32;
-}
-#endif
-
 #endif // #ifndef CUDA_HELPER_H