diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
index 4ecec43..6cbd917 100644
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@@ -257,9 +257,8 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui
 		return result;
 
 	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds);
-	cudaDeviceSynchronize();
+	MyStreamSynchronize(NULL, 0, thr_id);
 	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
-		//cudaThreadSynchronize(); /* seems no more required */
 		result = h_resNonce[thr_id][0];
 		for (int n=0; n < (NBN-1); n++)
 			extra_results[n] = h_resNonce[thr_id][n+1];
@@ -343,9 +342,8 @@ static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, c
 		return result;
 
 	blake256_gpu_hash_16 <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], highTarget, (int) rounds, opt_tracegpu);
-	cudaDeviceSynchronize();
+	MyStreamSynchronize(NULL, 0, thr_id);
 	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
-		//cudaThreadSynchronize(); /* seems no more required */
 		result = h_resNonce[thr_id][0];
 		for (int n=0; n < (NBN-1); n++)
 			extra_results[n] = h_resNonce[thr_id][n+1];
diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu
index 3afd225..d8244ec 100644
--- a/lyra2/cuda_lyra2.cu
+++ b/lyra2/cuda_lyra2.cu
@@ -456,7 +456,7 @@ void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *
 		lyra2_gpu_hash_32_v30 <<<grid, block >>> (threads, startNounce, d_outputHash);
 	}
 
-	cudaDeviceSynchronize();
-	//MyStreamSynchronize(NULL, order, thr_id);
+	MyStreamSynchronize(NULL, order, thr_id);
+	//cudaThreadSynchronize();
 }
 
diff --git a/pentablake.cu b/pentablake.cu
index 49f49e3..a971910 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -316,8 +316,7 @@ void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce,
 
 	pentablake_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash);
 
-	//MyStreamSynchronize(NULL, order, thr_id);
-	cudaDeviceSynchronize();
+	MyStreamSynchronize(NULL, order, thr_id);
 }
 
 
@@ -375,8 +374,7 @@ void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint3
 
 	pentablake_gpu_hash_64 <<<grid, block, shared_size>>> (threads, startNounce, (uint64_t*)d_outputHash);
 
-	//MyStreamSynchronize(NULL, order, thr_id);
-	cudaDeviceSynchronize();
+	MyStreamSynchronize(NULL, order, thr_id);
 }
 
 #if 0
@@ -456,7 +454,7 @@ uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounc
 
 	pentablake_gpu_check_hash <<<grid, block, shared_size>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
 
-	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+	CUDA_SAFE_CALL(cudaThreadSynchronize());
 	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		cudaThreadSynchronize();
 		result = h_resNounce[thr_id][0];
@@ -560,6 +558,5 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 
 	*hashes_done = pdata[19] - first_nonce + 1;
-	cudaDeviceSynchronize();
 	return rc;
 }
diff --git a/qubit/qubit_luffa512.cu b/qubit/qubit_luffa512.cu
index a4a0ef2..c655538 100644
--- a/qubit/qubit_luffa512.cu
+++ b/qubit/qubit_luffa512.cu
@@ -466,7 +466,7 @@ uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t start
 	size_t shared_size = 0;
 
 	qubit_luffa512_gpu_finalhash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		//cudaThreadSynchronize();
 		result = h_resNounce[thr_id][0];