diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index 860d1c7..af5d2a4 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -265,7 +265,7 @@ int scanhash_heavy(int thr_id, uint32_t *pdata,
         {
             size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU;
             CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost));
-            cudaDeviceSynchronize();
+            cudaThreadSynchronize();
 
             for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++)
             {
diff --git a/pentablake.cu b/pentablake.cu
index a971910..c10ce48 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -127,99 +127,6 @@ static const uint64_t d_constHashPadding[8] = {
 	0x0002000000000000ull
 };
 
-#if 0
-
-__device__ __constant__
-static const uint64_t __align__(32) c_Padding[16] = {
-	0, 0, 0, 0,
-	0x80000000ULL, 0, 0, 0,
-	0, 0, 0, 0,
-	0, 1, 0, 640,
-};
-
-__device__ static
-void pentablake_compress(uint64_t *h, const uint64_t *block, const uint32_t T0)
-{
-	uint64_t v[16], m[16];
-
-	m[0] = block[0];
-	m[1] = block[1];
-	m[2] = block[2];
-	m[3] = block[3];
-
-	for (uint32_t i = 4; i < 16; i++) {
-		m[i] = (T0 == 0x200) ? block[i] : c_Padding[i];
-	}
-
-	//#pragma unroll 8
-	for(uint32_t i = 0; i < 8; i++)
-		v[i] = h[i];
-
-	v[ 8] = c_u512[0];
-	v[ 9] = c_u512[1];
-	v[10] = c_u512[2];
-	v[11] = c_u512[3];
-
-	v[12] = xor1(c_u512[4], T0);
-	v[13] = xor1(c_u512[5], T0);
-	v[14] = c_u512[6];
-	v[15] = c_u512[7];
-
-	for (uint32_t i = 0; i < 16; i++) {
-		/* column step */
-		G(0, 4, 0x8, 0xC, 0x0);
-		G(1, 5, 0x9, 0xD, 0x2);
-		G(2, 6, 0xA, 0xE, 0x4);
-		G(3, 7, 0xB, 0xF, 0x6);
-		/* diagonal step */
-		G(0, 5, 0xA, 0xF, 0x8);
-		G(1, 6, 0xB, 0xC, 0xA);
-		G(2, 7, 0x8, 0xD, 0xC);
-		G(3, 4, 0x9, 0xE, 0xE);
-	}
-
-	//#pragma unroll 16
-	for (uint32_t i = 0; i < 16; i++) {
-		uint32_t j = i % 8;
-		h[j] ^= v[i];
-	}
-}
-
-__global__
-void pentablake_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
-{
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		const uint32_t nounce = startNounce + thread;
-		uint64_t h[8];
-
-		#pragma unroll
-		for(int i=0; i<8; i++) {
-			h[i] = c_IV512[i];
-		}
-
-		uint64_t ending[4];
-		ending[0] = c_data[16];
-		ending[1] = c_data[17];
-		ending[2] = c_data[18];
-		ending[3] = nounce; /* our tested value */
-
-		pentablake_compress(h, ending, 640);
-
-		// -----------------------------------
-
-		for (int r = 0; r < 4; r++) {
-			uint64_t data[8];
-			for (int i = 0; i < 7; i++) {
-				data[i] = h[i];
-			}
-			pentablake_compress(h, data, 512); /* todo: use h,h when ok*/
-		}
-	}
-}
-#endif
-
 __device__ static
 void pentablake_compress(uint64_t *h, const uint64_t *block, const uint64_t T0)
 {
@@ -377,33 +284,6 @@ void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint3
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 
-#if 0
-
-__host__
-uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
-{
-	const int threadsperblock = TPB;
-	uint32_t result = UINT32_MAX;
-
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
-	size_t shared_size = 0;
-
-	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
-	if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
-		return result;
-
-	pentablake_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id]);
-	cudaDeviceSynchronize();
-	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
-		cudaThreadSynchronize();
-		result = h_resNounce[thr_id][0];
-		extra_results[0] = h_resNounce[thr_id][1];
-	}
-	return result;
-}
-#endif
-
 __global__
 void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce)
 {