diff --git a/gost/cuda_gosthash.cu b/gost/cuda_gosthash.cu
index becdddf..3846493 100644
--- a/gost/cuda_gosthash.cu
+++ b/gost/cuda_gosthash.cu
@@ -577,6 +577,14 @@ void GOST_Copy512(uint64_t* dst, uint64_t* const __restrict__ src)
 		dst[i] = src[i];
 }
 
+__device__ __forceinline__
+void GOST_Copy256(uint64_t* dst, uint64_t* const __restrict__ src)
+{
+	#pragma unroll
+	for (int i=0; i<4; i++)
+		dst[i] = src[i];
+}
+
 __device__ __forceinline__
 void GOST_Xor512(uint64_t* C, uint64_t* const A, const uint64_t* B)
 {
@@ -1058,21 +1066,21 @@ void GOST_hash_X(uint64_t *hash, uchar * const message, uint64_t len)
 
 __global__
 __launch_bounds__(128, 3)
-void streebog_gpu_hash_64(uint32_t threads, uint64_t *g_hash)
+void streebog_gpu_hash_64(uint32_t threads, uint64_t *g_hash) // 80 bytes input
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t* inout = (&g_hash[thread * 8U]);
 		uint64_t hash[8] = { 0 }; //iv 
-		GOST_hash_X(hash, (uchar*) inout, 512);
+		GOST_hash_X(hash, (uchar*) inout, 640);
 		GOST_Copy512(inout, hash);
 	}
 }
 
 __global__
 __launch_bounds__(128, 3)
-void streebog_gpu_hash_32(uint32_t threads, uint64_t *g_hash)
+void streebog_gpu_hash_32(uint32_t threads, uint64_t *g_hash) // 64 bytes input
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -1081,12 +1089,12 @@ void streebog_gpu_hash_32(uint32_t threads, uint64_t *g_hash)
 		uint64_t hash[8];
 		memset (&hash, 1, 64); // iv
 		GOST_hash_X(hash, (uchar*) inout, 512);
-		GOST_Copy512(inout, hash);
+		GOST_Copy256(inout, hash);
 	}
 }
 
 __host__
-void gost_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash)
+void gost_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
 {
 	const int threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1) / threadsperblock);
@@ -1096,7 +1104,7 @@ void gost_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *
 }
 
 __host__
-void gost_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash)
+void gost_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash)
 {
 	const int threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1) / threadsperblock);
diff --git a/gost/gost.cu b/gost/gost.cu
index f145e3d..273ff6c 100644
--- a/gost/gost.cu
+++ b/gost/gost.cu
@@ -32,8 +32,8 @@ extern "C" void gosthash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 
-extern void gost_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
-extern void gost_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
+extern void gost_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
+extern void gost_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash);
 
 //#define _DEBUG
 #define _DEBUG_PREFIX "sib"
@@ -66,24 +66,23 @@ extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce,
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 80 * throughput), -1);
 
 		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
 	}
 
-	uint32_t endiandata[20];
 	for (int k=0; k < 20; k++)
-		be32enc(&endiandata[k], pdata[k]);
+		be32enc(&d_hash[thr_id][k], pdata[k]);
 
 	do {
 		int order = 0;
 
 		// Hash with CUDA
-		gost_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		gost_hash_64(thr_id, throughput, d_hash[thr_id]);
 		TRACE("gost64   :");
-		gost_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		gost_hash_32(thr_id, throughput, d_hash[thr_id]);
 		TRACE("gost32   :");
 
 		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
@@ -91,8 +90,8 @@ extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce,
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t _ALIGN(64) vhash[8];
-			be32enc(&endiandata[19], work->nonces[0]);
-			gosthash(vhash, endiandata);
+			be32enc(&d_hash[thr_id][19], work->nonces[0]);
+			gosthash(vhash, d_hash[thr_id]);
 
 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 				work->valid_nonces = 1;
@@ -100,8 +99,8 @@ extern "C" int scanhash_gost(int thr_id, struct work* work, uint32_t max_nonce,
 				work->nonces[1] =cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				*hashes_done = pdata[19] - first_nonce + throughput;
 				if (work->nonces[1] != 0) {
-					be32enc(&endiandata[19], work->nonces[1]);
-					sibhash(vhash, endiandata);
+					be32enc(&d_hash[thr_id][19], work->nonces[1]);
+					sibhash(vhash, d_hash[thr_id]);
 					bn_set_target_ratio(work, vhash, 1);
 					work->valid_nonces++;
 					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;