diff --git a/Algo256/blake2s.cu b/Algo256/blake2s.cu
index ca0f8f2..e2601e4 100644
--- a/Algo256/blake2s.cu
+++ b/Algo256/blake2s.cu
@@ -28,7 +28,7 @@ static __thread blake2s_state ALIGN(A) s_ctx;
 #endif
 
 #ifndef GPU_MIDSTATE
-__constant__ uint32_t d_data[20];
+__constant__ uint2 d_data[10];
 #else
 __constant__ blake2s_state ALIGN(8) d_state[1];
 #endif
@@ -42,7 +42,9 @@ static uint32_t *h_resNonce[MAX_GPUS];
 
 /* max count of found nonces in one call */
 #define NBN 2
+#if NBN > 1
 static uint32_t extra_results[NBN] = { UINT32_MAX };
+#endif
 
 extern "C" void blake2s_hash(void *output, const void *input)
 {
@@ -196,6 +198,7 @@ void gpu_blake2s_compress(blake2s_state *S, const uint32_t *block)
 	#undef ROUND
 }
 
+#if 0
 /* unused but kept as reference */
 __device__ __forceinline__
 void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen)
@@ -225,17 +228,21 @@ void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen)
 		}
 	}
 }
+#endif
 
+#ifndef GPU_MIDSTATE
 __device__ __forceinline__
-void gpu_blake2s_update76(blake2s_state *S, const void *input)
+void gpu_blake2s_fill_data(blake2s_state *S, const uint32_t nonce)
 {
-	uint64_t *b64 = (uint64_t*) S->buf;
-	uint64_t *i64 = (uint64_t*) input;
+	uint2 *b2 = (uint2*) S->buf;
 	#pragma unroll
-	for (int i=0; i < 80/8; i++)
-		b64[i] = i64[i];
-	//S->buflen = 76;
+	for (int i=0; i < 9; i++)
+		b2[i] = d_data[i];
+	b2[9].x = d_data[9].x;
+	b2[9].y = nonce;
+	S->buflen = 80;
 }
+#endif
 
 __device__ __forceinline__
 void gpu_blake2s_update_nonce(blake2s_state *S, const uint32_t nonce)
@@ -245,7 +252,7 @@ void gpu_blake2s_update_nonce(blake2s_state *S, const uint32_t nonce)
 }
 
 __device__ __forceinline__
-void gpu_blake2s_final(blake2s_state *S, uint32_t *out)
+uint2 gpu_blake2s_final(blake2s_state *S)
 {
 	//if (S->buflen > BLAKE2S_BLOCKBYTES)
 	{
@@ -260,9 +267,10 @@ void gpu_blake2s_final(blake2s_state *S, uint32_t *out)
 	//memset(&S->buf[S->buflen], 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen); /* Padding */
 	gpu_blake2s_compress(S, (uint32_t*) (S->buf + BLAKE2S_BLOCKBYTES));
 
-	#pragma unroll
-	for (int i = 0; i < 8; i++)
-		out[i] = S->h[i];
+	//#pragma unroll
+	//for (int i = 0; i < 8; i++)
+	//	out[i] = S->h[i];
+	return make_uint2(S->h[6], S->h[7]);
 }
 
 /* init2 xors IV with input parameter block */
@@ -286,12 +294,13 @@ void gpu_blake2s_init_param(blake2s_state *S, const blake2s_param *P)
 	S->buflen = 0;
 
 	#pragma unroll
-	for (int i = 0; i < sizeof(S->buf)/8; i++)
+	for (int i = 8; i < sizeof(S->buf)/8; i++)
 		gpu_store64(S->buf + (8*i), 0);
 
 	uint64_t *p = (uint64_t*) P;
 
 	/* IV XOR ParamBlock */
+	#pragma unroll
 	for (int i = 0; i < 4; i++)
 		S->h[i] ^= gpu_load64(&p[i]);
 }
@@ -333,25 +342,23 @@ void gpu_copystate(blake2s_state *dst, blake2s_state *src)
 }
 
 __global__
-void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
+void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2, const int swap)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	const uint32_t nonce = startNonce + thread;
+	const uint32_t nonce = swap ? cuda_swab32(startNonce + thread) : startNonce + thread;
 	blake2s_state ALIGN(8) blake2_ctx;
 
 #ifndef GPU_MIDSTATE
 	gpu_blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
 	//gpu_blake2s_update(&blake2_ctx, (uint8_t*) d_data, 76);
-	gpu_blake2s_update76(&blake2_ctx, (uint64_t*) d_data);
+	gpu_blake2s_fill_data(&blake2_ctx, nonce);
 #else
 	gpu_copystate(&blake2_ctx, &d_state[0]);
-#endif
 	gpu_blake2s_update_nonce(&blake2_ctx, nonce);
+#endif
 
-	uint32_t hash[8];
-	gpu_blake2s_final(&blake2_ctx, hash);
-
-	if (hash[7] <= target2.x && hash[6] <= target2.y) {
+	uint2 h2 = gpu_blake2s_final(&blake2_ctx);
+	if (h2.y <= target2.y && h2.x <= target2.x) {
 #if NBN == 2
 		if (resNonce[0] != UINT32_MAX)
 			resNonce[1] = nonce;
@@ -363,8 +370,12 @@ void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_
 	}
 }
 
+static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
+	return iftrue ? swab32(val) : val;
+}
+
 __host__
-uint32_t blake2s_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2)
+uint32_t blake2s_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, const int swap)
 {
 	uint32_t result = UINT32_MAX;
 
@@ -375,13 +386,15 @@ uint32_t blake2s_hash_cuda(const int thr_id, const uint32_t threads, const uint3
 	if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
 		return result;
 
-	blake2s_gpu_hash <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], target2);
+	blake2s_gpu_hash <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], target2, swap);
 	cudaThreadSynchronize();
 
 	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
-		result = h_resNonce[thr_id][0];
+		result = swab32_if(h_resNonce[thr_id][0], swap);
+#if NBN > 1
 		for (int n=0; n < (NBN-1); n++)
-			extra_results[n] = h_resNonce[thr_id][n+1];
+			extra_results[n] = swab32_if(h_resNonce[thr_id][n+1], swap);
+#endif
 	}
 	return result;
 }
@@ -393,6 +406,7 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
 	uint32_t _ALIGN(64) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
+	const int swap = 1; // to toggle nonce endian
 
 	const uint32_t first_nonce = pdata[19];
 
@@ -436,10 +450,10 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
 
 	blake2s_setBlock(endiandata, &s_midstate);
 
-	const uint2 target = make_uint2(ptarget[7], ptarget[6]);
+	const uint2 target = make_uint2(ptarget[6], ptarget[7]);
 
 	do {
-		uint32_t foundNonce = blake2s_hash_cuda(thr_id, throughput, pdata[19], target);
+		uint32_t foundNonce = blake2s_hash_cuda(thr_id, throughput, pdata[19], target, swap);
 
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
@@ -448,18 +462,18 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
 			uint32_t _ALIGN(A) vhashcpu[8];
 
 			//blake2s_hash(vhashcpu, endiandata);
-			le32enc(&endiandata[19], foundNonce);
+			endiandata[19] = swab32_if(foundNonce, swap);
 			blake2s_hash_end(vhashcpu, endiandata);
 
-			if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) {
+			if (vhashcpu[7] <= target.y && fulltest(vhashcpu, ptarget)) {
 				work_set_target_ratio(work, vhashcpu);
-				pdata[19] = work->nonces[0] = swab32(foundNonce);
+				pdata[19] = work->nonces[0] = swab32_if(foundNonce, !swap);
 #if NBN > 1
 				if (extra_results[0] != UINT32_MAX) {
-					le32enc(&endiandata[19], extra_results[0]);
+					endiandata[19] = swab32_if(extra_results[0], swap);
 					blake2s_hash_end(vhashcpu, endiandata);
-					if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) {
-						work->nonces[1] = swab32(extra_results[0]);
+					if (vhashcpu[7] <= target.y && fulltest(vhashcpu, ptarget)) {
+						work->nonces[1] = swab32_if(extra_results[0], !swap);
 						if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) {
 							work_set_target_ratio(work, vhashcpu);
 							xchg(work->nonces[1], pdata[19]);
@@ -472,9 +486,6 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
 				return 1;
 			} else {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
-				applog_hex(pdata, 80);
-				applog_hex(ptarget, 32);
-				applog_hex(vhashcpu, 32);
 			}
 		}
 
diff --git a/skein2.cpp b/skein2.cpp
index 6e3e5a7..7edb6f4 100644
--- a/skein2.cpp
+++ b/skein2.cpp
@@ -37,12 +37,17 @@ void skein2hash(void *output, const void *input)
 
 static bool init[MAX_GPUS] = { 0 };
 
+static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
+	return iftrue ? swab32(val) : val;
+}
+
 int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	int dev_id = device_map[thr_id];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
+	const int swap = 1; // to toggle nonce endian
 
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@@ -81,7 +86,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 		int order = 0;
 
 		// Hash with CUDA
-		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 0);
+		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
 		*hashes_done = pdata[19] - first_nonce + throughput;
@@ -91,7 +96,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 		{
 			uint32_t _ALIGN(64) vhash64[8];
 
-			endiandata[19] = foundNonce;
+			endiandata[19] = swab32_if(foundNonce, swap);
 			skein2hash(vhash64, endiandata);
 
 			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
@@ -102,14 +107,14 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 					if (!opt_quiet)
 						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce));
 
-					endiandata[19] = secNonce;
+					endiandata[19] = swab32_if(secNonce, swap);
 					skein2hash(vhash64, endiandata);
 					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
 						work_set_target_ratio(work, vhash64);
-					pdata[21] = swab32(secNonce);
+					pdata[21] = swab32_if(secNonce, !swap);
 					res++;
 				}
-				pdata[19] = swab32(foundNonce);
+				pdata[19] = swab32_if(foundNonce, !swap);
 				return res;
 			} else {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);