
lyra2v2, bmw256 and cubehash256 cleanup + diff fix

master
Tanguy Pruvot 9 years ago
commit 53cd591956
Changed files:
1. Algo256/cuda_bmw256.cu (306 changed lines)
2. Algo256/cuda_cubehash256.cu (56 changed lines)
3. ccminer.cpp (2 changed lines)
4. lyra2/lyra2REv2.cu (14 changed lines)

Algo256/cuda_bmw256.cu (306 changed lines)

@@ -3,28 +3,24 @@
 #include "cuda_helper.h"

-// the message with padding, for the computation on the GPU
-__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
-__constant__ uint32_t ZDH[16];
+#undef SPH_ROTL32
+#define SPH_ROTL32 ROTL32
+
 static uint32_t *d_gnounce[MAX_GPUS];
 static uint32_t *d_GNonce[MAX_GPUS];

-__constant__ uint32_t pTarget[8];
+__constant__ uint64_t pTarget[8];

 #define shl(x, n) ((x) << (n))
 #define shr(x, n) ((x) >> (n))
-//#define SHR(x, n) SHR2(x, n)
-//#define SHL(x, n) SHL2(x, n)
-
-#define ROTL32host(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
-// #define SPH_ROTL32 SPH_ROTL32

 #define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19))
 #define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23))
 #define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25))
 #define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29))
 #define ss4(x) (shr((x), 1) ^ (x))
 #define ss5(x) (shr((x), 2) ^ (x))

 #define rs1(x) SPH_ROTL32((x), 3)
 #define rs2(x) SPH_ROTL32((x), 7)
 #define rs3(x) SPH_ROTL32((x), 13)
@@ -34,67 +30,71 @@ __constant__ uint32_t pTarget[8];
 #define rs7(x) SPH_ROTL32((x), 27)

 /* Message expansion function 1 */
-__forceinline__ __device__ uint32_t expand32_1(int i, uint32_t *M32, uint32_t *H, uint32_t *Q)
+__forceinline__ __device__
+uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
 {
-#undef SPH_ROTL32
-#define SPH_ROTL32 ROTL32
 	return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13])
 		+ ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9])
 		+ ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5])
 		+ ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1])
-		+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
-#undef SPH_ROTL32
+		+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1)
+		+ SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1)
+		- SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
 }

 /* Message expansion function 2 */
-__forceinline__ __device__ uint32_t expand32_2(int i, uint32_t *M32, uint32_t *H, uint32_t *Q)
+__forceinline__ __device__
+uint32_t expand32_2(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
 {
-#undef SPH_ROTL32
-#define SPH_ROTL32 ROTL32
 	return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13])
 		+ Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9])
 		+ Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5])
 		+ Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])
-		+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
-#undef SPH_ROTL32
+		+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1)
+		+ SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1)
+		- SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
 }

-__forceinline__ __device__ void Compression256(uint32_t * M32, uint32_t * H)
+__forceinline__ __device__
+void Compression256(uint32_t * M32)
 {
-#undef SPH_ROTL32
-#define SPH_ROTL32 ROTL32
-	int i;
-	uint32_t XL32, XH32, Q[32];
+	uint32_t Q[32], XL32, XH32;
+
+	const uint32_t H[16] = {
+		0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F,
+		0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F,
+		0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F,
+		0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F
+	};

 	Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
 	Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
 	Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
 	Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
 	Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
 	Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
 	Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
 	Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
 	Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
 	Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
 	Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
 	Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
 	Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
 	Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
 	Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
 	Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);

 	/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe. */
 	Q[0] = ss0(Q[0]) + H[1];
 	Q[1] = ss1(Q[1]) + H[2];
 	Q[2] = ss2(Q[2]) + H[3];
 	Q[3] = ss3(Q[3]) + H[4];
 	Q[4] = ss4(Q[4]) + H[5];
 	Q[5] = ss0(Q[5]) + H[6];
 	Q[6] = ss1(Q[6]) + H[7];
 	Q[7] = ss2(Q[7]) + H[8];
 	Q[8] = ss3(Q[8]) + H[9];
 	Q[9] = ss4(Q[9]) + H[10];
 	Q[10] = ss0(Q[10]) + H[11];
 	Q[11] = ss1(Q[11]) + H[12];
 	Q[12] = ss2(Q[12]) + H[13];
@@ -109,11 +109,13 @@ __forceinline__ __device__ void Compression256(uint32_t * M32, uint32_t * H)
 	/* The following relation for these parameters should is satisfied: */
 	/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */

-	for (i = 0; i<2; i++)
-		Q[i + 16] = expand32_1(i + 16, M32, H, Q);
-
-	for (i = 2; i<16; i++)
-		Q[i + 16] = expand32_2(i + 16, M32, H, Q);
+	#pragma unroll
+	for (int i=16; i<18; i++)
+		Q[i] = expand32_1(i, M32, H, Q);
+
+	#pragma nounroll
+	for (int i=18; i<32; i++)
+		Q[i] = expand32_2(i, M32, H, Q);

 	/* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */
 	/* 16 new variables that are prooduced in the Message Expansion part. */
@@ -124,62 +126,55 @@ __forceinline__ __device__ void Compression256(uint32_t * M32, uint32_t * H)
 	/* This part is the function f_2 - in the documentation */
 	/* Compute the double chaining pipe for the next message block. */
-	H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
-	H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
-	H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
-	H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
-	H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
-	H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
-	H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
-	H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
-	H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
-	H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
-	H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
-	H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
-	H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
-	H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
-	H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
-	H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
-#undef SPH_ROTL32
+	M32[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
+	M32[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
+	M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
+	M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
+	M32[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
+	M32[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
+	M32[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
+	M32[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
+	M32[8] = SPH_ROTL32(M32[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
+	M32[9] = SPH_ROTL32(M32[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
+	M32[10] = SPH_ROTL32(M32[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
+	M32[11] = SPH_ROTL32(M32[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
+	M32[12] = SPH_ROTL32(M32[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
+	M32[13] = SPH_ROTL32(M32[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
+	M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
+	M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
 }

-__forceinline__ __device__ void Compression256_2(uint32_t * M32, uint32_t * H)
+__forceinline__ __device__
+void Compression256_2(uint32_t * M32)
 {
-#undef SPH_ROTL32
-#define SPH_ROTL32 ROTL32
-	int i;
 	uint32_t XL32, XH32, Q[32];

-	/* This part is the function f0 - in the documentation */
-
-	/* First we mix the message block *M32 (M in the documatation) */
-	/* with the previous double pipe *H. */
-	/* For a fixed previous double pipe, or fixed message block, this */
-	/* part is bijection. */
-	/* This transformation diffuses every one bit difference in 5 words. */
-	Q[0] = (H[5]) - (H[7]) + (H[10]) + (H[13]) + (0x280 ^ H[14]);
-	Q[1] = (H[6]) - (H[8]) + (H[11]) + (0x280 ^ H[14]) - (H[15]);
-	Q[2] = (M32[0] ^ H[0]) + (H[7]) + (H[9]) - (H[12]) + (H[15]);
-	Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (H[8]) - (H[10]) + (H[13]);
-	Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (H[9]) - (H[11]) - (0x280 ^ H[14]);
-	Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (H[10]) - (H[12]) + (H[15]);
-	Q[6] = (0x80 ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (H[11]) + (H[13]);
-	Q[7] = (M32[1] ^ H[1]) - (0x80 ^ H[4]) - (H[5]) - (H[12]) - (0x280 ^ H[14]);
-	Q[8] = (M32[2] ^ H[2]) - (H[5]) - (H[6]) + (H[13]) - (H[15]);
-	Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (H[6]) - (H[7]) + (0x280 ^ H[14]);
-	Q[10] = (H[8]) - (M32[1] ^ H[1]) - (0x80 ^ H[4]) - (H[7]) + (H[15]);
-	Q[11] = (H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (H[5]) + (H[9]);
-	Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (H[6]) - (H[9]) + (H[10]);
-	Q[13] = (M32[2] ^ H[2]) + (0x80 ^ H[4]) + (H[7]) + (H[10]) + (H[11]);
-	Q[14] = (M32[3] ^ H[3]) - (H[5]) + (H[8]) - (H[11]) - (H[12]);
-	Q[15] = (H[12]) - (0x80 ^ H[4]) - (H[6]) - (H[9]) + (H[13]);
+	const uint32_t H[16] = {
+		0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3,
+		0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7,
+		0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab,
+		0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf
+	};
+
+	Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
+	Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
+	Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
+	Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
+	Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
+	Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
+	Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
+	Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
+	Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
+	Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
+	Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
+	Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
+	Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
+	Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
+	Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
+	Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);

 	/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/
 	Q[0] = ss0(Q[0]) + H[1];
 	Q[1] = ss1(Q[1]) + H[2];
 	Q[2] = ss2(Q[2]) + H[3];
@@ -204,41 +199,23 @@ __forceinline__ __device__ void Compression256_2(uint32_t * M32, uint32_t * H)
 	/* The following relation for these parameters should is satisfied: */
 	/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */

-	for (i = 0; i<2; i++)
-		Q[i + 16] = expand32_1(i + 16, M32, H, Q);
-
-	for (i = 2; i<16; i++)
-		Q[i + 16] = expand32_2(i + 16, M32, H, Q);
+	#pragma unroll
+	for (int i = 16; i<18; i++)
+		Q[i] = expand32_1(i, M32, H, Q);
+
+	#pragma nounroll
+	for (int i = 18; i<32; i++)
+		Q[i] = expand32_2(i, M32, H, Q);

 	/* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */
 	/* 16 new variables that are prooduced in the Message Expansion part. */
 	XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
-	XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
+	XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];

 	/* This part is the function f_2 - in the documentation */
 	/* Compute the double chaining pipe for the next message block. */
-	H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
-	H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
-	H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
-	H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
-	H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
-	H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
-	H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
-	H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
-	H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
-	H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
-	H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
-	H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
-	H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
-	H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
-	H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
-	H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
-#undef SPH_ROTL32
+	M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
+	M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
+	M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
+	M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
 }

 #define TPB 512

@@ -248,27 +225,8 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		uint32_t dh[16] = {
-			(0x40414243), (0x44454647),
-			(0x48494A4B), (0x4C4D4E4F),
-			(0x50515253), (0x54555657),
-			(0x58595A5B), (0x5C5D5E5F),
-			(0x60616263), (0x64656667),
-			(0x68696A6B), (0x6C6D6E6F),
-			(0x70717273), (0x74757677),
-			(0x78797A7B), (0x7C7D7E7F)
-		};
-		uint32_t final_s[16] = {
-			(0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2),
-			(0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5),
-			(0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8),
-			(0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab),
-			(0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae),
-			(0xaaaaaaaf)
-		};
-		uint32_t message[16]={0};
+		uint32_t message[16] = { 0 };

 		LOHI(message[0], message[1], __ldg(&g_hash[thread]));
 		LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads]));
 		LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads]));
@@ -276,10 +234,10 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash
 		message[8]=0x80;
 		message[14]=0x100;

-		Compression256(message, dh);
-		Compression256(dh, final_s);
+		Compression256(message);
+		Compression256_2(message);

-		if (((uint64_t*)final_s)[7] <= ((uint64_t*)pTarget)[3])
+		if (((uint64_t*)message)[7] <= pTarget[3])
 		{
 			uint32_t tmp = atomicExch(&nonceVector[0], startNounce + thread);
 			if (tmp != 0)
@@ -288,17 +246,15 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash
 		}
 	}
 }

 __host__
-void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash,uint32_t *resultnonces)
+void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces)
 {
-	cudaMemset(d_GNonce[thr_id], 0x0, 2 * sizeof(uint32_t));
 	const uint32_t threadsperblock = TPB;

-	// compute how many thread blocks we need
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);

+	cudaMemset(d_GNonce[thr_id], 0, 2 * sizeof(uint32_t));
 	bmw256_gpu_hash_32 << <grid, block >> >(threads, startNounce, g_hash, d_GNonce[thr_id]);
 	cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	resultnonces[0] = *(d_gnounce[thr_id]);
@@ -316,5 +272,5 @@ void bmw256_cpu_init(int thr_id, uint32_t threads)
 __host__
 void bmw256_setTarget(const void *pTargetIn)
 {
-	cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice);
 }
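
Note on the new target test (an aside, not part of the commit): pTarget is now a uint64_t[8] of which bmw256_setTarget fills only the first 32 bytes, and the kernel compares ((uint64_t*)message)[7], i.e. 32-bit words 14 and 15 of the BMW-256 output, against pTarget[3], the most significant 64 bits of the 256-bit little-endian target. A minimal host-side sketch of the same prefilter logic, with illustrative names:

#include <stdint.h>
#include <stdbool.h>

/* Sketch only: 'state' mirrors the kernel's 16-word message/state array.
 * Words 14..15 hold the most significant 64 bits of the 256-bit result in
 * little-endian word order, matching ((uint64_t*)message)[7] on the GPU. */
static bool below_target_64(const uint32_t state[16], const uint64_t target[4])
{
	uint64_t hi = ((uint64_t)state[15] << 32) | state[14];
	return hi <= target[3]; /* target[3] = top 64 bits of the share target */
}

Comparing only the top 64 bits is safe as a filter because every candidate nonce is re-verified on the CPU (fulltest() in scanhash_lyra2v2), which resolves the rare case where the top words are equal but lower words differ.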

Algo256/cuda_cubehash256.cu (56 changed lines)

@@ -13,7 +13,8 @@
 #define ROTATEUPWARDS11(a) LROT(a,11)

 //#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
-#define SWAP(a,b) { a ^= b; b ^=a; a ^=b;}
+#define SWAP(a,b) { a ^= b; b ^= a; a ^= b; }

 __device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2])
 {
 	int r;
@@ -155,7 +156,8 @@ __device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2]
 }

-void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32_t *data)
+__device__ __forceinline__
+void Update32(uint32_t x[2][2][2][2][2], const uint32_t *data)
 {
 	/* "xor the block into the first b bytes of the state" */
 	/* "and then transform the state invertibly through r identical rounds" */
@@ -163,24 +165,22 @@ void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32
 	rrounds(x);
 }

-void __device__ __forceinline__ Update32_const(uint32_t x[2][2][2][2][2])
+__device__ __forceinline__
+void Update32_const(uint32_t x[2][2][2][2][2])
 {
 	x[0][0][0][0][0] ^= 0x80;
 	rrounds(x);
 }

-void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
+__device__ __forceinline__
+void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
 {
-	int i;
 	/* "the integer 1 is xored into the last state word x_11111" */
-	x[1][1][1][1][1] ^= 1;
+	x[1][1][1][1][1] ^= 1U;

 	/* "the state is then transformed invertibly through 10r identical rounds" */
 	#pragma unroll 2
-	for (i = 0; i < 10; ++i) rrounds(x);
+	for (int i = 0; i < 10; ++i) rrounds(x);

 	/* "output the first h/8 bytes of the state" */
 	hash_fromx(hashval, x);
@@ -198,8 +198,8 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t Hash[8]; // = &g_hash[16 * hashPosition];

 		LOHI(Hash[0], Hash[1], __ldg(&g_hash[thread]));
 		LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads]));
 		LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads]));
@@ -207,19 +207,16 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g
 		uint32_t x[2][2][2][2][2] =
 		{
-			0xEA2BD4B4, 0xCCD6F29F, 0x63117E71,
-			0x35481EAE, 0x22512D5B, 0xE5D94E63,
-			0x7E624131, 0xF4CC12BE, 0xC2D0B696,
-			0x42AF2070, 0xD0720C35, 0x3361DA8C,
-			0x28CCECA4, 0x8EF8AD83, 0x4680AC00,
-			0x40E5FBAB, 0xD89041C3, 0x6107FBD5,
-			0x6C859D41, 0xF0B26679, 0x09392549,
-			0x5FA25603, 0x65C892FD, 0x93CB6285,
-			0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD,
-			0x85254725, 0x15815AEB, 0x4AB6AAD6,
-			0x9CDAF8AF, 0xD6032C0A
+			0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, 0x35481EAE,
+			0x22512D5B, 0xE5D94E63, 0x7E624131, 0xF4CC12BE,
+			0xC2D0B696, 0x42AF2070, 0xD0720C35, 0x3361DA8C,
+			0x28CCECA4, 0x8EF8AD83, 0x4680AC00, 0x40E5FBAB,
+			0xD89041C3, 0x6107FBD5, 0x6C859D41, 0xF0B26679,
+			0x09392549, 0x5FA25603, 0x65C892FD, 0x93CB6285,
+			0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, 0x85254725,
+			0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A
 		};
 		x[0][0][0][0][0] ^= Hash[0];
 		x[0][0][0][0][1] ^= Hash[1];
 		x[0][0][0][1][0] ^= Hash[2];
@@ -230,7 +227,7 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g
 		x[0][0][1][1][1] ^= Hash[7];

 		rrounds(x);
-		x[0][0][0][0][0] ^= 0x80;
+		x[0][0][0][0][0] ^= 0x80U;
 		rrounds(x);

 		Final(x, Hash);
@@ -244,17 +241,12 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g
 __host__
-void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash)
+void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order)
 {
-	uint32_t tpb;
-	if (device_sm[device_map[thr_id]]<500)
-		tpb = 576;
-	else
-		tpb = 576;
+	uint32_t tpb = 576;

-	// compute how many thread blocks we need
 	dim3 grid((threads + tpb-1)/tpb);
 	dim3 block(tpb);

-	cubehash256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_hash);
+	cubehash256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_hash);
 }
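
Aside (not from the patch): the SWAP change above is purely cosmetic whitespace; the macro itself is the classic XOR swap, which exchanges two values without a temporary register. A standalone illustration; note the identity would zero both operands if the two arguments aliased the same variable, which rrounds() never does:

#include <stdio.h>
#include <stdint.h>

#define SWAP(a,b) { a ^= b; b ^= a; a ^= b; }

int main(void)
{
	uint32_t a = 0xEA2BD4B4, b = 0x35481EAE; /* two CubeHash IV words */
	SWAP(a, b);                              /* a^=b; b^=a; a^=b; */
	printf("%08X %08X\n", a, b);             /* prints 35481EAE EA2BD4B4 */
	return 0;
}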

ccminer.cpp (2 changed lines)

@@ -1457,11 +1457,11 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		case ALGO_FRESH:
 		case ALGO_FUGUE256:
 		case ALGO_GROESTL:
-		case ALGO_LYRA2v2:
 			diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
 			break;
 		case ALGO_KECCAK:
 		case ALGO_LYRA2:
+		case ALGO_LYRA2v2:
 			diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty));
 			break;
 		default:
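
This one-case move is the "diff fix" of the commit title. diff_to_target() derives a share target that is inversely proportional to the difficulty value it receives, so for a fixed stratum job.diff (writing target ~ K / d for some constant K):

  before: d = job.diff / 256.0  ->  target ~ 256 * K / job.diff
  after:  d = job.diff / 128.0  ->  target ~ 128 * K / job.diff

i.e. the lyra2v2 share target is halved, and a submitted share now represents twice as much work, using the same scaling as the keccak and lyra2 cases in the group it joins.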

lyra2/lyra2REv2.cu (14 changed lines)

@@ -21,6 +21,7 @@ extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNo
 extern void keccak256_cpu_init(int thr_id, uint32_t threads);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
+extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);

 extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* matrix);
@@ -29,8 +30,6 @@ extern void bmw256_setTarget(const void *ptarget);
 extern void bmw256_cpu_init(int thr_id, uint32_t threads);
 extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);

-extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash);
-
 void lyra2v2_hash(void *state, const void *input)
 {
 	uint32_t hashA[8], hashB[8];
@@ -117,16 +116,15 @@ extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id]);
+		cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++);

 		bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces);

 		if (foundNonces[0] != 0)
 		{
-			// CUDA_SAFE_CALL(cudaGetLastError());
 			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonces[0]);
@@ -134,16 +132,14 @@ extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget))
 			{
 				int res = 1;
-				// check if there was some other ones...
+				// check if there was another one...
 				*hashes_done = pdata[19] - first_nonce + throughput;
 				if (foundNonces[1] != 0)
 				{
 					pdata[21] = foundNonces[1];
 					res++;
-					if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, foundNonces[1], vhash64[7], Htarg);
 				}
 				pdata[19] = foundNonces[0];
-				if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, foundNonces[0], vhash64[7], Htarg);
 				MyStreamSynchronize(NULL, 0, device_map[thr_id]);
 				return res;
 			}
@@ -156,7 +152,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
 		pdata[19] += throughput;

-	} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));
+	} while (!work_restart[thr_id].restart && (max_nonce > ((uint64_t)(pdata[19]) + throughput)));

 	*hashes_done = pdata[19] - first_nonce + 1;
 	MyStreamSynchronize(NULL, 0, device_map[thr_id]);
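
For reference, the scanhash sequence above fixes the Lyra2REv2 chain order, with both CubeHash stages now threading the order counter like the other stages: Blake-256 (80-byte header) -> Keccak-256 -> CubeHash-256 -> Lyra2v2 -> Skein-256 -> CubeHash-256 -> BMW-256, where the final BMW kernel performs the on-GPU target test and reports up to two candidate nonces in foundNonces.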
