|
|
@@ -1,10 +1,10 @@

#include <stdio.h>
#include <memory.h>

#include "sph/sph_types.h"
#include "cuda_helper.h"

extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

//#define SPH_C64(x) ((uint64_t)(x ## ULL))

@@ -67,28 +67,35 @@ static const uint32_t cpu_K[64] = {
};
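
// SHA-256 sigma helpers. bsg2_0/bsg2_1 are the "big sigma" functions applied to the
// working variables; ssg2_0/ssg2_1 are the "small sigma" functions of the message
// schedule. SPH_ROTR32, xor3b, shr_t32, xandx and andor32 are presumably provided by
// sph_types.h / cuda_helper.h.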
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__
static uint32_t bsg2_0(uint32_t x)
{
    uint32_t r1 = SPH_ROTR32(x,2);
    uint32_t r2 = SPH_ROTR32(x,13);
    uint32_t r3 = SPH_ROTR32(x,22);
    return xor3b(r1,r2,r3);
}

__device__ __forceinline__
static uint32_t bsg2_1(uint32_t x)
{
    uint32_t r1 = SPH_ROTR32(x,6);
    uint32_t r2 = SPH_ROTR32(x,11);
    uint32_t r3 = SPH_ROTR32(x,25);
    return xor3b(r1,r2,r3);
}
|
|
|
__device__ __forceinline__
static uint32_t ssg2_0(uint32_t x)
{
    uint32_t r1 = SPH_ROTR32(x,7);
    uint32_t r2 = SPH_ROTR32(x,18);
    uint32_t r3 = shr_t32(x,3);
    return xor3b(r1,r2,r3);
}

__device__ __forceinline__
static uint32_t ssg2_1(uint32_t x)
{
    uint32_t r1 = SPH_ROTR32(x,17);
    uint32_t r2 = SPH_ROTR32(x,19);
    uint32_t r3 = shr_t32(x,10);
    return xor3b(r1,r2,r3);
}
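
// One SHA-256 round for rounds 0..15, where the message word `in` comes straight from
// the input block; sha2_step1_host below is the plain C twin used on the host side.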
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__
static void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
    uint32_t in,const uint32_t Kshared)
{
    uint32_t t1,t2;
    uint32_t vxandx = xandx(e, f, g);
    uint32_t bsg21 = bsg2_1(e);
    uint32_t bsg20 = bsg2_0(a);
    uint32_t andorv = andor32(a,b,c);

    t1 = h + bsg21 + vxandx + Kshared + in;
    t2 = bsg20 + andorv;
    d = d + t1;
    h = t1 + t2;
}
|
|
|
|
|
|
|
|
|
|
|
__host__ __forceinline__
static void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
    uint32_t in,const uint32_t Kshared)
{
    uint32_t t1,t2;
    uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
    uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
    uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); // bsg2_0(a);
    uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); // andor32(a,b,c);

    t1 = h + bsg21 + vxandx + Kshared + in;
    t2 = bsg20 + andorv;
    d = d + t1;
    h = t1 + t2;
}
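
// One SHA-256 round for rounds 16..63: extends the 16-word message schedule in place
// (in[pc] = ssg2_1(w[t-2]) + w[t-7] + ssg2_0(w[t-15]) + w[t-16]) and then performs the
// round; sha2_step2_host below mirrors it on the host side.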
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__
static void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
    uint32_t* in,uint32_t pc,const uint32_t Kshared)
{
    uint32_t t1,t2;

    int pcidx1 = (pc-2) & 0xF;
    int pcidx2 = (pc-7) & 0xF;
    int pcidx3 = (pc-15) & 0xF;

    uint32_t inx0 = in[pc];
    uint32_t inx1 = in[pcidx1];
    uint32_t inx2 = in[pcidx2];
    uint32_t inx3 = in[pcidx3];

    uint32_t ssg21 = ssg2_1(inx1);
    uint32_t ssg20 = ssg2_0(inx3);
    uint32_t vxandx = xandx(e, f, g);
    uint32_t bsg21 = bsg2_1(e);
    uint32_t bsg20 = bsg2_0(a);
    uint32_t andorv = andor32(a,b,c);

    in[pc] = ssg21+inx2+ssg20+inx0;

    t1 = h + bsg21 + vxandx + Kshared + in[pc];
    t2 = bsg20 + andorv;
    d = d + t1;
    h = t1 + t2;
}
|
|
|
|
|
|
|
|
|
|
|
__host__ __forceinline__
static void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
    uint32_t* in,uint32_t pc,const uint32_t Kshared)
{
    uint32_t t1,t2;

    int pcidx1 = (pc-2) & 0xF;
    int pcidx2 = (pc-7) & 0xF;
    int pcidx3 = (pc-15) & 0xF;

    uint32_t inx0 = in[pc];
    uint32_t inx1 = in[pcidx1];
    uint32_t inx2 = in[pcidx2];
    uint32_t inx3 = in[pcidx3];

    uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); // ssg2_1(inx1);
    uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); // ssg2_0(inx3);
    uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
    uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
    uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); // bsg2_0(a);
    uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); // andor32(a,b,c);

    in[pc] = ssg21+inx2+ssg20+inx0;

    t1 = h + bsg21 + vxandx + Kshared + in[pc];
    t2 = bsg20 + andorv;
    d = d + t1;
    h = t1 + t2;
}
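
// Full 64-round SHA-256 compression of one 16-word block `in` into the chaining value
// r[0..7]: rounds 0..15 via sha2_step1, then three unrolled passes of 16 rounds via
// sha2_step2. sha2_round_body_host is the host-side equivalent used for precomputation.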
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__
static void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
{
    uint32_t a=r[0];
    uint32_t b=r[1];
    uint32_t c=r[2];
    uint32_t d=r[3];
    uint32_t e=r[4];
    uint32_t f=r[5];
    uint32_t g=r[6];
    uint32_t h=r[7];

    sha2_step1(a,b,c,d,e,f,g,h,in[0],Kshared[0]);
    sha2_step1(h,a,b,c,d,e,f,g,in[1],Kshared[1]);
    sha2_step1(g,h,a,b,c,d,e,f,in[2],Kshared[2]);
    sha2_step1(f,g,h,a,b,c,d,e,in[3],Kshared[3]);
    sha2_step1(e,f,g,h,a,b,c,d,in[4],Kshared[4]);
    sha2_step1(d,e,f,g,h,a,b,c,in[5],Kshared[5]);
    sha2_step1(c,d,e,f,g,h,a,b,in[6],Kshared[6]);
    sha2_step1(b,c,d,e,f,g,h,a,in[7],Kshared[7]);
    sha2_step1(a,b,c,d,e,f,g,h,in[8],Kshared[8]);
    sha2_step1(h,a,b,c,d,e,f,g,in[9],Kshared[9]);
    sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
    sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
    sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
    sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
    sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
    sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);

    #pragma unroll 3
    for (int i=0;i<3;i++) {
        sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
        sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
        sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
        sha2_step2(f,g,h,a,b,c,d,e,in,3,Kshared[19+16*i]);
        sha2_step2(e,f,g,h,a,b,c,d,in,4,Kshared[20+16*i]);
        sha2_step2(d,e,f,g,h,a,b,c,in,5,Kshared[21+16*i]);
        sha2_step2(c,d,e,f,g,h,a,b,in,6,Kshared[22+16*i]);
        sha2_step2(b,c,d,e,f,g,h,a,in,7,Kshared[23+16*i]);
        sha2_step2(a,b,c,d,e,f,g,h,in,8,Kshared[24+16*i]);
        sha2_step2(h,a,b,c,d,e,f,g,in,9,Kshared[25+16*i]);
        sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
        sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
        sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
        sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
        sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
        sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
    }

    r[0] = r[0] + a;
    r[1] = r[1] + b;
    r[2] = r[2] + c;
    r[3] = r[3] + d;
    r[4] = r[4] + e;
    r[5] = r[5] + f;
    r[6] = r[6] + g;
    r[7] = r[7] + h;
}
|
|
|
|
|
|
|
|
|
|
|
__forceinline__
static void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
{
    uint32_t a=r[0];
    uint32_t b=r[1];
    uint32_t c=r[2];
    uint32_t d=r[3];
    uint32_t e=r[4];
    uint32_t f=r[5];
    uint32_t g=r[6];
    uint32_t h=r[7];

    sha2_step1_host(a,b,c,d,e,f,g,h,in[0],Kshared[0]);
    sha2_step1_host(h,a,b,c,d,e,f,g,in[1],Kshared[1]);
    sha2_step1_host(g,h,a,b,c,d,e,f,in[2],Kshared[2]);
    sha2_step1_host(f,g,h,a,b,c,d,e,in[3],Kshared[3]);
    sha2_step1_host(e,f,g,h,a,b,c,d,in[4],Kshared[4]);
    sha2_step1_host(d,e,f,g,h,a,b,c,in[5],Kshared[5]);
    sha2_step1_host(c,d,e,f,g,h,a,b,in[6],Kshared[6]);
    sha2_step1_host(b,c,d,e,f,g,h,a,in[7],Kshared[7]);
    sha2_step1_host(a,b,c,d,e,f,g,h,in[8],Kshared[8]);
    sha2_step1_host(h,a,b,c,d,e,f,g,in[9],Kshared[9]);
    sha2_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
    sha2_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
    sha2_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
    sha2_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
    sha2_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
    sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]);

    #pragma unroll 3
    for (int i=0;i<3;i++) {
        sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
        sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
        sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
        sha2_step2_host(f,g,h,a,b,c,d,e,in,3,Kshared[19+16*i]);
        sha2_step2_host(e,f,g,h,a,b,c,d,in,4,Kshared[20+16*i]);
        sha2_step2_host(d,e,f,g,h,a,b,c,in,5,Kshared[21+16*i]);
        sha2_step2_host(c,d,e,f,g,h,a,b,in,6,Kshared[22+16*i]);
        sha2_step2_host(b,c,d,e,f,g,h,a,in,7,Kshared[23+16*i]);
        sha2_step2_host(a,b,c,d,e,f,g,h,in,8,Kshared[24+16*i]);
        sha2_step2_host(h,a,b,c,d,e,f,g,in,9,Kshared[25+16*i]);
        sha2_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
        sha2_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
        sha2_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
        sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
        sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
        sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
    }

    r[0] = r[0] + a;
    r[1] = r[1] + b;
    r[2] = r[2] + c;
    r[3] = r[3] + d;
    r[4] = r[4] + e;
    r[5] = r[5] + f;
    r[6] = r[6] + g;
    r[7] = r[7] + h;
}
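
// Kernel for the first SHA-256 of the M7 search. The host precomputes the midstate of
// the first 64-byte block into pbuf (see m7_sha256_setBlock_120); each thread patches
// its nonce into word 13 of the second block and finishes the hash. The 0x3d0 length
// field (976 bits) suggests a 122-byte message padded over three blocks.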
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__global__
void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
    /*
    __shared__ uint32_t Kshared[64];
    if (threadIdx.x < 64) {
        Kshared[threadIdx.x] = K[threadIdx.x];
    }
    __syncthreads();
    */
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        uint32_t nounce = startNounce + thread; // original implementation
        uint32_t buf[8];
        uint32_t in2[16]={0};
        uint32_t in3[16]={0};

        #pragma unroll 13
        for (int i=0; i<13; i++)
            in2[i] = cuda_swab32(c_PaddedMessage80[i+16]);

        in2[13] = cuda_swab32(nounce);
        in2[14] = cuda_swab32(c_PaddedMessage80[30]);

        in3[15] = 0x3d0;

        #pragma unroll 8
        for (int i=0; i<8; i++)
            buf[i] = pbuf[i];

        sha2_round_body(in2,buf,K);
        sha2_round_body(in3,buf,K);

        #pragma unroll 4
        for (int i=0; i<4; i++) {
            outputHash[i*threads+thread] = cuda_swab32ll(((uint64_t*)buf)[i]);
        }
    } // thread
}
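
// Kernel for the final SHA-256 over the ~300-byte concatenation of partial hashes held
// in g_hash1. The last partial block is padded in place (length 0x968 = 2408 bits,
// apparently reduced by 8 bits per trailing zero byte before the 0x80 marker is set),
// and a nonce whose digest compares below pTarget is recorded in resNounce[0],
// keeping the smaller of competing candidates.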
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__global__
void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce)
{
    /*
    __shared__ uint32_t Kshared[64];
    if (threadIdx.x < 64) {
        Kshared[threadIdx.x] = K[threadIdx.x];
    }
    __syncthreads();
    */
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        uint32_t in[16],buf[8];

        #pragma unroll 8
        for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);}
        #pragma unroll 8
        for (int i=0;i<8;i++) {buf[i] = H256[i];}

        sha2_round_body(in,buf,K);

@@ -415,96 +379,87 @@

        #pragma unroll 5
        for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);}
        ((uint64_t*)in)[5] = g_hash1[threads*(5+32)+thread];
        in[11]=0;
        in[12]=0;
        in[13]=0;
        in[14]=0;
        in[15]=0x968;

        int it=0;
        do {
            in[15]-=8;
            it++;
        } while (((uint8_t*)in)[44-it]==0);
        ((uint8_t*)in)[44-it+1]=0x80;

        ((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]);

        sha2_round_body(in,buf,K);

        uint32_t nounce = startNounce + thread;
        bool rc = false;

        #pragma unroll 4
        for (int i = 0; i < 4; i++)
        {
            if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) {
                if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i])
                    rc = true;
                else
                    rc = false;
                //if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;}
            }
        }

        if (rc && resNounce[0] > nounce)
            resNounce[0] = nounce;
    } // thread
}
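
// Host-side initialization: copies the SHA-256 constants (H256, K) into device constant
// memory and allocates the per-GPU result-nonce buffers.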
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__host__
void m7_sha256_cpu_init(int thr_id, int threads)
{
    cudaMemcpyToSymbol( H256, cpu_H256, sizeof(cpu_H256), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice );
    cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t));
    cudaMallocHost(&d_mnounce[thr_id], 1*sizeof(uint32_t));
}
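
// Host wrapper: resets the result nonce to 0xffffffff, launches m7_sha256_gpu_hash_300
// and returns the nonce the kernel found (0xffffffff if none matched the target).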
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__host__
uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector, uint64_t *d_hash, int order)
{
    const int threadsperblock = 384;
    uint32_t result = 0xffffffff;

    cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));

    dim3 grid((threads + threadsperblock-1)/threadsperblock);
    dim3 block(threadsperblock);
    size_t shared_size = 0;

    m7_sha256_gpu_hash_300<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]);

    cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
    MyStreamSynchronize(NULL, order, thr_id);
    result = *d_mnounce[thr_id];
    return result;
}
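
// Host wrapper: launches m7_sha256_gpu_hash_120 over `threads` nonces starting at
// startNounce, writing each 32-byte digest into d_outputHash in strided layout.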
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__host__
void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
    const int threadsperblock = 512;

    dim3 grid((threads + threadsperblock-1)/threadsperblock);
    dim3 block(threadsperblock);
    size_t shared_size = 0;

    m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);

    MyStreamSynchronize(NULL, order, thr_id);
}
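
// Host-side block setup: hashes the first 64-byte block of the padded header with
// sha2_round_body_host and uploads the resulting midstate to the device symbol pbuf.
// alt_data, c_PaddedMessage80 and pTarget are presumably prepared in the elided part
// of this function.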
|
|
|
|
|
|
|
|
|
|
|
__host__
void m7_sha256_setBlock_120(void *pdata, const void *ptarget) //not useful
{
    unsigned char PaddedMessage[128];
    uint8_t ending = 0x80;

@@ -527,6 +483,6 @@

    uint32_t in[16],buf[8];
    for (int i=0;i<16;i++) {in[i]= host_swab32(alt_data[i]);}
    for (int i=0;i<8;i++) {buf[i]= cpu_H256[i];}
    sha2_round_body_host(in,buf,cpu_K);
    cudaMemcpyToSymbol( pbuf, buf, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
}
|
|
|