m7: forgot to indent some cuda files

and remove unused variables...
10 years ago · fcd381cda2
6 changed files with 679 additions and 725 deletions
--- a/m7/cuda_m7_sha256.cu
+++ b/m7/cuda_m7_sha256.cu
@ -1,10 +1,10 @@
 #include <stdio.h>
 #include <memory.h>
 #include "cuda_helper.h"
 #include "sph/sph_types.h"
 #include "cuda_helper.h"
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 //#define SPH_C64(x)    ((uint64_t)(x ## ULL))
@ -67,28 +67,35 @@ static const uint32_t cpu_K[64] = {
 };
-static __device__ __forceinline__ uint32_t bsg2_0(uint32_t x)
+__device__ __forceinline__
 static uint32_t bsg2_0(uint32_t x)
 {
 	// Combines SPH_ROTR32(x, n) for n = 2, 13 and 22 through xor3b and returns the result.
 	// These amounts are the ones used by the SHA-256 "big sigma 0" function.
 	// SPH_ROTR32 and xor3b are defined outside this diff (presumably a 32-bit
 	// rotate-right and a three-input XOR -- confirm in the helper headers).
 	uint32_t r1 = SPH_ROTR32(x,2);
 	uint32_t r2 = SPH_ROTR32(x,13);
 	uint32_t r3 = SPH_ROTR32(x,22);
 	return xor3b(r1,r2,r3);
 }
-static __device__ __forceinline__ uint32_t bsg2_1(uint32_t x)
+
 __device__ __forceinline__
 static uint32_t bsg2_1(uint32_t x)
 {
 	// Combines SPH_ROTR32(x, n) for n = 6, 11 and 25 through xor3b and returns the result.
 	// The host-side step functions in this file spell the same value out as
 	// ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25) and label it bsg2_1(e).
 	uint32_t r1 = SPH_ROTR32(x,6);
 	uint32_t r2 = SPH_ROTR32(x,11);
 	uint32_t r3 = SPH_ROTR32(x,25);
 	return xor3b(r1,r2,r3);
 }
-static __device__ __forceinline__ uint32_t ssg2_0(uint32_t x)
+
 __device__ __forceinline__
 static uint32_t ssg2_0(uint32_t x)
 {
 	// Returns xor3b of SPH_ROTR32(x, 7), SPH_ROTR32(x, 18) and shr_t32(x, 3).
 	// The host-side step function in this file spells the same value out as
 	// ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3) and labels it ssg2_0(inx3).
 	// The temporaries are 32-bit: the input and the return value are uint32_t, so the
 	// former uint64_t temporaries only added widening/narrowing conversions and were
 	// inconsistent with bsg2_0/bsg2_1. XOR is bitwise, so the result is unchanged.
 	uint32_t r1 = SPH_ROTR32(x,7);
 	uint32_t r2 = SPH_ROTR32(x,18);
 	uint32_t r3 = shr_t32(x,3);
 	return xor3b(r1,r2,r3);
 }
-static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x)
+
 __device__ __forceinline__
 static uint32_t ssg2_1(uint32_t x)
 {
 	uint64_t r1 = SPH_ROTR32(x,17);
 	uint64_t r2 = SPH_ROTR32(x,19);
@ -96,7 +103,8 @@ static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x)
 	return xor3b(r1,r2,r3);
 }
-static __device__ __forceinline__ void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
+__device__ __forceinline__
 static void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
                        uint32_t in,const uint32_t Kshared)
 {
 	uint32_t t1,t2;
@ -111,12 +119,10 @@ d = d + t1;
 	h = t1 + t2;
 }
-static __forceinline__ void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
+__host__ __forceinline__
 static void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
                            uint32_t in,const uint32_t Kshared)
 {
 	uint32_t t1,t2;
 	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
 	uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
@ -129,7 +135,8 @@ d = d + t1;
 	h = t1 + t2;
 }
-static __device__ __forceinline__ void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
+__device__ __forceinline__
 static void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
                        uint32_t* in,uint32_t pc,const uint32_t Kshared)
 {
 	uint32_t t1,t2;
@ -156,10 +163,10 @@ t1 = h + bsg21 + vxandx + Kshared + in[pc];
 	t2 = bsg20 + andorv;
 	d =  d + t1;
 	h = t1 + t2;
 }
-static __forceinline__ void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
+__host__ __forceinline__
 static void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
                            uint32_t* in,uint32_t pc,const uint32_t Kshared)
 {
 	uint32_t t1,t2;
@ -172,7 +179,6 @@ uint32_t inx1 = in[pcidx1];
 	uint32_t inx2 = in[pcidx2];
 	uint32_t inx3 = in[pcidx3];
 	uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
 	uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
 	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
@ -186,14 +192,12 @@ t1 = h + bsg21 + vxandx + Kshared + in[pc];
 	t2 = bsg20 + andorv;
 	d =  d + t1;
 	h = t1 + t2;
 }
-static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
+__device__ __forceinline__
 static void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
 {
 	uint32_t a=r[0];
 	uint32_t b=r[1];
 	uint32_t c=r[2];
@ -221,8 +225,8 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
 	sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
 	#pragma unroll 3
-		for (int i=0;i<3;i++) {
+	for (int i=0;i<3;i++)
-
+	{
 		sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
 		sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
 		sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
@ -239,11 +243,8 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
 		sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
 		sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
 		sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
 	}
 	r[0] = r[0] + a;
 	r[1] = r[1] + b;
 	r[2] = r[2] + c;
@ -254,10 +255,9 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
 	r[7] = r[7] + h;
 }
-static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
+__forceinline__
 static void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
 {
 	uint32_t a=r[0];
 	uint32_t b=r[1];
 	uint32_t c=r[2];
@ -285,8 +285,8 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
 	sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
 	#pragma unroll 3
-		for (int i=0;i<3;i++) {
+	for (int i=0;i<3;i++)
-
+	{
 		sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
 		sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
 		sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
@ -303,7 +303,6 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
 		sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
 		sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
 		sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
 	}
 	r[0] = r[0] + a;
@ -316,27 +315,12 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
 	r[7] = r[7] + h;
 }
-
+__global__
-__global__ void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
+void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 /*
 	__shared__ uint32_t Kshared[64];
 	if (threadIdx.x < 64) {
 		Kshared[threadIdx.x]=K[threadIdx.x];
 	}
 	__syncthreads();
 */
 union {
 uint8_t h1[64];
 uint32_t h4[16];
 uint64_t h8[8];
 } hash;
 //uint32_t buf[8];
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce +  thread ; // original implementation
 		uint32_t buf[8];
@ -344,56 +328,36 @@ uint64_t h8[8];
 		uint32_t in3[16]={0};
 		#pragma unroll 13
-		for (int i=0;i<13;i++) {in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);}
+		for (int i=0; i<13; i++)
 			in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);
 		in2[13]=cuda_swab32(nounce);
 		in2[14]=cuda_swab32(c_PaddedMessage80[30]);
 		in3[15]=0x3d0;
 		#pragma unroll 8
-		for (int i=0;i<8;i++) {buf[i]= pbuf[i];}
+		for (int i=0; i<8; i++)
 			buf[i] = pbuf[i];
 		sha2_round_body(in2,buf,K);
 		sha2_round_body(in3,buf,K);
 //#pragma unroll 8
 //for (int i=0;i<8;i++) {hash.h4[i]=cuda_swab32(buf[i]);}
 		#pragma unroll 4
-for (int i=0;i<4;i++) {outputHash[i*threads+thread]=cuda_swab32ll(((uint64_t*)buf)[i]);}
+		for (int i=0; i<4; i++) {
-
+			outputHash[i*threads+thread] = cuda_swab32ll(((uint64_t*)buf)[i]);
-
+		}
-//////////////////////////////////////////////////////////////////////////////////////////////////
+	} // thread
 	} // threads
 }
-
+__global__
-__global__ void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce)
+void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce)
 {
 /*
 	__shared__ uint32_t Kshared[64];
 	if (threadIdx.x < 64) {
 		Kshared[threadIdx.x]=K[threadIdx.x];
 	}
 	__syncthreads();
 */
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 union {
 uint8_t h1[304];
 uint32_t h4[76];
 uint64_t h8[38];
 } hash;
 		uint32_t in[16],buf[8];
 		#pragma unroll 8
 		for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);}
 		#pragma unroll 8
@ -415,20 +379,22 @@ uint64_t h8[38];
 		#pragma unroll 5
 		for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);}
 		((uint64_t*)in)[5] = g_hash1[threads*(5+32)+thread];
 		in[11]=0;
 		in[12]=0;
 		in[13]=0;
 		in[14]=0;
 		in[15]=0x968;
 		int it=0;
 		do {
 			in[15]-=8;
 			it++;
 		}  while (((uint8_t*)in)[44-it]==0);
 		((uint8_t*)in)[44-it+1]=0x80;
 		((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]);
@ -438,73 +404,62 @@ uint64_t h8[38];
 		uint32_t nounce = startNounce +thread;
 		bool rc = false;
 		#pragma unroll 4
 		for (int i = 0; i < 4; i++)
 		{
 			if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) {
-				if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i]) {rc = true;} else {rc =  false;}
+				if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i])
 					rc = true;
 				else
 					rc = false;
 				//if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;}
 			}
 		}
-
+		if (rc && resNounce[0] > nounce)
 		if(rc == true)
 		{
 			if(resNounce[0] > nounce)
 			resNounce[0] = nounce;
-
+	} // thread
 		}
 ////
 	} // threads
 }
-
+__host__
-
+void m7_sha256_cpu_init(int thr_id, int threads)
-__host__ void m7_sha256_cpu_init(int thr_id, int threads)
 {
 	// Host-side setup for the SHA-256 stage of one mining thread (thr_id):
 	//  - uploads the host tables cpu_H256 and cpu_K to the device symbols H256 and K,
 	//  - allocates one uint32_t of device memory in d_MNonce[thr_id],
 	//  - allocates one uint32_t of page-locked host memory in d_mnounce[thr_id].
 	// The `threads` parameter is not used in this body, and the CUDA return codes
 	// are not checked here.
 	// NOTE: the old one-line signature above is marked as removed so that the
 	// function header appears only once in the resulting code.
 	// Copy the hash tables into GPU memory
 	cudaMemcpyToSymbol(	H256,cpu_H256,sizeof(cpu_H256),0, cudaMemcpyHostToDevice );
 	cudaMemcpyToSymbol(	K,cpu_K,sizeof(cpu_K),0, cudaMemcpyHostToDevice );
 	cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t));
 	cudaMallocHost(&d_mnounce[thr_id], 1*sizeof(uint32_t));
 }
-
+__host__
-__host__  uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order)
+uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order)
 {
 	// Launches m7_sha256_gpu_hash_300 over `threads` work items and returns the
 	// 32-bit value the kernel left in d_MNonce[thr_id].
 	//  - d_MNonce[thr_id] is first filled with 0xff bytes, i.e. 0xffffffff.
 	//  - The grid is ceil(threads / threadsperblock) blocks of threadsperblock threads.
 	//  - The kernel only overwrites the word with a nonce smaller than the stored one,
 	//    so 0xffffffff is returned when no thread stored a nonce.
 	//  - The result is copied into the host buffer d_mnounce[thr_id], then
 	//    MyStreamSynchronize(NULL, order, thr_id) is called before reading it.
 	// CUDA return codes are not checked here.
 	// NOTE: the pre-change declaration of threadsperblock and the pre-change cudaMemset
 	// are marked as removed below, so each appears exactly once in the resulting code.
-
+	const int threadsperblock = 384;
 	uint32_t result = 0xffffffff;
-	cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));
-	const int threadsperblock = 384; // Alignment with mixtob size. DO NOT CHANGE
+	cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	size_t shared_size = 0;
 	m7_sha256_gpu_hash_300<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]);
 	cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	MyStreamSynchronize(NULL, order, thr_id);
 	result = *d_mnounce[thr_id];
 	return result;
 }
-
+__host__
-__host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	const int threadsperblock = 512;
 	const int threadsperblock = 512; // Alignment mit mixtob Grösse. NICHT ÄNDERN
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-//	dim3 grid(1);
+
 //	dim3 block(1);
 	size_t shared_size = 0;
 	m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
@ -512,7 +467,8 @@ __host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNoun
 	MyStreamSynchronize(NULL, order, thr_id);
 }
-__host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget)  //not useful
+__host__
 void m7_sha256_setBlock_120(void *pdata,const void *ptarget)  //not useful
 {
 	unsigned char PaddedMessage[128];
 	uint8_t ending =0x80;
--- a/m7/cuda_mul2.cu
+++ b/m7/cuda_mul2.cu
@ -85,7 +85,8 @@ ulonglong2 umul64wide (unsigned long long int a,
 }
-__device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n)
+__device__ __forceinline__
 void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n)
 {
 	asm ("{\n\t"
 		".reg .u32 o0, o1, o2, o3, o4;    \n\t"
@ -187,7 +188,8 @@ __device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n){
 #endif
-__device__ __forceinline__ t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){
+__device__ __forceinline__
 t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){
 	t4_t ret;
 	ret.high = g[(idx*2 + 1)*threads + thread];
 	ret.low = g[(idx*2)*threads + thread];
@ -199,7 +201,8 @@ __device__ __forceinline__ t4_t T4(uint32_t thread, uint32_t threads, uint32_t i
 	return ret;
 }
-__device__ __forceinline__ void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){
+__device__ __forceinline__
 void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){
 	g[(idx*2 + 1)*threads + thread]=val.high;
 	g[(idx*2)*threads + thread]=val.low;
@ -209,12 +212,14 @@ __device__ __forceinline__ void T4_store(uint32_t thread, uint32_t threads, uint
 }
-__device__ __forceinline__ void T4_set(t4_t *d, uint64_t v){
+__device__ __forceinline__
 void T4_set(t4_t *d, uint64_t v){
 	// Initialises *d from a 64-bit value: the `high` member is cleared and
 	// the `low` member receives v.
 	d->high = 0;
 	d->low = v;
 }
-__device__ __forceinline__ t4_t T4_add(t4_t a, t4_t b){
+__device__ __forceinline__
 t4_t T4_add(t4_t a, t4_t b){
 	t4_t ret;
 	uint32_t c=0;
 	ret.low = a.low + b.low;
@ -224,7 +229,8 @@ __device__ __forceinline__ t4_t T4_add(t4_t a, t4_t b){
 	return ret;
 }
-__device__ __forceinline__ t4_t T4_add(uint64_t a, t4_t b){
+__device__ __forceinline__
 t4_t T4_add(uint64_t a, t4_t b){
 	t4_t ret;
 	uint32_t c=0;
 	ret.low = a + b.low;
@ -234,8 +240,8 @@ __device__ __forceinline__ t4_t T4_add(uint64_t a, t4_t b){
 	return ret;
 }
-
+__device__ __forceinline__
-__device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){
+uint32_t T4_lt(t4_t a, t4_t b){
 	if(a.high < b.high)
 		return 1;
 	if(a.high == b.high && a.low < b.low)
@ -243,7 +249,8 @@ __device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){
 	return 0;
 }
-__device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){
+__device__ __forceinline__
 uint32_t T4_gt(t4_t a, uint64_t b){
 	if(a.high)
 		return 1;
 	if(a.low > b)
@ -252,7 +259,8 @@ __device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){
 }
-__device__ void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){
+__device__
 void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){
 	t4_t ul, cl, hpl, lpl;
 	uint32_t i;
 	T4_set(&cl,0);
@ -289,13 +297,14 @@ __device__ void mulScalar(uint32_t thread, uint32_t threads, uint32_t len, uint6
 	*size = len + (cl != 0);
 }
-uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a){
+uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a)
 {
 	uint64_t carry=0;
 	uint32_t i;
 	uint64_t ul,lpl,hpl,rl;
-	for(i=0; i < xsz; i++){
+	for(i=0; i < xsz; i++)
-
+	{
 		ul = x[i*threads + thread];
 		umul_ppmm (hpl, lpl, ul, a);
@ -311,13 +320,15 @@ uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum,
 	return carry;
 }
-t4_t __device__ addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a){
+__device__
 t4_t addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a)
 {
 	t4_t carry;
 	uint32_t i;
 	t4_t ul,lpl,hpl,rl;
 	T4_set(&carry,0);
-	for(i=0; i < xsz; i++){
+	for(i=0; i < xsz; i++)
-
+	{
 		ul = T4(thread,threads,i,x);
 		umul_ppmmT4 (&hpl, &lpl, ul, a);
@ -333,9 +344,8 @@ t4_t __device__ addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, u
 	return carry;
 }
-
+__global__
-
+void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
 __global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -354,7 +364,6 @@ __global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g
 		mulScalar(thread,threads,ulegs,g_p,g_u,g_v[thread],&psize);
 #if 1
 		while (vofst < vlegs) {
 		//clear high word //TODO: right
 	//  printf("Size: %d\n", rp->size[tid]);
@ -368,19 +377,17 @@ __global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g
 //  if(D_REF(rp->d,up->size[tid] + vp->size[tid] - 1,tid) != (uint64_t)0)
 //    rp->size[tid]++;
 #endif
 	}
 }
-__global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
+__global__
 void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-
+		if(ulegs < vlegs) {  // everything written the other way around... are you kidding me ?!
 	if(ulegs < vlegs){  ///everything written the other way around... are you kidding me ?!
 			uint64_t t1=ulegs;
 			ulegs = vlegs;
 			vlegs = t1;
@ -396,8 +403,6 @@ __global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t
 		//    cuPrintf("U: %d V: %d\n", ulegs, vlegs);
 		}
 		uint32_t vofst=1,rofst=1,psize=0;
 		mulScalarT4(thread,threads,ulegs,g_p,g_u,T4(thread,threads,0,g_v),&psize);
@ -405,21 +410,17 @@ __global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t
 		t4_t zero;
 		T4_set(&zero,0);
 //    while (vofst < vlegs) {
 		#pragma unroll
-	    for (vofst=1;vofst<vlegs;vofst++) {
+		for (vofst=1;vofst<vlegs;vofst++)
 		{
 			T4_store(thread,threads,psize,g_p,zero);
 			T4_store(thread,threads,ulegs+rofst,g_p, addmul_1gT4(thread, threads, g_p, rofst, g_u, ulegs, T4(thread,threads,vofst,g_v)));
 			// vofst++;
 			rofst++;
 			psize++;
 		}
 #endif
 	}
 }
@ -434,10 +435,9 @@ inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
 	}
 }
-
+__host__
-__host__ void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p,int order)
+void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p,int order)
 {
 	const int threadsperblock = 512; // Alignment mit mixtab Gr\F6sse. NICHT \C4NDERN
 	// berechne wie viele Thread Blocks wir brauchen
@ -445,13 +445,13 @@ __host__ void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, u
 	dim3 block(threadsperblock);
 	size_t shared_size = 0;
  	gpu_mul<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
 	gpu_mul<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
 }
-__host__ void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order)
+__host__
 void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order)
 {
 	const int threadsperblock = 256; // better occupancy (for both 780 and 750 ti's)
 	// berechne wie viele Thread Blocks wir brauchen
@ -459,10 +459,13 @@ __host__ void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs,
 	dim3 block(threadsperblock);
 	size_t shared_size = 0;
 	//gpu_mulT4<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
 	gpu_mulT4<<<grid, block, shared_size>>>(threads, blegs, alegs, g_b, g_a, g_p) ;
 }
-__host__ void mul_init(){
+__host__
 void mul_init()
 {
 	// Empty body: this function currently performs no work.
 }
--- a/m7/cuda_ripemd160.cu
+++ b/m7/cuda_ripemd160.cu
@ -282,14 +282,12 @@ static const uint32_t IV[5] = {
 		(h)[0] = tmp; \
 	}
-
+__global__
-__global__ void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
+void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce + thread;
 		union {
 			uint8_t h1[64];
@ -308,51 +306,52 @@ uint64_t h8[8];
 		#define F3(x, y, z)   xornot64(x,y,z)
 		#define F4(x, y, z)   xandx(z,x,y)
 		#define F5(x, y, z)   xornt64(x,y,z)
 		uint32_t in2[16],in3[16];
 		uint32_t in[16],buf[5];
 //	    #pragma unroll 16
 //		for (int i=0;i<16;i++) {in[i]= c_PaddedMessage80[i];}
 		#pragma unroll 16
-        for (int i=0;i<16;i++) {if ((i+16)<29)  {in2[i]= c_PaddedMessage80[i+16];}
+		for (int i=0;i<16;i++) {
-						   else if ((i+16)==29) {in2[i]= nounce;}
+			if ((i+16) < 29)
-						   else if ((i+16)==30) {in2[i]= c_PaddedMessage80[i+16];}
+				in2[i] = c_PaddedMessage80[i+16];
-						   else                 {in2[i]= 0;}}
+			else if ((i+16)==29)
 				in2[i] = nounce;
 			else if ((i+16)==30)
 				in2[i] = c_PaddedMessage80[i+16];
 			else
 				in2[i] = 0;
 		}
 		#pragma unroll 16
-		for (int i=0;i<16;i++) {in3[i]=0;}
+		for (int i=0;i<16;i++)
 			in3[i]=0;
 		in3[14]=0x3d0;
-//		#pragma unroll 5
+
 //		for (int i=0;i<5;i++) {buf[i]=gpu_IV[i];}
 		#pragma unroll 5
-		 for (int i=0;i<5;i++) {buf[i]=bufo[i];}
+		for (int i=0;i<5;i++)
-//		 RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved)
+		 	buf[i]=bufo[i];
 		RIPEMD160_ROUND_BODY(in2, buf);
 		RIPEMD160_ROUND_BODY(in3, buf);
 		hash.h4[5]=0;
 		#pragma unroll 5
 		for (int i=0; i<5; i++)
-{hash.h4[i]=buf[i];
+			hash.h4[i]=buf[i];
-}
+
 //uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
 //#pragma unroll 3
 //for (int i=0;i<3;i++) {outHash[i]=hash.h8[i];}
 		#pragma unroll 3
-for (int i=0;i<3;i++) {outputHash[i*threads+thread]=hash.h8[i];}
+		for (int i=0;i<3;i++) {
-//#pragma unroll 8
+			outputHash[i*threads+thread] = hash.h8[i];
-//for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=hash.h8[i];} else {outputHash[i*threads+thread]=0;}}
+		}
 	}
 }
 // Copies the host array IV (sizeof(IV) bytes) to the symbol gpu_IV with
 // cudaMemcpyToSymbol. Neither thr_id nor threads is used in this body, and the
 // CUDA return code is not checked.
 void ripemd160_cpu_init(int thr_id, int threads)
 {
 	cudaMemcpyToSymbol(gpu_IV,IV,sizeof(IV),0, cudaMemcpyHostToDevice);
 }
-__host__ void ripemd160_setBlock_120(void *pdata)
+__host__
 void ripemd160_setBlock_120(void *pdata)
 {
 	unsigned char PaddedMessage[128];
 	uint8_t ending =0x80;
@ -371,29 +370,28 @@ __host__ void ripemd160_setBlock_120(void *pdata)
 	#define F3(x, y, z)   (((x) | ~(y)) ^ (z))
 	#define F4(x, y, z)   ((((x) ^ (y)) & (z)) ^ (y))
 	#define F5(x, y, z)   ((x) ^ ((y) | ~(z)))
 	uint32_t* alt_data =(uint32_t*)pdata;
 	uint32_t in[16],buf[5];
 	for (int i=0;i<16;i++)
 		in[i]= alt_data[i];
-		for (int i=0;i<16;i++) {in[i]= alt_data[i];}
+	for (int i=0;i<5;i++)
-
+		buf[i]=IV[i];
 		for (int i=0;i<5;i++) {buf[i]=IV[i];}
 	RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved)
 	cudaMemcpyToSymbol(bufo, buf, 5*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 }
-__host__ void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+__host__
 void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
-
+	const int threadsperblock = 256;
 	const int threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-//dim3 grid(1);
+
 //dim3 block(1);
 	size_t shared_size =0;
 	m7_ripemd160_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
--- a/m7/cuda_tiger192.cu
+++ b/m7/cuda_tiger192.cu
@ -50,11 +50,13 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 // Three-word state written by tiger192_setBlock_120 after it runs TIGER_ROUND_BODY
 // on the first 8 input words; the kernel copies it into its local buf[] as the start state.
 __constant__ uint64_t bufo[3];
 // Device copy of III, uploaded by tiger192_cpu_init.
 static __constant__ uint64_t gpu_III[3];
 // Device copies of cpu_T1..cpu_T4, uploaded by tiger192_cpu_init; the kernel
 // stages all four tables into its __shared__ array before hashing.
 static __constant__ uint64_t T1[256];
 static __constant__ uint64_t T2[256];
 static __constant__ uint64_t T3[256];
 static __constant__ uint64_t T4[256];
 // Host-side initial three-word state; tiger192_setBlock_120 seeds buf[] from it.
 static const uint64_t III[3] = {
 	SPH_C64(0x0123456789ABCDEF),SPH_C64(0xFEDCBA9876543210),SPH_C64(0xF096A5B4C3B2E187)
 };
@ -649,29 +651,24 @@ static const uint64_t cpu_T4[256] = {
 		(r)[2] = SPH_T64(C + (r)[2]); \
 	}
-
+__global__
-__global__ void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
+void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	__shared__ uint64_t sharedMem[1024];
-	if(threadIdx.x < 256)
+
-	{
+	if(threadIdx.x < 256) {
 		sharedMem[threadIdx.x]      = T1[threadIdx.x];
 		sharedMem[threadIdx.x+256]  = T2[threadIdx.x];
 		sharedMem[threadIdx.x+512]  = T3[threadIdx.x];
 		sharedMem[threadIdx.x+768]  = T4[threadIdx.x];
 	}
 	__syncthreads();
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce + thread;
 union {
 uint8_t h1[64];
 uint32_t h4[16];
 uint64_t h8[8];
 } hash;
 /*
 #undef MUL5
 #undef MUL7
@ -691,8 +688,6 @@ uint64_t h8[8];
 		ROUND(b, c, a, X7, mul); \
 	}
 #define ROUND(a, b, c, x, mul) { \
 		c ^= x; \
 		a = SPH_T64(a - (sharedMem[c & 0xFF] ^ sharedMem[((c >> 16) & 0xFF)+256] \
@ -702,64 +697,72 @@ uint64_t h8[8];
 		b = mul(b); \
 	}
-
+		uint64_t in2[8];
 		uint64_t in[8],buf[3];
 		uint64_t in2[8],in3[8];
 		#pragma unroll 8
-		for (int i=0;i<8;i++) {in2[i]= c_PaddedMessage80[i+8];}
+		for (int i=0; i<8; i++)
 			in2[i] = c_PaddedMessage80[i+8];
 		uint32_t* Mess = (uint32_t*)in2;
 		Mess[13] = nounce;
 		uint64_t in3[8];
 		#pragma unroll 8
-		for (int i=0;i<8;i++) {in3[i]=0;}
+		for (int i=0; i<8; i++)
 			in3[i]=0;
 		in3[7]=0x3d0;
 		#pragma unroll 3
-		for (int i=0;i<3;i++) {buf[i]=bufo[i];}
+		uint64_t buf[3];
 		#pragma unroll 3
 		for (int i=0; i<3; i++)
 			buf[i]=bufo[i];
 		TIGER_ROUND_BODY(in2, buf);
 		TIGER_ROUND_BODY(in3, buf);
 		#pragma unroll 8
-for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=buf[i];} else {outputHash[i*threads+thread]=0;}}
+		for (int i=0;i<8;i++) {
- } //// threads
+			if (i<3) {
 				outputHash[i*threads+thread] = buf[i];
 			} else {
 				outputHash[i*threads+thread] = 0;
 			}
 		}
 	} // thread
 }
-
+__host__
 void tiger192_cpu_init(int thr_id, int threads)
 {
 	// Uploads the host-side tables to their device __constant__ counterparts:
 	// III -> gpu_III and cpu_T1..cpu_T4 -> T1..T4, each with its full sizeof().
 	// thr_id and threads are not used in this body, and the return codes of
 	// cudaMemcpyToSymbol are not checked.
 	cudaMemcpyToSymbol(gpu_III,III,sizeof(III),0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(T1,cpu_T1,sizeof(cpu_T1),0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(T2,cpu_T2,sizeof(cpu_T2),0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(T3,cpu_T3,sizeof(cpu_T3),0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(T4,cpu_T4,sizeof(cpu_T4),0, cudaMemcpyHostToDevice);
 }
-__host__ void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+__host__
 void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	// Launches m7_tiger192_gpu_hash_120 over `threads` work items, using
 	// ceil(threads / threadsperblock) blocks of threadsperblock threads and no
 	// dynamic shared memory, then calls MyStreamSynchronize(NULL, order, thr_id).
 	// The kernel fills its shared tables from the threads with threadIdx.x < 256,
 	// so the block size must stay at or above 256.
 	// NOTE: the pre-change declaration of threadsperblock is marked as removed
 	// below so that the constant is declared only once in the resulting code.
-
+	const int threadsperblock = 640; // 256
-	const int threadsperblock = 640; // Alignment with mixtab size. DO NOT CHANGE
 //	const int threadsperblock = 256;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-//dim3 grid(1);
+
 //dim3 block(1);
 	size_t shared_size = 0;
 	m7_tiger192_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
-
+__host__
-__host__ void tiger192_setBlock_120(void *pdata)
+void tiger192_setBlock_120(void *pdata)
 {
 	unsigned char PaddedMessage[128];
 	uint8_t ending =0x01;
 	memcpy(PaddedMessage, pdata, 122);
 	memset(PaddedMessage+122,ending,1);
 	memset(PaddedMessage+123, 0, 5); //useless
@ -782,14 +785,16 @@ __host__ void tiger192_setBlock_120(void *pdata)
 		b = mul(b); \
 	}
 	uint64_t* alt_data = (uint64_t*) pdata;
 	uint64_t in[8],buf[3];
 		for (int i=0;i<8;i++) {in[i]= alt_data[i];}
 		for (int i=0;i<3;i++) {buf[i]=III[i];}
-		 TIGER_ROUND_BODY(in, buf)
+	for (int i=0;i<8;i++)
-	cudaMemcpyToSymbol( bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+		in[i] = alt_data[i];
 	for (int i=0;i<3;i++)
 		buf[i] = III[i];
 	TIGER_ROUND_BODY(in, buf)
 	cudaMemcpyToSymbol(bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 }
--- a/m7/m7.cu
+++ b/m7/m7.cu
@ -22,7 +22,7 @@ extern "C"
 extern int device_map[8];
 extern bool opt_benchmark;
-static uint64_t *d_hash[8];
+//static uint64_t *d_hash[8];
 static uint64_t *FinalHash[8];
 static uint64_t *KeccakH[8];
 static uint64_t *WhirlpoolH[8];
@ -112,11 +112,9 @@ extern "C" void m7_hash(void *state, const void *input,uint32_t TheNonce, int de
 {
 	// sha256(sha256*sha512*keccak512*ripemd160*haval*tiger1*whirlpool)
 	char data_str[245], hash_str[65], target_str[65];
 	uint8_t *bdata = 0;
 	mpz_t bns[7];
 	mpz_t product;
 	int rc = 0;
 	for(int i=0; i < 7; i++) {
 		mpz_init(bns[i]);
@ -292,7 +290,6 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata,
 	tiger192_setBlock_120((void*)pdata);
 	cuda_check_cpu_setTarget(ptarget);
 	uint32_t TheNonce = pdata[29];
 	do {
 		int order = 0;
@ -328,7 +325,6 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata,
 		MyStreamSynchronize(0,order++,thr_id);
 		m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order);
 		MyStreamSynchronize(0,order++,thr_id);
 		foundNonce = m7_sha256_cpu_hash_300(thr_id, throughput, pdata[29], NULL, d_prod1[thr_id], order);
--- a/m7/m7_keccak512.cu
+++ b/m7/m7_keccak512.cu
@ -5,6 +5,8 @@
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 static __constant__ uint64_t stateo[25];
 static __constant__ uint64_t RC[24];
 static const uint64_t cpu_RC[24] = {
@ -22,7 +24,9 @@ static const uint64_t cpu_RC[24] = {
 	0x0000000080000001ull, 0x8000000080008008ull
 };
-static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) {
+__device__ __forceinline__
 static void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants)
 {
 	size_t i;
 	uint64_t t[5], u[5], v, w;
@ -136,8 +140,9 @@ static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t
 	}
 }
-
+__host__ __forceinline__
-static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants) {
+static void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants)
 {
 	size_t i;
 	uint64_t t[5], u[5], v, w;
@ -204,19 +209,12 @@ static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *kecca
 	}
 }
-
+__global__ /* __launch_bounds__(256, 2) */
-
+void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 __global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce + thread;
 		uint64_t state[25];
@ -237,38 +235,36 @@ __global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uin
 		keccak_block(state,RC);
 		#pragma unroll 8
-for (int i=0;i<8;i++) {outputHash[i*threads+thread]=state[i];}
+		for (int i=0;i<8;i++) {
-
+			outputHash[i*threads+thread] = state[i];
-
+		}
 	} //thread
 }
 // Copies the host round-constant table cpu_RC (sizeof(cpu_RC) bytes) to the
 // device symbol RC. Neither thr_id nor threads is used in this body, and the
 // CUDA return code is not checked.
 void m7_keccak512_cpu_init(int thr_id, int threads)
 {
 	cudaMemcpyToSymbol( RC,cpu_RC,sizeof(cpu_RC),0,cudaMemcpyHostToDevice);
 }
 // Prepares the per-block constants for the Keccak-512 kernel from the input at pdata.
 //  1. Builds a 128-byte padded message: the first 122 bytes of pdata, the byte 0x01
 //     at offset 122 and zeros in bytes 123..127, and uploads it to c_PaddedMessage80.
 //  2. Precomputes the state after the first block: words 0..8 of pdata (read as
 //     uint64_t) are loaded into state[0..8], state[9..24] are zero, keccak_block_host
 //     is applied with cpu_RC, and the 25-word result is uploaded to stateo.
 // pdata must therefore reference at least 122 readable bytes. CUDA return codes
 // are not checked here.
 __host__ void m7_keccak512_setBlock_120(void *pdata)
 {
 	unsigned char PaddedMessage[128];
 	uint8_t ending =0x01;
 	memcpy(PaddedMessage, pdata, 122);
 	memset(PaddedMessage+122,ending,1);
 	memset(PaddedMessage+123, 0, 5);
 	cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 	uint64_t* alt_data = (uint64_t*) pdata;
 	uint64_t state[25];
-		 for(int i=0;i<25;i++) {state[i]=0;}
+	for(int i=0;i<9;i++)
-
+		state[i] = alt_data[i];
-
 	// The zero fill starts at 9, not 10: the removed code cleared all 25 words before
 	// XORing in the first 9, so state[9] must be zero too rather than left uninitialised.
+	for(int i=9;i<25;i++)
-		for (int i=0;i<9;i++) {state[i]  ^= alt_data[i];}
+		state[i] = 0;
 	keccak_block_host(state,cpu_RC);
 	cudaMemcpyToSymbol(stateo, state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 }