diff --git a/m7/cuda_m7_sha256.cu b/m7/cuda_m7_sha256.cu index 46c026a..f151d93 100644 --- a/m7/cuda_m7_sha256.cu +++ b/m7/cuda_m7_sha256.cu @@ -1,10 +1,10 @@ #include #include -#include "cuda_helper.h" - #include "sph/sph_types.h" +#include "cuda_helper.h" + extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); //#define SPH_C64(x) ((uint64_t)(x ## ULL)) @@ -67,28 +67,35 @@ static const uint32_t cpu_K[64] = { }; -static __device__ __forceinline__ uint32_t bsg2_0(uint32_t x) +__device__ __forceinline__ +static uint32_t bsg2_0(uint32_t x) { uint32_t r1 = SPH_ROTR32(x,2); uint32_t r2 = SPH_ROTR32(x,13); uint32_t r3 = SPH_ROTR32(x,22); return xor3b(r1,r2,r3); } -static __device__ __forceinline__ uint32_t bsg2_1(uint32_t x) + +__device__ __forceinline__ +static uint32_t bsg2_1(uint32_t x) { uint32_t r1 = SPH_ROTR32(x,6); uint32_t r2 = SPH_ROTR32(x,11); uint32_t r3 = SPH_ROTR32(x,25); return xor3b(r1,r2,r3); } -static __device__ __forceinline__ uint32_t ssg2_0(uint32_t x) + +__device__ __forceinline__ +static uint32_t ssg2_0(uint32_t x) { uint64_t r1 = SPH_ROTR32(x,7); uint64_t r2 = SPH_ROTR32(x,18); uint64_t r3 = shr_t32(x,3); return xor3b(r1,r2,r3); } -static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x) + +__device__ __forceinline__ +static uint32_t ssg2_1(uint32_t x) { uint64_t r1 = SPH_ROTR32(x,17); uint64_t r2 = SPH_ROTR32(x,19); @@ -96,133 +103,130 @@ static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x) return xor3b(r1,r2,r3); } -static __device__ __forceinline__ void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, - uint32_t in,const uint32_t Kshared) +__device__ __forceinline__ +static void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, + uint32_t in,const uint32_t Kshared) { -uint32_t t1,t2; -uint32_t vxandx = xandx(e, f, g); -uint32_t bsg21 =bsg2_1(e); -uint32_t bsg20 =bsg2_0(a); -uint32_t andorv =andor32(a,b,c); - -t1 = h + bsg21 + vxandx + Kshared + in; -t2 = bsg20 + andorv; -d = d + t1; -h = t1 + t2; + uint32_t t1,t2; + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 =bsg2_1(e); + uint32_t bsg20 =bsg2_0(a); + uint32_t andorv =andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; } -static __forceinline__ void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, - uint32_t in,const uint32_t Kshared) +__host__ __forceinline__ +static void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, + uint32_t in,const uint32_t Kshared) { - - - -uint32_t t1,t2; -uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); -uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); -uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); -uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); - -t1 = h + bsg21 + vxandx + Kshared + in; -t2 = bsg20 + andorv; -d = d + t1; -h = t1 + t2; + uint32_t t1,t2; + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; } -static __device__ __forceinline__ void 
sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, - uint32_t* in,uint32_t pc,const uint32_t Kshared) +__device__ __forceinline__ +static void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, + uint32_t* in,uint32_t pc,const uint32_t Kshared) { -uint32_t t1,t2; - -int pcidx1 = (pc-2) & 0xF; -int pcidx2 = (pc-7) & 0xF; -int pcidx3 = (pc-15) & 0xF; -uint32_t inx0 = in[pc]; -uint32_t inx1 = in[pcidx1]; -uint32_t inx2 = in[pcidx2]; -uint32_t inx3 = in[pcidx3]; - - -uint32_t ssg21 = ssg2_1(inx1); -uint32_t ssg20 = ssg2_0(inx3); -uint32_t vxandx = xandx(e, f, g); -uint32_t bsg21 =bsg2_1(e); -uint32_t bsg20 =bsg2_0(a); -uint32_t andorv =andor32(a,b,c); - -in[pc] = ssg21+inx2+ssg20+inx0; - -t1 = h + bsg21 + vxandx + Kshared + in[pc]; -t2 = bsg20 + andorv; -d = d + t1; -h = t1 + t2; - + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + + uint32_t ssg21 = ssg2_1(inx1); + uint32_t ssg20 = ssg2_0(inx3); + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 =bsg2_1(e); + uint32_t bsg20 =bsg2_0(a); + uint32_t andorv =andor32(a,b,c); + + in[pc] = ssg21+inx2+ssg20+inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; } -static __forceinline__ void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, - uint32_t* in,uint32_t pc,const uint32_t Kshared) +__host__ __forceinline__ +static void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, + uint32_t* in,uint32_t pc,const uint32_t Kshared) { -uint32_t t1,t2; - -int pcidx1 = (pc-2) & 0xF; -int pcidx2 = (pc-7) & 0xF; -int pcidx3 = (pc-15) & 0xF; -uint32_t inx0 = in[pc]; -uint32_t inx1 = in[pcidx1]; -uint32_t inx2 = in[pcidx2]; -uint32_t inx3 = in[pcidx3]; - - -uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); -uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); -uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); -uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); -uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); -uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); - -in[pc] = ssg21+inx2+ssg20+inx0; - -t1 = h + bsg21 + vxandx + Kshared + in[pc]; -t2 = bsg20 + andorv; -d = d + t1; -h = t1 + t2; - + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + in[pc] = ssg21+inx2+ssg20+inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; } -static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* 
r,const uint32_t* Kshared) +__device__ __forceinline__ +static void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared) { - - - uint32_t a=r[0]; - uint32_t b=r[1]; - uint32_t c=r[2]; - uint32_t d=r[3]; - uint32_t e=r[4]; - uint32_t f=r[5]; - uint32_t g=r[6]; - uint32_t h=r[7]; - - sha2_step1(a,b,c,d,e,f,g,h,in[0],Kshared[0]); - sha2_step1(h,a,b,c,d,e,f,g,in[1],Kshared[1]); - sha2_step1(g,h,a,b,c,d,e,f,in[2],Kshared[2]); - sha2_step1(f,g,h,a,b,c,d,e,in[3],Kshared[3]); - sha2_step1(e,f,g,h,a,b,c,d,in[4],Kshared[4]); - sha2_step1(d,e,f,g,h,a,b,c,in[5],Kshared[5]); - sha2_step1(c,d,e,f,g,h,a,b,in[6],Kshared[6]); - sha2_step1(b,c,d,e,f,g,h,a,in[7],Kshared[7]); - sha2_step1(a,b,c,d,e,f,g,h,in[8],Kshared[8]); - sha2_step1(h,a,b,c,d,e,f,g,in[9],Kshared[9]); - sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]); - sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]); - sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]); - sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]); - sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]); - sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]); - -#pragma unroll 3 - for (int i=0;i<3;i++) { - + uint32_t a=r[0]; + uint32_t b=r[1]; + uint32_t c=r[2]; + uint32_t d=r[3]; + uint32_t e=r[4]; + uint32_t f=r[5]; + uint32_t g=r[6]; + uint32_t h=r[7]; + + sha2_step1(a,b,c,d,e,f,g,h,in[0],Kshared[0]); + sha2_step1(h,a,b,c,d,e,f,g,in[1],Kshared[1]); + sha2_step1(g,h,a,b,c,d,e,f,in[2],Kshared[2]); + sha2_step1(f,g,h,a,b,c,d,e,in[3],Kshared[3]); + sha2_step1(e,f,g,h,a,b,c,d,in[4],Kshared[4]); + sha2_step1(d,e,f,g,h,a,b,c,in[5],Kshared[5]); + sha2_step1(c,d,e,f,g,h,a,b,in[6],Kshared[6]); + sha2_step1(b,c,d,e,f,g,h,a,in[7],Kshared[7]); + sha2_step1(a,b,c,d,e,f,g,h,in[8],Kshared[8]); + sha2_step1(h,a,b,c,d,e,f,g,in[9],Kshared[9]); + sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]); + sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + #pragma unroll 3 + for (int i=0;i<3;i++) + { sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); @@ -239,54 +243,50 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } - } - - - - r[0] = r[0] + a; - r[1] = r[1] + b; - r[2] = r[2] + c; - r[3] = r[3] + d; - r[4] = r[4] + e; - r[5] = r[5] + f; - r[6] = r[6] + g; - r[7] = r[7] + h; + r[0] = r[0] + a; + r[1] = r[1] + b; + r[2] = r[2] + c; + r[3] = r[3] + d; + r[4] = r[4] + e; + r[5] = r[5] + f; + r[6] = r[6] + g; + r[7] = r[7] + h; } -static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared) +__forceinline__ +static void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared) { - - - uint32_t a=r[0]; - uint32_t b=r[1]; - uint32_t c=r[2]; - uint32_t d=r[3]; - uint32_t e=r[4]; - uint32_t f=r[5]; - uint32_t g=r[6]; - uint32_t h=r[7]; - - sha2_step1_host(a,b,c,d,e,f,g,h,in[0],Kshared[0]); - sha2_step1_host(h,a,b,c,d,e,f,g,in[1],Kshared[1]); - sha2_step1_host(g,h,a,b,c,d,e,f,in[2],Kshared[2]); - sha2_step1_host(f,g,h,a,b,c,d,e,in[3],Kshared[3]); - sha2_step1_host(e,f,g,h,a,b,c,d,in[4],Kshared[4]); - sha2_step1_host(d,e,f,g,h,a,b,c,in[5],Kshared[5]); - 
sha2_step1_host(c,d,e,f,g,h,a,b,in[6],Kshared[6]); - sha2_step1_host(b,c,d,e,f,g,h,a,in[7],Kshared[7]); - sha2_step1_host(a,b,c,d,e,f,g,h,in[8],Kshared[8]); - sha2_step1_host(h,a,b,c,d,e,f,g,in[9],Kshared[9]); - sha2_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]); - sha2_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]); - sha2_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]); - sha2_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]); - sha2_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]); - sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]); - -#pragma unroll 3 - for (int i=0;i<3;i++) { - + uint32_t a=r[0]; + uint32_t b=r[1]; + uint32_t c=r[2]; + uint32_t d=r[3]; + uint32_t e=r[4]; + uint32_t f=r[5]; + uint32_t g=r[6]; + uint32_t h=r[7]; + + sha2_step1_host(a,b,c,d,e,f,g,h,in[0],Kshared[0]); + sha2_step1_host(h,a,b,c,d,e,f,g,in[1],Kshared[1]); + sha2_step1_host(g,h,a,b,c,d,e,f,in[2],Kshared[2]); + sha2_step1_host(f,g,h,a,b,c,d,e,in[3],Kshared[3]); + sha2_step1_host(e,f,g,h,a,b,c,d,in[4],Kshared[4]); + sha2_step1_host(d,e,f,g,h,a,b,c,in[5],Kshared[5]); + sha2_step1_host(c,d,e,f,g,h,a,b,in[6],Kshared[6]); + sha2_step1_host(b,c,d,e,f,g,h,a,in[7],Kshared[7]); + sha2_step1_host(a,b,c,d,e,f,g,h,in[8],Kshared[8]); + sha2_step1_host(h,a,b,c,d,e,f,g,in[9],Kshared[9]); + sha2_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]); + sha2_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + sha2_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha2_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha2_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + #pragma unroll 3 + for (int i=0;i<3;i++) + { sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); @@ -303,100 +303,64 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } - } - - r[0] = r[0] + a; - r[1] = r[1] + b; - r[2] = r[2] + c; - r[3] = r[3] + d; - r[4] = r[4] + e; - r[5] = r[5] + f; - r[6] = r[6] + g; - r[7] = r[7] + h; + r[0] = r[0] + a; + r[1] = r[1] + b; + r[2] = r[2] + c; + r[3] = r[3] + d; + r[4] = r[4] + e; + r[5] = r[5] + f; + r[6] = r[6] + g; + r[7] = r[7] + h; } - -__global__ void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) +__global__ +void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) { -/* - __shared__ uint32_t Kshared[64]; - if (threadIdx.x < 64) { - Kshared[threadIdx.x]=K[threadIdx.x]; - } - __syncthreads(); -*/ -union { -uint8_t h1[64]; -uint32_t h4[16]; -uint64_t h8[8]; -} hash; -//uint32_t buf[8]; - - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { uint32_t nounce = startNounce + thread ; // original implementation - uint32_t buf[8]; + uint32_t buf[8]; uint32_t in2[16]={0}; uint32_t in3[16]={0}; - #pragma unroll 13 - for (int i=0;i<13;i++) {in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);} + #pragma unroll 13 + for (int i=0; i<13; i++) + in2[i]= cuda_swab32(c_PaddedMessage80[i+16]); + in2[13]=cuda_swab32(nounce); in2[14]=cuda_swab32(c_PaddedMessage80[30]); - in3[15]=0x3d0; - - #pragma unroll 8 - for (int i=0;i<8;i++) {buf[i]= pbuf[i];} - - sha2_round_body(in2,buf,K); - 
sha2_round_body(in3,buf,K); -//#pragma unroll 8 -//for (int i=0;i<8;i++) {hash.h4[i]=cuda_swab32(buf[i]);} - -#pragma unroll 4 -for (int i=0;i<4;i++) {outputHash[i*threads+thread]=cuda_swab32ll(((uint64_t*)buf)[i]);} + in3[15]=0x3d0; + #pragma unroll 8 + for (int i=0; i<8; i++) + buf[i] = pbuf[i]; -////////////////////////////////////////////////////////////////////////////////////////////////// - } // threads + sha2_round_body(in2,buf,K); + sha2_round_body(in3,buf,K); + #pragma unroll 4 + for (int i=0; i<4; i++) { + outputHash[i*threads+thread] = cuda_swab32ll(((uint64_t*)buf)[i]); + } + } // thread } - -__global__ void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce) +__global__ +void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce) { -/* - __shared__ uint32_t Kshared[64]; - if (threadIdx.x < 64) { - Kshared[threadIdx.x]=K[threadIdx.x]; - } - __syncthreads(); -*/ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - - - - -union { -uint8_t h1[304]; -uint32_t h4[76]; -uint64_t h8[38]; -} hash; - - - uint32_t in[16],buf[8]; - + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t in[16],buf[8]; #pragma unroll 8 for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);} - #pragma unroll 8 + #pragma unroll 8 for (int i=0;i<8;i++) {buf[i] = H256[i];} sha2_round_body(in,buf,K); @@ -415,96 +379,87 @@ uint64_t h8[38]; #pragma unroll 5 for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);} - ((uint64_t*)in)[5]= g_hash1[threads*(5+32)+thread]; + + ((uint64_t*)in)[5] = g_hash1[threads*(5+32)+thread]; in[11]=0; in[12]=0; in[13]=0; in[14]=0; + in[15]=0x968; - in[15]=0x968; + int it=0; - int it=0; - do { - in[15]-=8; - it++; - } while (((uint8_t*)in)[44-it]==0); - ((uint8_t*)in)[44-it+1]=0x80; + do { + in[15]-=8; + it++; + } while (((uint8_t*)in)[44-it]==0); - ((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]); + ((uint8_t*)in)[44-it+1]=0x80; - sha2_round_body(in,buf,K); + ((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]); -uint32_t nounce = startNounce +thread; - bool rc = false; + sha2_round_body(in,buf,K); + uint32_t nounce = startNounce +thread; + bool rc = false; #pragma unroll 4 for (int i = 0; i < 4; i++) { if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) { - if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i]) {rc = true;} else {rc = false;} -// if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;} + if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i]) + rc = true; + else + rc = false; + //if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;} } } - - if(rc == true) - { - if(resNounce[0] > nounce) - resNounce[0] = nounce; - - } - - -//// - } // threads + if (rc && resNounce[0] > nounce) + resNounce[0] = nounce; + } // thread } - - -__host__ void m7_sha256_cpu_init(int thr_id, int threads) +__host__ +void m7_sha256_cpu_init(int thr_id, int threads) { - // Kopiere die Hash-Tabellen in den GPU-Speicher cudaMemcpyToSymbol( H256,cpu_H256,sizeof(cpu_H256),0, cudaMemcpyHostToDevice ); cudaMemcpyToSymbol( K,cpu_K,sizeof(cpu_K),0, cudaMemcpyHostToDevice ); cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t)); cudaMallocHost(&d_mnounce[thr_id], 1*sizeof(uint32_t)); } - -__host__ uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, 
uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order) +__host__ +uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order) { - + const int threadsperblock = 384; uint32_t result = 0xffffffff; - cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t)); - const int threadsperblock = 384; // Alignment mit mixtob Grösse. NICHT ÄNDERN + cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t)); dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); size_t shared_size = 0; - m7_sha256_gpu_hash_300<<>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]); + cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); MyStreamSynchronize(NULL, order, thr_id); + result = *d_mnounce[thr_id]; return result; } - -__host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +__host__ +void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) { + const int threadsperblock = 512; - const int threadsperblock = 512; // Alignment mit mixtob Grösse. NICHT ÄNDERN - - // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); -// dim3 grid(1); -// dim3 block(1); + size_t shared_size = 0; m7_sha256_gpu_hash_120<<>>(threads, startNounce, d_outputHash); @@ -512,7 +467,8 @@ __host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNoun MyStreamSynchronize(NULL, order, thr_id); } -__host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful +__host__ +void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful { unsigned char PaddedMessage[128]; uint8_t ending =0x80; @@ -527,6 +483,6 @@ __host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not use uint32_t in[16],buf[8]; for (int i=0;i<16;i++) {in[i]= host_swab32(alt_data[i]);} for (int i=0;i<8;i++) {buf[i]= cpu_H256[i];} - sha2_round_body_host(in,buf,cpu_K); - cudaMemcpyToSymbol( pbuf, buf, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + sha2_round_body_host(in,buf,cpu_K); + cudaMemcpyToSymbol( pbuf, buf, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); } diff --git a/m7/cuda_mul2.cu b/m7/cuda_mul2.cu index 7aa9a83..cf1af67 100644 --- a/m7/cuda_mul2.cu +++ b/m7/cuda_mul2.cu @@ -52,29 +52,29 @@ typedef struct t4_t{ } t4_t; __device__ __forceinline__ -ulonglong2 umul64wide (unsigned long long int a, - unsigned long long int b) +ulonglong2 umul64wide(unsigned long long int a, + unsigned long long int b) { - ulonglong2 res; - asm ("{\n\t" - ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t" - "mov.b64 {alo,ahi}, %2; \n\t" - "mov.b64 {blo,bhi}, %3; \n\t" - "mul.lo.u32 r0, alo, blo; \n\t" - "mul.hi.u32 r1, alo, blo; \n\t" - "mad.lo.cc.u32 r1, alo, bhi, r1;\n\t" - "madc.hi.u32 r2, alo, bhi, 0;\n\t" - "mad.lo.cc.u32 r1, ahi, blo, r1;\n\t" - "madc.hi.cc.u32 r2, ahi, blo, r2;\n\t" - "madc.hi.u32 r3, ahi, bhi, 0;\n\t" - "mad.lo.cc.u32 r2, ahi, bhi, r2;\n\t" - "addc.u32 r3, r3, 0; \n\t" - "mov.b64 %0, {r0,r1}; \n\t" - "mov.b64 %1, {r2,r3}; \n\t" - "}" - : "=l"(res.x), "=l"(res.y) - : "l"(a), "l"(b)); - return res; + ulonglong2 res; + asm ("{\n\t" + ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t" + "mov.b64 {alo,ahi}, %2; \n\t" + "mov.b64 {blo,bhi}, %3; \n\t" + "mul.lo.u32 r0, alo, blo; \n\t" + "mul.hi.u32 r1, alo, blo; \n\t" + "mad.lo.cc.u32 r1, alo, bhi, 
r1;\n\t" + "madc.hi.u32 r2, alo, bhi, 0;\n\t" + "mad.lo.cc.u32 r1, ahi, blo, r1;\n\t" + "madc.hi.cc.u32 r2, ahi, blo, r2;\n\t" + "madc.hi.u32 r3, ahi, bhi, 0;\n\t" + "mad.lo.cc.u32 r2, ahi, bhi, r2;\n\t" + "addc.u32 r3, r3, 0; \n\t" + "mov.b64 %0, {r0,r1}; \n\t" + "mov.b64 %1, {r2,r3}; \n\t" + "}" + : "=l"(res.x), "=l"(res.y) + : "l"(a), "l"(b)); + return res; } #define umul_ppmm(h,l,m,n) \ @@ -85,75 +85,76 @@ ulonglong2 umul64wide (unsigned long long int a, } -__device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n) +__device__ __forceinline__ +void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n) { - asm ("{\n\t" - ".reg .u32 o0, o1, o2, o3, o4; \n\t" - ".reg .u32 o5, o6, o7, i8, i9; \n\t" - ".reg .u32 i10, i11, i12, i13; \n\t" - ".reg .u32 i14, i15, i16, i17; \n\t" - ".reg .u32 i18, i19, i20, i21; \n\t" - ".reg .u32 i22, i23; \n\t" - "mov.b64 { i8, i9}, %4; \n\t" - "mov.b64 {i10,i11}, %5; \n\t" - "mov.b64 {i12,i13}, %6; \n\t" - "mov.b64 {i14,i15}, %7; \n\t" - "mov.b64 {i16,i17}, %8; \n\t" - "mov.b64 {i18,i19}, %9; \n\t" - "mov.b64 {i20,i21},%10; \n\t" - "mov.b64 {i22,i23},%11; \n\t" - "mul.lo.u32 o0, i8, i16; \n\t" - "mul.hi.u32 o1, i8, i16; \n\t" - "mad.lo.cc.u32 o1, i8, i17, o1;\n\t" - "madc.hi.u32 o2, i8, i17, 0;\n\t" - "mad.lo.cc.u32 o1, i9, i16, o1;\n\t" - "madc.hi.cc.u32 o2, i9, i16, o2;\n\t" - "madc.hi.u32 o3, i8, i18, 0;\n\t" - "mad.lo.cc.u32 o2, i8, i18, o2;\n\t" - "madc.hi.cc.u32 o3, i9, i17, o3;\n\t" - "madc.hi.u32 o4, i8, i19, 0;\n\t" - "mad.lo.cc.u32 o2, i9, i17, o2;\n\t" - "madc.hi.cc.u32 o3, i10, i16, o3;\n\t" - "madc.hi.cc.u32 o4, i9, i18, o4;\n\t" - "addc.u32 o5, 0, 0;\n\t" - "mad.lo.cc.u32 o2, i10, i16, o2;\n\t" - "madc.lo.cc.u32 o3, i8, i19, o3;\n\t" - "madc.hi.cc.u32 o4, i10, i17, o4;\n\t" - "madc.hi.cc.u32 o5, i9, i19, o5;\n\t" - "addc.u32 o6, 0, 0;\n\t" - "mad.lo.cc.u32 o3, i9, i18, o3;\n\t" - "madc.hi.cc.u32 o4, i11, i16, o4;\n\t" - "madc.hi.cc.u32 o5, i10, i18, o5;\n\t" - "addc.u32 o6, 0, o6;\n\t" - "mad.lo.cc.u32 o3, i10, i17, o3;\n\t" - "addc.u32 o4, 0, o4;\n\t" - "mad.hi.cc.u32 o5, i11, i17, o5;\n\t" - "madc.hi.cc.u32 o6, i10, i19, o6;\n\t" - "addc.u32 o7, 0, 0;\n\t" - "mad.lo.cc.u32 o3, i11, i16, o3;\n\t" - "madc.lo.cc.u32 o4, i9, i19, o4;\n\t" - "addc.u32 o5, 0, o5;\n\t" - "mad.hi.cc.u32 o6, i11, i18, o6;\n\t" - "addc.u32 o7, 0, o7;\n\t" - "mad.lo.cc.u32 o4, i10, i18, o4;\n\t" - "addc.u32 o5, 0, o5;\n\t" - "mad.hi.u32 o7, i11, i19, o7;\n\t" - "mad.lo.cc.u32 o4, i11, i17, o4;\n\t" - "addc.u32 o5, 0, o5;\n\t" - "mad.lo.cc.u32 o5, i10, i19, o5;\n\t" - "addc.u32 o6, 0, o6;\n\t" - "mad.lo.cc.u32 o5, i11, i18, o5;\n\t" - "addc.u32 o6, 0, o6;\n\t" - "mad.lo.cc.u32 o6, i11, i19, o6;\n\t" - "addc.u32 o7, 0, o7;\n\t" - "mov.b64 %0, {o0,o1}; \n\t" - "mov.b64 %1, {o2,o3}; \n\t" - "mov.b64 %2, {o4,o5}; \n\t" - "mov.b64 %3, {o6,o7}; \n\t" - "}" - : "=l"(l->low), "=l"(l->high), "=l"(h->low), "=l"(h->high) - : "l"(m.low), "l"(m.high), "l"(0ULL), "l"(0ULL), - "l"(n.low), "l"(n.high), "l"(0ULL), "l"(0ULL)); + asm ("{\n\t" + ".reg .u32 o0, o1, o2, o3, o4; \n\t" + ".reg .u32 o5, o6, o7, i8, i9; \n\t" + ".reg .u32 i10, i11, i12, i13; \n\t" + ".reg .u32 i14, i15, i16, i17; \n\t" + ".reg .u32 i18, i19, i20, i21; \n\t" + ".reg .u32 i22, i23; \n\t" + "mov.b64 { i8, i9}, %4; \n\t" + "mov.b64 {i10,i11}, %5; \n\t" + "mov.b64 {i12,i13}, %6; \n\t" + "mov.b64 {i14,i15}, %7; \n\t" + "mov.b64 {i16,i17}, %8; \n\t" + "mov.b64 {i18,i19}, %9; \n\t" + "mov.b64 {i20,i21},%10; \n\t" + "mov.b64 {i22,i23},%11; \n\t" + "mul.lo.u32 o0, i8, i16; \n\t" + "mul.hi.u32 o1, i8, i16; \n\t" 
+ "mad.lo.cc.u32 o1, i8, i17, o1;\n\t" + "madc.hi.u32 o2, i8, i17, 0;\n\t" + "mad.lo.cc.u32 o1, i9, i16, o1;\n\t" + "madc.hi.cc.u32 o2, i9, i16, o2;\n\t" + "madc.hi.u32 o3, i8, i18, 0;\n\t" + "mad.lo.cc.u32 o2, i8, i18, o2;\n\t" + "madc.hi.cc.u32 o3, i9, i17, o3;\n\t" + "madc.hi.u32 o4, i8, i19, 0;\n\t" + "mad.lo.cc.u32 o2, i9, i17, o2;\n\t" + "madc.hi.cc.u32 o3, i10, i16, o3;\n\t" + "madc.hi.cc.u32 o4, i9, i18, o4;\n\t" + "addc.u32 o5, 0, 0;\n\t" + "mad.lo.cc.u32 o2, i10, i16, o2;\n\t" + "madc.lo.cc.u32 o3, i8, i19, o3;\n\t" + "madc.hi.cc.u32 o4, i10, i17, o4;\n\t" + "madc.hi.cc.u32 o5, i9, i19, o5;\n\t" + "addc.u32 o6, 0, 0;\n\t" + "mad.lo.cc.u32 o3, i9, i18, o3;\n\t" + "madc.hi.cc.u32 o4, i11, i16, o4;\n\t" + "madc.hi.cc.u32 o5, i10, i18, o5;\n\t" + "addc.u32 o6, 0, o6;\n\t" + "mad.lo.cc.u32 o3, i10, i17, o3;\n\t" + "addc.u32 o4, 0, o4;\n\t" + "mad.hi.cc.u32 o5, i11, i17, o5;\n\t" + "madc.hi.cc.u32 o6, i10, i19, o6;\n\t" + "addc.u32 o7, 0, 0;\n\t" + "mad.lo.cc.u32 o3, i11, i16, o3;\n\t" + "madc.lo.cc.u32 o4, i9, i19, o4;\n\t" + "addc.u32 o5, 0, o5;\n\t" + "mad.hi.cc.u32 o6, i11, i18, o6;\n\t" + "addc.u32 o7, 0, o7;\n\t" + "mad.lo.cc.u32 o4, i10, i18, o4;\n\t" + "addc.u32 o5, 0, o5;\n\t" + "mad.hi.u32 o7, i11, i19, o7;\n\t" + "mad.lo.cc.u32 o4, i11, i17, o4;\n\t" + "addc.u32 o5, 0, o5;\n\t" + "mad.lo.cc.u32 o5, i10, i19, o5;\n\t" + "addc.u32 o6, 0, o6;\n\t" + "mad.lo.cc.u32 o5, i11, i18, o5;\n\t" + "addc.u32 o6, 0, o6;\n\t" + "mad.lo.cc.u32 o6, i11, i19, o6;\n\t" + "addc.u32 o7, 0, o7;\n\t" + "mov.b64 %0, {o0,o1}; \n\t" + "mov.b64 %1, {o2,o3}; \n\t" + "mov.b64 %2, {o4,o5}; \n\t" + "mov.b64 %3, {o6,o7}; \n\t" + "}" + : "=l"(l->low), "=l"(l->high), "=l"(h->low), "=l"(h->high) + : "l"(m.low), "l"(m.high), "l"(0ULL), "l"(0ULL), + "l"(n.low), "l"(n.high), "l"(0ULL), "l"(0ULL)); } #if 0 @@ -187,55 +188,60 @@ __device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n){ #endif -__device__ __forceinline__ t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){ +__device__ __forceinline__ +t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){ t4_t ret; ret.high = g[(idx*2 + 1)*threads + thread]; ret.low = g[(idx*2)*threads + thread]; if(thread==0){ - // cuPrintf("Load Idx: %d %8.8X %8.8X %8.8X %8.8X\n", idx, ret.high>>32, ret.high, ret.low>>32, ret.low); + // cuPrintf("Load Idx: %d %8.8X %8.8X %8.8X %8.8X\n", idx, ret.high>>32, ret.high, ret.low>>32, ret.low); } return ret; } -__device__ __forceinline__ void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){ +__device__ __forceinline__ +void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){ g[(idx*2 + 1)*threads + thread]=val.high; g[(idx*2)*threads + thread]=val.low; if(thread==0){ - // cuPrintf("Store Idx: %d %8.8X %8.8X %8.8X %8.8X\n", idx, val.high>>32, val.high, val.low>>32, val.low); + // cuPrintf("Store Idx: %d %8.8X %8.8X %8.8X %8.8X\n", idx, val.high>>32, val.high, val.low>>32, val.low); } } -__device__ __forceinline__ void T4_set(t4_t *d, uint64_t v){ +__device__ __forceinline__ +void T4_set(t4_t *d, uint64_t v){ d->high = 0; d->low = v; } -__device__ __forceinline__ t4_t T4_add(t4_t a, t4_t b){ +__device__ __forceinline__ +t4_t T4_add(t4_t a, t4_t b){ t4_t ret; uint32_t c=0; ret.low = a.low + b.low; if(ret.low < a.low) - c=1; + c=1; ret.high = a.high + b.high + c; return ret; } -__device__ __forceinline__ t4_t T4_add(uint64_t a, t4_t b){ +__device__ __forceinline__ +t4_t T4_add(uint64_t a, t4_t b){ t4_t ret; uint32_t c=0; 
ret.low = a + b.low; if(ret.low < a) - c=1; + c=1; ret.high = b.high + c; return ret; } - -__device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){ +__device__ __forceinline__ +uint32_t T4_lt(t4_t a, t4_t b){ if(a.high < b.high) return 1; if(a.high == b.high && a.low < b.low) @@ -243,7 +249,8 @@ __device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){ return 0; } -__device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){ +__device__ __forceinline__ +uint32_t T4_gt(t4_t a, uint64_t b){ if(a.high) return 1; if(a.low > b) @@ -252,217 +259,213 @@ __device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){ } -__device__ void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){ - t4_t ul, cl, hpl, lpl; - uint32_t i; - T4_set(&cl,0); - for(i=0; i < len; i++) { - ul = T4(thread,threads,i,g_v); - umul_ppmmT4 (&hpl, &lpl, ul, sml); +__device__ +void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){ + t4_t ul, cl, hpl, lpl; + uint32_t i; + T4_set(&cl,0); + for(i=0; i < len; i++) { + ul = T4(thread,threads,i,g_v); + umul_ppmmT4 (&hpl, &lpl, ul, sml); - lpl = T4_add(lpl,cl); - cl = T4_add(T4_lt(lpl,cl),hpl); + lpl = T4_add(lpl,cl); + cl = T4_add(T4_lt(lpl,cl),hpl); - T4_store(thread,threads,i,g_p,lpl); - } + T4_store(thread,threads,i,g_p,lpl); + } - T4_store(thread,threads,len,g_p,cl); - *size = len + T4_gt(cl,0); + T4_store(thread,threads,len,g_p,cl); + *size = len + T4_gt(cl,0); } __device__ void mulScalar(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, uint64_t sml, uint32_t *size){ - uint64_t ul, cl, hpl, lpl; - uint32_t i; - cl = 0; - for(i=0; i < len; i++) { - ul = g_v[i*threads + thread]; - umul_ppmm (hpl, lpl, ul, sml); + uint64_t ul, cl, hpl, lpl; + uint32_t i; + cl = 0; + for(i=0; i < len; i++) { + ul = g_v[i*threads + thread]; + umul_ppmm (hpl, lpl, ul, sml); - lpl += cl; - cl = (lpl < cl) + hpl; + lpl += cl; + cl = (lpl < cl) + hpl; - g_p[i*threads + thread] = lpl; - } + g_p[i*threads + thread] = lpl; + } - g_p[len*threads + thread] = cl; - *size = len + (cl != 0); + g_p[len*threads + thread] = cl; + *size = len + (cl != 0); } -uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a){ +uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a) +{ uint64_t carry=0; uint32_t i; uint64_t ul,lpl,hpl,rl; - for(i=0; i < xsz; i++){ + for(i=0; i < xsz; i++) + { + ul = x[i*threads + thread]; + umul_ppmm (hpl, lpl, ul, a); - ul = x[i*threads + thread]; - umul_ppmm (hpl, lpl, ul, a); + lpl += carry; + carry = (lpl < carry) + hpl; - lpl += carry; - carry = (lpl < carry) + hpl; - - rl = sum[(i+sofst) * threads + thread]; - lpl = rl + lpl; - carry += lpl < rl; - sum[(i+sofst)*threads + thread] = lpl; - } + rl = sum[(i+sofst) * threads + thread]; + lpl = rl + lpl; + carry += lpl < rl; + sum[(i+sofst)*threads + thread] = lpl; + } - return carry; + return carry; } -t4_t __device__ addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a){ +__device__ +t4_t addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a) +{ t4_t carry; uint32_t i; t4_t ul,lpl,hpl,rl; T4_set(&carry,0); - for(i=0; i < xsz; i++){ - - ul = T4(thread,threads,i,x); - umul_ppmmT4 (&hpl, &lpl, ul, a); - 
- lpl = T4_add(lpl,carry); - carry = T4_add(T4_lt(lpl,carry), hpl); - - rl = T4(thread,threads,i+sofst,sum); - lpl = T4_add(rl,lpl); - carry = T4_add(T4_lt(lpl,rl),carry); - T4_store(thread,threads,i+sofst,sum,lpl); - } + for(i=0; i < xsz; i++) + { + ul = T4(thread,threads,i,x); + umul_ppmmT4 (&hpl, &lpl, ul, a); + + lpl = T4_add(lpl,carry); + carry = T4_add(T4_lt(lpl,carry), hpl); + + rl = T4(thread,threads,i+sofst,sum); + lpl = T4_add(rl,lpl); + carry = T4_add(T4_lt(lpl,rl),carry); + T4_store(thread,threads,i+sofst,sum,lpl); + } - return carry; + return carry; } - - -__global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p) +__global__ +void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - if(ulegs < vlegs){ - uint64_t t1=ulegs; - ulegs = vlegs; - vlegs = t1; - - uint64_t *t2 = g_u; - g_u = g_v; - g_v = t2; - } - - uint32_t vofst=1,rofst=1,psize=0; - mulScalar(thread,threads,ulegs,g_p,g_u,g_v[thread],&psize); + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + if(ulegs < vlegs) { + uint64_t t1=ulegs; + ulegs = vlegs; + vlegs = t1; + + uint64_t *t2 = g_u; + g_u = g_v; + g_v = t2; + } + + uint32_t vofst=1,rofst=1,psize=0; + mulScalar(thread,threads,ulegs,g_p,g_u,g_v[thread],&psize); #if 1 - - while (vofst < vlegs) { + while (vofst < vlegs) { //clear high word //TODO: right - // printf("Size: %d\n", rp->size[tid]); - g_p[(psize+0)*threads+thread] = 0; - - g_p[(ulegs+rofst)*threads + thread] = addmul_1g (thread, threads, g_p ,rofst , g_u, ulegs, g_v[vofst*threads+thread]); - - vofst++; rofst++; - psize++; - } + // printf("Size: %d\n", rp->size[tid]); + g_p[(psize+0)*threads + thread] = 0; -// if(D_REF(rp->d,up->size[tid] + vp->size[tid] - 1,tid) != (uint64_t)0) -// rp->size[tid]++; + g_p[(ulegs+rofst)*threads + thread] = addmul_1g (thread, threads, g_p ,rofst , g_u, ulegs, g_v[vofst*threads+thread]); + vofst++; rofst++; + psize++; + } +// if(D_REF(rp->d,up->size[tid] + vp->size[tid] - 1,tid) != (uint64_t)0) +// rp->size[tid]++; #endif - } + } } -__global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p) +__global__ +void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - - if(ulegs < vlegs){ ///everything written the other way around... are you kidding me ?! - uint64_t t1=ulegs; - ulegs = vlegs; - vlegs = t1; - - uint64_t *t2 = g_u; - g_u = g_v; - g_v = t2; - } + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + if(ulegs < vlegs) { // everything written the other way around... are you kidding me ?! 
+ uint64_t t1=ulegs; + ulegs = vlegs; + vlegs = t1; - ulegs >>= 1; vlegs >>= 1; + uint64_t *t2 = g_u; + g_u = g_v; + g_v = t2; + } - if(thread == 0){ - // cuPrintf("U: %d V: %d\n", ulegs, vlegs); - } + ulegs >>= 1; vlegs >>= 1; + if(thread == 0) { + // cuPrintf("U: %d V: %d\n", ulegs, vlegs); + } - - uint32_t vofst=1,rofst=1,psize=0; - mulScalarT4(thread,threads,ulegs,g_p,g_u,T4(thread,threads,0,g_v),&psize); + uint32_t vofst=1,rofst=1,psize=0; + mulScalarT4(thread,threads,ulegs,g_p,g_u,T4(thread,threads,0,g_v),&psize); #if 1 - t4_t zero; - T4_set(&zero,0); - + t4_t zero; + T4_set(&zero,0); +// while (vofst < vlegs) { -// while (vofst < vlegs) { - -#pragma unroll - for (vofst=1;vofst>>(threads, alegs, blegs, g_a, g_b, g_p) ; + size_t shared_size = 0; + gpu_mul<<>>(threads, alegs, blegs, g_a, g_b, g_p) ; } -__host__ void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order) +__host__ +void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order) { - const int threadsperblock = 256; // better occupancy (for both 780 and 750 ti's) // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - size_t shared_size =0; - //gpu_mulT4<<>>(threads, alegs, blegs, g_a, g_b, g_p) ; + size_t shared_size = 0; + + //gpu_mulT4<<>>(threads, alegs, blegs, g_a, g_b, g_p) ; gpu_mulT4<<>>(threads, blegs, alegs, g_b, g_a, g_p) ; } -__host__ void mul_init(){ +__host__ +void mul_init() +{ } diff --git a/m7/cuda_ripemd160.cu b/m7/cuda_ripemd160.cu index 8bc2d6c..240ef14 100644 --- a/m7/cuda_ripemd160.cu +++ b/m7/cuda_ripemd160.cu @@ -48,7 +48,7 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) +__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) static __constant__ uint32_t gpu_IV[5]; static __constant__ uint32_t bufo[5]; static const uint32_t IV[5] = { @@ -282,118 +282,116 @@ static const uint32_t IV[5] = { (h)[0] = tmp; \ } - -__global__ void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) +__global__ +void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) { + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + union { + uint8_t h1[64]; + uint32_t h4[16]; + uint64_t h8[8]; + } hash; + + #undef F1 + #undef F2 + #undef F3 + #undef F4 + #undef F5 + + #define F1(x, y, z) xor3(x,y,z) + #define F2(x, y, z) xandx(x,y,z) + #define F3(x, y, z) xornot64(x,y,z) + #define F4(x, y, z) xandx(z,x,y) + #define F5(x, y, z) xornt64(x,y,z) + + uint32_t in2[16],in3[16]; + uint32_t in[16],buf[5]; + #pragma unroll 16 + for (int i=0;i<16;i++) { + if ((i+16) < 29) + in2[i] = c_PaddedMessage80[i+16]; + else if ((i+16)==29) + in2[i] = nounce; + else if ((i+16)==30) + in2[i] = c_PaddedMessage80[i+16]; + else + in2[i] = 0; + } - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - - uint32_t nounce = startNounce + thread ; -union { -uint8_t h1[64]; -uint32_t h4[16]; -uint64_t h8[8]; -} hash; - -#undef F1 -#undef F2 -#undef F3 -#undef F4 -#undef F5 - -#define F1(x, y, z) xor3(x,y,z) -#define F2(x, y, z) xandx(x,y,z) -#define F3(x, y, z) xornot64(x,y,z) -#define F4(x, y, z) xandx(z,x,y) -#define F5(x, y, z) 
xornt64(x,y,z) - uint32_t in2[16],in3[16]; - uint32_t in[16],buf[5]; -// #pragma unroll 16 -// for (int i=0;i<16;i++) {in[i]= c_PaddedMessage80[i];} - #pragma unroll 16 - for (int i=0;i<16;i++) {if ((i+16)<29) {in2[i]= c_PaddedMessage80[i+16];} - else if ((i+16)==29) {in2[i]= nounce;} - else if ((i+16)==30) {in2[i]= c_PaddedMessage80[i+16];} - else {in2[i]= 0;}} #pragma unroll 16 - for (int i=0;i<16;i++) {in3[i]=0;} - in3[14]=0x3d0; -// #pragma unroll 5 -// for (int i=0;i<5;i++) {buf[i]=gpu_IV[i];} - #pragma unroll 5 - for (int i=0;i<5;i++) {buf[i]=bufo[i];} -// RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved) - RIPEMD160_ROUND_BODY(in2, buf); - RIPEMD160_ROUND_BODY(in3, buf); - - -hash.h4[5]=0; -#pragma unroll 5 -for (int i=0;i<5;i++) -{hash.h4[i]=buf[i]; -} -//uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; -//#pragma unroll 3 -//for (int i=0;i<3;i++) {outHash[i]=hash.h8[i];} -#pragma unroll 3 -for (int i=0;i<3;i++) {outputHash[i*threads+thread]=hash.h8[i];} -//#pragma unroll 8 -//for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=hash.h8[i];} else {outputHash[i*threads+thread]=0;}} - } + for (int i=0;i<16;i++) + in3[i]=0; + in3[14]=0x3d0; + + #pragma unroll 5 + for (int i=0;i<5;i++) + buf[i]=bufo[i]; + + RIPEMD160_ROUND_BODY(in2, buf); + RIPEMD160_ROUND_BODY(in3, buf); + + hash.h4[5]=0; + #pragma unroll 5 + for (int i=0; i<5; i++) + hash.h4[i]=buf[i]; + + #pragma unroll 3 + for (int i=0;i<3;i++) { + outputHash[i*threads+thread] = hash.h8[i]; + } + } } - void ripemd160_cpu_init(int thr_id, int threads) { - - cudaMemcpyToSymbol(gpu_IV,IV,sizeof(IV),0, cudaMemcpyHostToDevice); - + cudaMemcpyToSymbol(gpu_IV,IV,sizeof(IV),0, cudaMemcpyHostToDevice); } -__host__ void ripemd160_setBlock_120(void *pdata) +__host__ +void ripemd160_setBlock_120(void *pdata) { unsigned char PaddedMessage[128]; uint8_t ending =0x80; memcpy(PaddedMessage, pdata, 122); memset(PaddedMessage+122,ending,1); memset(PaddedMessage+123, 0, 5); //useless - cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + + #undef F1 + #undef F2 + #undef F3 + #undef F4 + #undef F5 + #define F1(x, y, z) ((x) ^ (y) ^ (z)) + #define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) + #define F3(x, y, z) (((x) | ~(y)) ^ (z)) + #define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) + #define F5(x, y, z) ((x) ^ ((y) | ~(z))) -#undef F1 -#undef F2 -#undef F3 -#undef F4 -#undef F5 -#define F1(x, y, z) ((x) ^ (y) ^ (z)) -#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) -#define F3(x, y, z) (((x) | ~(y)) ^ (z)) -#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) -#define F5(x, y, z) ((x) ^ ((y) | ~(z))) uint32_t* alt_data =(uint32_t*)pdata; - uint32_t in[16],buf[5]; + uint32_t in[16],buf[5]; + for (int i=0;i<16;i++) + in[i]= alt_data[i]; - for (int i=0;i<16;i++) {in[i]= alt_data[i];} + for (int i=0;i<5;i++) + buf[i]=IV[i]; - - for (int i=0;i<5;i++) {buf[i]=IV[i];} - - RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved) + RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved) cudaMemcpyToSymbol(bufo, buf, 5*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); } -__host__ void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +__host__ +void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) 
{ + const int threadsperblock = 256; - const int threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN - + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); -dim3 grid((threads + threadsperblock-1)/threadsperblock); -dim3 block(threadsperblock); -//dim3 grid(1); -//dim3 block(1); size_t shared_size =0; m7_ripemd160_gpu_hash_120<<>>(threads, startNounce, d_outputHash); diff --git a/m7/cuda_tiger192.cu b/m7/cuda_tiger192.cu index a28a66d..d9d1a5d 100644 --- a/m7/cuda_tiger192.cu +++ b/m7/cuda_tiger192.cu @@ -50,11 +50,13 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint64_t bufo[3]; + static __constant__ uint64_t gpu_III[3]; static __constant__ uint64_t T1[256]; static __constant__ uint64_t T2[256]; static __constant__ uint64_t T3[256]; static __constant__ uint64_t T4[256]; + static const uint64_t III[3] = { SPH_C64(0x0123456789ABCDEF),SPH_C64(0xFEDCBA9876543210),SPH_C64(0xF096A5B4C3B2E187) }; @@ -583,16 +585,16 @@ static const uint64_t cpu_T4[256] = { SPH_C64(0xC83223F1720AEF96), SPH_C64(0xC3A0396F7363A51F) }; -#define PASS(a, b, c, mul) { \ - ROUND(a, b, c, X0, mul); \ - ROUND(b, c, a, X1, mul); \ - ROUND(c, a, b, X2, mul); \ - ROUND(a, b, c, X3, mul); \ - ROUND(b, c, a, X4, mul); \ - ROUND(c, a, b, X5, mul); \ - ROUND(a, b, c, X6, mul); \ - ROUND(b, c, a, X7, mul); \ - } +#define PASS(a, b, c, mul) { \ + ROUND(a, b, c, X0, mul); \ + ROUND(b, c, a, X1, mul); \ + ROUND(c, a, b, X2, mul); \ + ROUND(a, b, c, X3, mul); \ + ROUND(b, c, a, X4, mul); \ + ROUND(c, a, b, X5, mul); \ + ROUND(a, b, c, X6, mul); \ + ROUND(b, c, a, X7, mul); \ +} #define MUL5(x) SPH_T64((x) * SPH_C64(5)) #define MUL7(x) SPH_T64((x) * SPH_C64(7)) @@ -649,29 +651,24 @@ static const uint64_t cpu_T4[256] = { (r)[2] = SPH_T64(C + (r)[2]); \ } - -__global__ void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) +__global__ +void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) { + __shared__ uint64_t sharedMem[1024]; - __shared__ uint64_t sharedMem[1024]; - if(threadIdx.x < 256) - { + if(threadIdx.x < 256) { sharedMem[threadIdx.x] = T1[threadIdx.x]; sharedMem[threadIdx.x+256] = T2[threadIdx.x]; sharedMem[threadIdx.x+512] = T3[threadIdx.x]; sharedMem[threadIdx.x+768] = T4[threadIdx.x]; } + __syncthreads(); + int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; -union { -uint8_t h1[64]; -uint32_t h4[16]; -uint64_t h8[8]; -} hash; /* #undef MUL5 #undef MUL7 @@ -680,7 +677,7 @@ uint64_t h8[8]; #define MUL7(x) mul(x,7) #define MUL9(x) mul(x,9) */ -#define PASS(a, b, c, mul) { \ +#define PASS(a, b, c, mul) { \ ROUND(a, b, c, X0, mul); \ ROUND(b, c, a, X1, mul); \ ROUND(c, a, b, X2, mul); \ @@ -691,89 +688,95 @@ uint64_t h8[8]; ROUND(b, c, a, X7, mul); \ } - - -#define ROUND(a, b, c, x, mul) { \ +#define ROUND(a, b, c, x, mul) { \ c ^= x; \ a = SPH_T64(a - (sharedMem[c & 0xFF] ^ sharedMem[((c >> 16) & 0xFF)+256] \ - ^ sharedMem[((c >> 32) & 0xFF)+512] ^ sharedMem[((c >> 48) & 0xFF)+768])); \ + ^ sharedMem[((c >> 32) & 0xFF)+512] ^ sharedMem[((c >> 48) & 0xFF)+768])); \ b = SPH_T64(b + (sharedMem[((c >> 8) & 0xFF)+768] ^ sharedMem[((c >> 24) & 0xFF)+512] \ ^ sharedMem[((c >> 40) & 0xFF)+256] ^ sharedMem[(c >> 56) & 0xFF])); \ b = mul(b); \ } - - uint64_t in[8],buf[3]; - uint64_t in2[8],in3[8]; + uint64_t in2[8]; 
#pragma unroll 8 - for (int i=0;i<8;i++) {in2[i]= c_PaddedMessage80[i+8];} + for (int i=0; i<8; i++) + in2[i] = c_PaddedMessage80[i+8]; + uint32_t* Mess = (uint32_t*)in2; - Mess[13]=nounce; + Mess[13] = nounce; + + uint64_t in3[8]; #pragma unroll 8 - for (int i=0;i<8;i++) {in3[i]=0;} + for (int i=0; i<8; i++) + in3[i]=0; + in3[7]=0x3d0; - #pragma unroll 3 - for (int i=0;i<3;i++) {buf[i]=bufo[i];} + uint64_t buf[3]; + #pragma unroll 3 + for (int i=0; i<3; i++) + buf[i]=bufo[i]; - TIGER_ROUND_BODY(in2, buf); - TIGER_ROUND_BODY(in3, buf); + TIGER_ROUND_BODY(in2, buf); + TIGER_ROUND_BODY(in3, buf); -#pragma unroll 8 -for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=buf[i];} else {outputHash[i*threads+thread]=0;}} - } //// threads + #pragma unroll 8 + for (int i=0;i<8;i++) { + if (i<3) { + outputHash[i*threads+thread] = buf[i]; + } else { + outputHash[i*threads+thread] = 0; + } + } + } // thread } - +__host__ void tiger192_cpu_init(int thr_id, int threads) { - cudaMemcpyToSymbol(gpu_III,III,sizeof(III),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T1,cpu_T1,sizeof(cpu_T1),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T2,cpu_T2,sizeof(cpu_T2),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T3,cpu_T3,sizeof(cpu_T3),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T4,cpu_T4,sizeof(cpu_T4),0, cudaMemcpyHostToDevice); - - - } -__host__ void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +__host__ +void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) { + const int threadsperblock = 640; // 256 + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - const int threadsperblock = 640; // Alignment mit mixtab Grösse. 
NICHT ÄNDERN -// const int threadsperblock = 256; + size_t shared_size = 0; -dim3 grid((threads + threadsperblock-1)/threadsperblock); -dim3 block(threadsperblock); -//dim3 grid(1); -//dim3 block(1); - size_t shared_size =0; m7_tiger192_gpu_hash_120<<>>(threads, startNounce, d_outputHash); MyStreamSynchronize(NULL, order, thr_id); } - -__host__ void tiger192_setBlock_120(void *pdata) +__host__ +void tiger192_setBlock_120(void *pdata) { unsigned char PaddedMessage[128]; uint8_t ending =0x01; + memcpy(PaddedMessage, pdata, 122); memset(PaddedMessage+122,ending,1); memset(PaddedMessage+123, 0, 5); //useless cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); -#undef ROUND -#undef MUL5 -#undef MUL7 -#undef MUL9 -#define MUL5(x) ((x) * SPH_C64(5)) -#define MUL7(x) ((x) * SPH_C64(7)) -#define MUL9(x) ((x) * SPH_C64(9)) + #undef ROUND + #undef MUL5 + #undef MUL7 + #undef MUL9 + #define MUL5(x) ((x) * SPH_C64(5)) + #define MUL7(x) ((x) * SPH_C64(7)) + #define MUL9(x) ((x) * SPH_C64(9)) -#define ROUND(a, b, c, x, mul) { \ + #define ROUND(a, b, c, x, mul) { \ c ^= x; \ a = SPH_T64(a - (cpu_T1[c & 0xFF] ^ cpu_T2[(c >> 16) & 0xFF] \ ^ cpu_T3[(c >> 32) & 0xFF] ^ cpu_T4[(c >> 48) & 0xFF])); \ @@ -782,14 +785,16 @@ __host__ void tiger192_setBlock_120(void *pdata) b = mul(b); \ } - uint64_t* alt_data = (uint64_t*) pdata; - uint64_t in[8],buf[3]; - for (int i=0;i<8;i++) {in[i]= alt_data[i];} - for (int i=0;i<3;i++) {buf[i]=III[i];} + uint64_t in[8],buf[3]; + + for (int i=0;i<8;i++) + in[i] = alt_data[i]; - TIGER_ROUND_BODY(in, buf) - cudaMemcpyToSymbol( bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + for (int i=0;i<3;i++) + buf[i] = III[i]; + TIGER_ROUND_BODY(in, buf) + cudaMemcpyToSymbol(bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } \ No newline at end of file diff --git a/m7/m7.cu b/m7/m7.cu index 1c13a21..94da8fc 100644 --- a/m7/m7.cu +++ b/m7/m7.cu @@ -22,7 +22,7 @@ extern "C" extern int device_map[8]; extern bool opt_benchmark; -static uint64_t *d_hash[8]; +//static uint64_t *d_hash[8]; static uint64_t *FinalHash[8]; static uint64_t *KeccakH[8]; static uint64_t *WhirlpoolH[8]; @@ -112,11 +112,9 @@ extern "C" void m7_hash(void *state, const void *input,uint32_t TheNonce, int de { // sha256(sha256*sha512*keccak512*ripemd160*haval*tiger1*whirlpool) - char data_str[245], hash_str[65], target_str[65]; uint8_t *bdata = 0; mpz_t bns[7]; mpz_t product; - int rc = 0; for(int i=0; i < 7; i++) { mpz_init(bns[i]); @@ -292,44 +290,42 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata, tiger192_setBlock_120((void*)pdata); cuda_check_cpu_setTarget(ptarget); - uint32_t TheNonce = pdata[29]; do { int order = 0; uint32_t foundNonce; - m7_sha256_cpu_hash_120(thr_id, throughput, pdata[29], Sha256H[thr_id], order++); + m7_sha256_cpu_hash_120(thr_id, throughput, pdata[29], Sha256H[thr_id], order++); - m7_sha512_cpu_hash_120(thr_id, throughput, pdata[29], Sha512H[thr_id], order++); + m7_sha512_cpu_hash_120(thr_id, throughput, pdata[29], Sha512H[thr_id], order++); - m7_keccak512_cpu_hash(thr_id, throughput, pdata[29], KeccakH[thr_id], order++); + m7_keccak512_cpu_hash(thr_id, throughput, pdata[29], KeccakH[thr_id], order++); m7_haval256_cpu_hash_120(thr_id, throughput, pdata[29], HavalH[thr_id], order++); m7_tiger192_cpu_hash_120(thr_id, throughput, pdata[29], TigerH[thr_id], order++); - m7_ripemd160_cpu_hash_120(thr_id, throughput, pdata[29], RipemdH[thr_id], order++); + m7_ripemd160_cpu_hash_120(thr_id, throughput, pdata[29], 
RipemdH[thr_id], order++); - m7_whirlpool512_cpu_hash_120(thr_id, throughput, pdata[29], WhirlpoolH[thr_id], order++); + m7_whirlpool512_cpu_hash_120(thr_id, throughput, pdata[29], WhirlpoolH[thr_id], order++); - cpu_mulT4(0, throughput, 8, 8, Sha512H[thr_id], KeccakH[thr_id], d_prod0[thr_id],order); //64 - MyStreamSynchronize(0,order++,thr_id); + cpu_mulT4(0, throughput, 8, 8, Sha512H[thr_id], KeccakH[thr_id], d_prod0[thr_id],order); //64 + MyStreamSynchronize(0,order++,thr_id); - cpu_mulT4(0, throughput,8, 16, WhirlpoolH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //128 - MyStreamSynchronize(0,order++,thr_id); + cpu_mulT4(0, throughput,8, 16, WhirlpoolH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //128 + MyStreamSynchronize(0,order++,thr_id); - cpu_mulT4(0, throughput, 4, 24, Sha256H[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); //96 - MyStreamSynchronize(0,order++,thr_id); + cpu_mulT4(0, throughput, 4, 24, Sha256H[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); //96 + MyStreamSynchronize(0,order++,thr_id); - cpu_mulT4(0, throughput, 4, 28, HavalH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //112 - MyStreamSynchronize(0,order++,thr_id); + cpu_mulT4(0, throughput, 4, 28, HavalH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //112 + MyStreamSynchronize(0,order++,thr_id); - m7_bigmul_unroll1_cpu(0, throughput, TigerH[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); - MyStreamSynchronize(0,order++,thr_id); + m7_bigmul_unroll1_cpu(0, throughput, TigerH[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); + MyStreamSynchronize(0,order++,thr_id); - m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); - - MyStreamSynchronize(0,order++,thr_id); + m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); + MyStreamSynchronize(0,order++,thr_id); foundNonce = m7_sha256_cpu_hash_300(thr_id, throughput, pdata[29], NULL, d_prod1[thr_id], order); if (foundNonce != 0xffffffff) diff --git a/m7/m7_keccak512.cu b/m7/m7_keccak512.cu index a79682b..8688fa6 100644 --- a/m7/m7_keccak512.cu +++ b/m7/m7_keccak512.cu @@ -5,6 +5,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) + static __constant__ uint64_t stateo[25]; static __constant__ uint64_t RC[24]; static const uint64_t cpu_RC[24] = { @@ -22,7 +24,9 @@ static const uint64_t cpu_RC[24] = { 0x0000000080000001ull, 0x8000000080008008ull }; -static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) { +__device__ __forceinline__ +static void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) +{ size_t i; uint64_t t[5], u[5], v, w; @@ -136,8 +140,9 @@ static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t } } - -static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants) { +__host__ __forceinline__ +static void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants) +{ size_t i; uint64_t t[5], u[5], v, w; @@ -204,25 +209,18 @@ static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *kecca } } - - - __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) - - - -__global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) +__global__ /* __launch_bounds__(256, 2) */ +void m7_keccak512_gpu_hash_120(int threads, uint32_t 
startNounce, uint64_t *outputHash) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; - uint64_t state[25]; + uint64_t state[25]; #pragma unroll 16 - for (int i=9;i<25;i++) {state[i]=stateo[i];} + for (int i=9;i<25;i++) {state[i]=stateo[i];} state[0] = xor1(stateo[0],c_PaddedMessage80[9]); state[1] = xor1(stateo[1],c_PaddedMessage80[10]); @@ -236,39 +234,37 @@ __global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uin keccak_block(state,RC); -#pragma unroll 8 -for (int i=0;i<8;i++) {outputHash[i*threads+thread]=state[i];} - - + #pragma unroll 8 + for (int i=0;i<8;i++) { + outputHash[i*threads+thread] = state[i]; + } } //thread } void m7_keccak512_cpu_init(int thr_id, int threads) { - cudaMemcpyToSymbol( RC,cpu_RC,sizeof(cpu_RC),0,cudaMemcpyHostToDevice); } __host__ void m7_keccak512_setBlock_120(void *pdata) { - unsigned char PaddedMessage[128]; uint8_t ending =0x01; + memcpy(PaddedMessage, pdata, 122); memset(PaddedMessage+122,ending,1); memset(PaddedMessage+123, 0, 5); cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); uint64_t* alt_data = (uint64_t*) pdata; - uint64_t state[25]; - for(int i=0;i<25;i++) {state[i]=0;} - - - for (int i=0;i<9;i++) {state[i] ^= alt_data[i];} - keccak_block_host(state,cpu_RC); - - cudaMemcpyToSymbol(stateo, state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); - + uint64_t state[25]; + for(int i=0;i<9;i++) + state[i] = alt_data[i]; + for(int i=10;i<25;i++) + state[i] = 0; + keccak_block_host(state,cpu_RC); + + cudaMemcpyToSymbol(stateo, state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); }
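
For reference, the host wrappers touched by this patch (m7_sha256_cpu_hash_120, m7_ripemd160_cpu_hash_120, m7_tiger192_cpu_hash_120, cpu_mulT4) all share one launch pattern: a fixed per-kernel block size, a grid rounded up from the requested thread count, zero dynamic shared memory, and a stream synchronize after the launch. Below is a minimal, self-contained sketch of that pattern; example_gpu_hash_120 and example_cpu_hash_120 are illustrative names only (not part of the tree), and cudaDeviceSynchronize() stands in for the MyStreamSynchronize(NULL, order, thr_id) call the real wrappers use.

    #include <stdint.h>
    #include <cuda_runtime.h>

    // Illustrative stand-in for the m7_*_gpu_hash_120 kernels above
    // (placeholder body, one output word per thread).
    __global__ void example_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
    {
        int thread = blockDim.x * blockIdx.x + threadIdx.x;
        if (thread < threads)
            outputHash[thread] = startNounce + thread; // placeholder work
    }

    // Host wrapper following the same launch pattern as the m7_*_cpu_hash_120
    // functions in this patch: per-kernel block size, grid rounded up from the
    // thread count, no dynamic shared memory, then synchronize.
    __host__ void example_cpu_hash_120(int threads, uint32_t startNounce, uint64_t *d_outputHash)
    {
        const int threadsperblock = 256;

        dim3 grid((threads + threadsperblock - 1) / threadsperblock);
        dim3 block(threadsperblock);
        size_t shared_size = 0;

        example_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);

        // The real code calls MyStreamSynchronize(NULL, order, thr_id) here.
        cudaDeviceSynchronize();
    }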