
m7: forgot to indent some cuda files

and remove unused variables...
Tanguy Pruvot, 10 years ago
parent
commit fcd381cda2
  1. m7/cuda_m7_sha256.cu (518)
  2. m7/cuda_mul2.cu (479)
  3. m7/cuda_ripemd160.cu (172)
  4. m7/cuda_tiger192.cu (141)
  5. m7/m7.cu (40)
  6. m7/m7_keccak512.cu (54)

518
m7/cuda_m7_sha256.cu

@ -1,10 +1,10 @@
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include "cuda_helper.h"
#include "sph/sph_types.h" #include "sph/sph_types.h"
#include "cuda_helper.h"
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
//#define SPH_C64(x) ((uint64_t)(x ## ULL)) //#define SPH_C64(x) ((uint64_t)(x ## ULL))
@ -67,28 +67,35 @@ static const uint32_t cpu_K[64] = {
}; };
static __device__ __forceinline__ uint32_t bsg2_0(uint32_t x) __device__ __forceinline__
static uint32_t bsg2_0(uint32_t x)
{ {
uint32_t r1 = SPH_ROTR32(x,2); uint32_t r1 = SPH_ROTR32(x,2);
uint32_t r2 = SPH_ROTR32(x,13); uint32_t r2 = SPH_ROTR32(x,13);
uint32_t r3 = SPH_ROTR32(x,22); uint32_t r3 = SPH_ROTR32(x,22);
return xor3b(r1,r2,r3); return xor3b(r1,r2,r3);
} }
static __device__ __forceinline__ uint32_t bsg2_1(uint32_t x)
__device__ __forceinline__
static uint32_t bsg2_1(uint32_t x)
{ {
uint32_t r1 = SPH_ROTR32(x,6); uint32_t r1 = SPH_ROTR32(x,6);
uint32_t r2 = SPH_ROTR32(x,11); uint32_t r2 = SPH_ROTR32(x,11);
uint32_t r3 = SPH_ROTR32(x,25); uint32_t r3 = SPH_ROTR32(x,25);
return xor3b(r1,r2,r3); return xor3b(r1,r2,r3);
} }
static __device__ __forceinline__ uint32_t ssg2_0(uint32_t x)
__device__ __forceinline__
static uint32_t ssg2_0(uint32_t x)
{ {
uint64_t r1 = SPH_ROTR32(x,7); uint64_t r1 = SPH_ROTR32(x,7);
uint64_t r2 = SPH_ROTR32(x,18); uint64_t r2 = SPH_ROTR32(x,18);
uint64_t r3 = shr_t32(x,3); uint64_t r3 = shr_t32(x,3);
return xor3b(r1,r2,r3); return xor3b(r1,r2,r3);
} }
static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x)
__device__ __forceinline__
static uint32_t ssg2_1(uint32_t x)
{ {
uint64_t r1 = SPH_ROTR32(x,17); uint64_t r1 = SPH_ROTR32(x,17);
uint64_t r2 = SPH_ROTR32(x,19); uint64_t r2 = SPH_ROTR32(x,19);
@ -96,133 +103,130 @@ static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x)
return xor3b(r1,r2,r3); return xor3b(r1,r2,r3);
} }
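For reference, bsg2_0/bsg2_1 and ssg2_0/ssg2_1 are the standard SHA-256 Sigma/sigma functions from FIPS 180-4. A minimal host-side sketch (assuming only <stdint.h> and a local rotate helper) is shown below; note the two small-sigma device functions above keep their intermediates in uint64_t, although 32-bit values are sufficient:

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32u - n)); }

    /* state ("big Sigma") functions */
    static inline uint32_t big_sigma0(uint32_t x) { return rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22); }
    static inline uint32_t big_sigma1(uint32_t x) { return rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25); }
    /* message-schedule ("small sigma") functions */
    static inline uint32_t small_sigma0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
    static inline uint32_t small_sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }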
static __device__ __forceinline__ void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, __device__ __forceinline__
uint32_t in,const uint32_t Kshared) static void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t in,const uint32_t Kshared)
{ {
uint32_t t1,t2; uint32_t t1,t2;
uint32_t vxandx = xandx(e, f, g); uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 =bsg2_1(e); uint32_t bsg21 =bsg2_1(e);
uint32_t bsg20 =bsg2_0(a); uint32_t bsg20 =bsg2_0(a);
uint32_t andorv =andor32(a,b,c); uint32_t andorv =andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in; t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv; t2 = bsg20 + andorv;
d = d + t1; d = d + t1;
h = t1 + t2; h = t1 + t2;
} }
static __forceinline__ void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, __host__ __forceinline__
uint32_t in,const uint32_t Kshared) static void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t in,const uint32_t Kshared)
{ {
uint32_t t1,t2;
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t t1,t2; uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); t1 = h + bsg21 + vxandx + Kshared + in;
uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); t2 = bsg20 + andorv;
d = d + t1;
t1 = h + bsg21 + vxandx + Kshared + in; h = t1 + t2;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
} }
static __device__ __forceinline__ void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, __device__ __forceinline__
uint32_t* in,uint32_t pc,const uint32_t Kshared) static void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t* in,uint32_t pc,const uint32_t Kshared)
{ {
uint32_t t1,t2; uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF; int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF; int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF; int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc]; uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1]; uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2]; uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3]; uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ssg2_1(inx1); uint32_t ssg21 = ssg2_1(inx1);
uint32_t ssg20 = ssg2_0(inx3); uint32_t ssg20 = ssg2_0(inx3);
uint32_t vxandx = xandx(e, f, g); uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 =bsg2_1(e); uint32_t bsg21 =bsg2_1(e);
uint32_t bsg20 =bsg2_0(a); uint32_t bsg20 =bsg2_0(a);
uint32_t andorv =andor32(a,b,c); uint32_t andorv =andor32(a,b,c);
in[pc] = ssg21+inx2+ssg20+inx0; in[pc] = ssg21+inx2+ssg20+inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc]; t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv; t2 = bsg20 + andorv;
d = d + t1; d = d + t1;
h = t1 + t2; h = t1 + t2;
} }
static __forceinline__ void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, __host__ __forceinline__
uint32_t* in,uint32_t pc,const uint32_t Kshared) static void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t* in,uint32_t pc,const uint32_t Kshared)
{ {
uint32_t t1,t2; uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF; int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF; int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF; int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc]; uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1]; uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2]; uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3]; uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
in[pc] = ssg21+inx2+ssg20+inx0;
in[pc] = ssg21+inx2+ssg20+inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t1 = h + bsg21 + vxandx + Kshared + in[pc]; t2 = bsg20 + andorv;
t2 = bsg20 + andorv; d = d + t1;
d = d + t1; h = t1 + t2;
h = t1 + t2;
} }
static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared) __device__ __forceinline__
static void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
{ {
uint32_t a=r[0];
uint32_t b=r[1];
uint32_t a=r[0]; uint32_t c=r[2];
uint32_t b=r[1]; uint32_t d=r[3];
uint32_t c=r[2]; uint32_t e=r[4];
uint32_t d=r[3]; uint32_t f=r[5];
uint32_t e=r[4]; uint32_t g=r[6];
uint32_t f=r[5]; uint32_t h=r[7];
uint32_t g=r[6];
uint32_t h=r[7]; sha2_step1(a,b,c,d,e,f,g,h,in[0],Kshared[0]);
sha2_step1(h,a,b,c,d,e,f,g,in[1],Kshared[1]);
sha2_step1(a,b,c,d,e,f,g,h,in[0],Kshared[0]); sha2_step1(g,h,a,b,c,d,e,f,in[2],Kshared[2]);
sha2_step1(h,a,b,c,d,e,f,g,in[1],Kshared[1]); sha2_step1(f,g,h,a,b,c,d,e,in[3],Kshared[3]);
sha2_step1(g,h,a,b,c,d,e,f,in[2],Kshared[2]); sha2_step1(e,f,g,h,a,b,c,d,in[4],Kshared[4]);
sha2_step1(f,g,h,a,b,c,d,e,in[3],Kshared[3]); sha2_step1(d,e,f,g,h,a,b,c,in[5],Kshared[5]);
sha2_step1(e,f,g,h,a,b,c,d,in[4],Kshared[4]); sha2_step1(c,d,e,f,g,h,a,b,in[6],Kshared[6]);
sha2_step1(d,e,f,g,h,a,b,c,in[5],Kshared[5]); sha2_step1(b,c,d,e,f,g,h,a,in[7],Kshared[7]);
sha2_step1(c,d,e,f,g,h,a,b,in[6],Kshared[6]); sha2_step1(a,b,c,d,e,f,g,h,in[8],Kshared[8]);
sha2_step1(b,c,d,e,f,g,h,a,in[7],Kshared[7]); sha2_step1(h,a,b,c,d,e,f,g,in[9],Kshared[9]);
sha2_step1(a,b,c,d,e,f,g,h,in[8],Kshared[8]); sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
sha2_step1(h,a,b,c,d,e,f,g,in[9],Kshared[9]); sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]); sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]); sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]); sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]); sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]); #pragma unroll 3
for (int i=0;i<3;i++)
#pragma unroll 3 {
for (int i=0;i<3;i++) {
sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
@ -239,54 +243,50 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
} r[0] = r[0] + a;
r[1] = r[1] + b;
r[2] = r[2] + c;
r[3] = r[3] + d;
r[0] = r[0] + a; r[4] = r[4] + e;
r[1] = r[1] + b; r[5] = r[5] + f;
r[2] = r[2] + c; r[6] = r[6] + g;
r[3] = r[3] + d; r[7] = r[7] + h;
r[4] = r[4] + e;
r[5] = r[5] + f;
r[6] = r[6] + g;
r[7] = r[7] + h;
} }
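The round body above runs 16 sha2_step1 rounds on the raw block followed by 48 sha2_step2 rounds; sha2_step2 expands the message schedule in place over a 16-word ring buffer (the (pc-2)/(pc-7)/(pc-15) & 0xF indices). That is equivalent to the textbook 64-word expansion, sketched here with the small_sigma helpers assumed earlier:

    /* w[0..15] holds the message block; the kernel instead overwrites the
       16-entry buffer in place to save registers. */
    uint32_t w[64];
    for (int t = 16; t < 64; t++)
        w[t] = small_sigma1(w[t - 2]) + w[t - 7] + small_sigma0(w[t - 15]) + w[t - 16];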
static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared) __forceinline__
static void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
{ {
uint32_t a=r[0];
uint32_t b=r[1];
uint32_t a=r[0]; uint32_t c=r[2];
uint32_t b=r[1]; uint32_t d=r[3];
uint32_t c=r[2]; uint32_t e=r[4];
uint32_t d=r[3]; uint32_t f=r[5];
uint32_t e=r[4]; uint32_t g=r[6];
uint32_t f=r[5]; uint32_t h=r[7];
uint32_t g=r[6];
uint32_t h=r[7]; sha2_step1_host(a,b,c,d,e,f,g,h,in[0],Kshared[0]);
sha2_step1_host(h,a,b,c,d,e,f,g,in[1],Kshared[1]);
sha2_step1_host(a,b,c,d,e,f,g,h,in[0],Kshared[0]); sha2_step1_host(g,h,a,b,c,d,e,f,in[2],Kshared[2]);
sha2_step1_host(h,a,b,c,d,e,f,g,in[1],Kshared[1]); sha2_step1_host(f,g,h,a,b,c,d,e,in[3],Kshared[3]);
sha2_step1_host(g,h,a,b,c,d,e,f,in[2],Kshared[2]); sha2_step1_host(e,f,g,h,a,b,c,d,in[4],Kshared[4]);
sha2_step1_host(f,g,h,a,b,c,d,e,in[3],Kshared[3]); sha2_step1_host(d,e,f,g,h,a,b,c,in[5],Kshared[5]);
sha2_step1_host(e,f,g,h,a,b,c,d,in[4],Kshared[4]); sha2_step1_host(c,d,e,f,g,h,a,b,in[6],Kshared[6]);
sha2_step1_host(d,e,f,g,h,a,b,c,in[5],Kshared[5]); sha2_step1_host(b,c,d,e,f,g,h,a,in[7],Kshared[7]);
sha2_step1_host(c,d,e,f,g,h,a,b,in[6],Kshared[6]); sha2_step1_host(a,b,c,d,e,f,g,h,in[8],Kshared[8]);
sha2_step1_host(b,c,d,e,f,g,h,a,in[7],Kshared[7]); sha2_step1_host(h,a,b,c,d,e,f,g,in[9],Kshared[9]);
sha2_step1_host(a,b,c,d,e,f,g,h,in[8],Kshared[8]); sha2_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
sha2_step1_host(h,a,b,c,d,e,f,g,in[9],Kshared[9]); sha2_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
sha2_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]); sha2_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
sha2_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]); sha2_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
sha2_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]); sha2_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
sha2_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]); sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
sha2_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]); #pragma unroll 3
for (int i=0;i<3;i++)
#pragma unroll 3 {
for (int i=0;i<3;i++) {
sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
@ -303,100 +303,64 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
} r[0] = r[0] + a;
r[1] = r[1] + b;
r[0] = r[0] + a; r[2] = r[2] + c;
r[1] = r[1] + b; r[3] = r[3] + d;
r[2] = r[2] + c; r[4] = r[4] + e;
r[3] = r[3] + d; r[5] = r[5] + f;
r[4] = r[4] + e; r[6] = r[6] + g;
r[5] = r[5] + f; r[7] = r[7] + h;
r[6] = r[6] + g;
r[7] = r[7] + h;
} }
__global__
__global__ void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{ {
/* int thread = (blockDim.x * blockIdx.x + threadIdx.x);
__shared__ uint32_t Kshared[64]; if (thread < threads)
if (threadIdx.x < 64) { {
Kshared[threadIdx.x]=K[threadIdx.x];
}
__syncthreads();
*/
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
//uint32_t buf[8];
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread ; // original implementation uint32_t nounce = startNounce + thread ; // original implementation
uint32_t buf[8]; uint32_t buf[8];
uint32_t in2[16]={0}; uint32_t in2[16]={0};
uint32_t in3[16]={0}; uint32_t in3[16]={0};
#pragma unroll 13 #pragma unroll 13
for (int i=0;i<13;i++) {in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);} for (int i=0; i<13; i++)
in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);
in2[13]=cuda_swab32(nounce); in2[13]=cuda_swab32(nounce);
in2[14]=cuda_swab32(c_PaddedMessage80[30]); in2[14]=cuda_swab32(c_PaddedMessage80[30]);
in3[15]=0x3d0; in3[15]=0x3d0;
#pragma unroll 8
for (int i=0;i<8;i++) {buf[i]= pbuf[i];}
sha2_round_body(in2,buf,K);
sha2_round_body(in3,buf,K);
//#pragma unroll 8
//for (int i=0;i<8;i++) {hash.h4[i]=cuda_swab32(buf[i]);}
#pragma unroll 4
for (int i=0;i<4;i++) {outputHash[i*threads+thread]=cuda_swab32ll(((uint64_t*)buf)[i]);}
#pragma unroll 8
for (int i=0; i<8; i++)
buf[i] = pbuf[i];
////////////////////////////////////////////////////////////////////////////////////////////////// sha2_round_body(in2,buf,K);
} // threads sha2_round_body(in3,buf,K);
#pragma unroll 4
for (int i=0; i<4; i++) {
outputHash[i*threads+thread] = cuda_swab32ll(((uint64_t*)buf)[i]);
}
} // thread
} }
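For orientation, the constants in this kernel follow directly from the 122-byte M7 header and standard SHA-256 padding (offsets taken from the code above):

    /* SHA-256 view of the 122-byte header:
     *   block 0: bytes   0..63   compressed once on the host, midstate uploaded to pbuf
     *   block 1: bytes  64..121 plus the 0x80 pad byte (in2); the nonce occupies bytes 116..119 (in2[13])
     *   block 2: all-zero words with the 64-bit bit length in the last word (in3[15])
     * The length field is 122 * 8 = 976 = 0x3d0 bits, and it spills into a third
     * block because 122 + 1 + 8 = 131 bytes do not fit in two 64-byte blocks.
     */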
__global__
__global__ void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce) void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce)
{ {
/* int thread = (blockDim.x * blockIdx.x + threadIdx.x);
__shared__ uint32_t Kshared[64]; if (thread < threads)
if (threadIdx.x < 64) { {
Kshared[threadIdx.x]=K[threadIdx.x]; uint32_t in[16],buf[8];
}
__syncthreads();
*/
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
union {
uint8_t h1[304];
uint32_t h4[76];
uint64_t h8[38];
} hash;
uint32_t in[16],buf[8];
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);} for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);}
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {buf[i] = H256[i];} for (int i=0;i<8;i++) {buf[i] = H256[i];}
sha2_round_body(in,buf,K); sha2_round_body(in,buf,K);
@ -415,96 +379,87 @@ uint64_t h8[38];
#pragma unroll 5 #pragma unroll 5
for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);} for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);}
((uint64_t*)in)[5]= g_hash1[threads*(5+32)+thread];
((uint64_t*)in)[5] = g_hash1[threads*(5+32)+thread];
in[11]=0; in[11]=0;
in[12]=0; in[12]=0;
in[13]=0; in[13]=0;
in[14]=0; in[14]=0;
in[15]=0x968;
in[15]=0x968; int it=0;
int it=0; do {
do { in[15]-=8;
in[15]-=8; it++;
it++; } while (((uint8_t*)in)[44-it]==0);
} while (((uint8_t*)in)[44-it]==0);
((uint8_t*)in)[44-it+1]=0x80;
((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]); ((uint8_t*)in)[44-it+1]=0x80;
sha2_round_body(in,buf,K); ((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]);
uint32_t nounce = startNounce +thread; sha2_round_body(in,buf,K);
bool rc = false;
uint32_t nounce = startNounce +thread;
bool rc = false;
#pragma unroll 4 #pragma unroll 4
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
{ {
if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) { if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) {
if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i]) {rc = true;} else {rc = false;} if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i])
// if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;} rc = true;
else
rc = false;
//if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;}
} }
} }
if (rc && resNounce[0] > nounce)
if(rc == true) resNounce[0] = nounce;
{ } // thread
if(resNounce[0] > nounce)
resNounce[0] = nounce;
}
////
} // threads
} }
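The target check in the loop above lets the highest-index differing 64-bit limb decide whether the byte-swapped digest is below the target. Written as an explicit top-down comparison with the same semantics (hypothetical helper name), it would read:

    __device__ bool hash_below_target(const uint64_t *h, const uint64_t *t)
    {
        for (int i = 3; i >= 0; i--) {      // scan from the highest limb down
            if (h[i] != t[i])
                return h[i] < t[i];         // the highest differing limb decides
        }
        return false;                       // an equal hash does not count as "below"
    }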
__host__
void m7_sha256_cpu_init(int thr_id, int threads)
__host__ void m7_sha256_cpu_init(int thr_id, int threads)
{ {
// copy the hash tables into GPU memory
cudaMemcpyToSymbol( H256,cpu_H256,sizeof(cpu_H256),0, cudaMemcpyHostToDevice ); cudaMemcpyToSymbol( H256,cpu_H256,sizeof(cpu_H256),0, cudaMemcpyHostToDevice );
cudaMemcpyToSymbol( K,cpu_K,sizeof(cpu_K),0, cudaMemcpyHostToDevice ); cudaMemcpyToSymbol( K,cpu_K,sizeof(cpu_K),0, cudaMemcpyHostToDevice );
cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t)); cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t));
cudaMallocHost(&d_mnounce[thr_id], 1*sizeof(uint32_t)); cudaMallocHost(&d_mnounce[thr_id], 1*sizeof(uint32_t));
} }
__host__
__host__ uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order) uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order)
{ {
const int threadsperblock = 384;
uint32_t result = 0xffffffff; uint32_t result = 0xffffffff;
cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 384; // alignment with mixtab size. DO NOT CHANGE
cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
size_t shared_size = 0; size_t shared_size = 0;
m7_sha256_gpu_hash_300<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]); m7_sha256_gpu_hash_300<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]);
cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
result = *d_mnounce[thr_id]; result = *d_mnounce[thr_id];
return result; return result;
} }
__host__
__host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
const int threadsperblock = 512;
const int threadsperblock = 512; // alignment with mixtab size. DO NOT CHANGE
// compute how many thread blocks we need
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// dim3 grid(1);
// dim3 block(1);
size_t shared_size = 0; size_t shared_size = 0;
m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
@ -512,7 +467,8 @@ __host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNoun
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }
__host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful __host__
void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
uint8_t ending =0x80; uint8_t ending =0x80;
@ -527,6 +483,6 @@ __host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not use
uint32_t in[16],buf[8]; uint32_t in[16],buf[8];
for (int i=0;i<16;i++) {in[i]= host_swab32(alt_data[i]);} for (int i=0;i<16;i++) {in[i]= host_swab32(alt_data[i]);}
for (int i=0;i<8;i++) {buf[i]= cpu_H256[i];} for (int i=0;i<8;i++) {buf[i]= cpu_H256[i];}
sha2_round_body_host(in,buf,cpu_K); sha2_round_body_host(in,buf,cpu_K);
cudaMemcpyToSymbol( pbuf, buf, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( pbuf, buf, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
} }

479
m7/cuda_mul2.cu

@ -52,29 +52,29 @@ typedef struct t4_t{
} t4_t; } t4_t;
__device__ __forceinline__ __device__ __forceinline__
ulonglong2 umul64wide (unsigned long long int a, ulonglong2 umul64wide(unsigned long long int a,
unsigned long long int b) unsigned long long int b)
{ {
ulonglong2 res; ulonglong2 res;
asm ("{\n\t" asm ("{\n\t"
".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t" ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t"
"mov.b64 {alo,ahi}, %2; \n\t" "mov.b64 {alo,ahi}, %2; \n\t"
"mov.b64 {blo,bhi}, %3; \n\t" "mov.b64 {blo,bhi}, %3; \n\t"
"mul.lo.u32 r0, alo, blo; \n\t" "mul.lo.u32 r0, alo, blo; \n\t"
"mul.hi.u32 r1, alo, blo; \n\t" "mul.hi.u32 r1, alo, blo; \n\t"
"mad.lo.cc.u32 r1, alo, bhi, r1;\n\t" "mad.lo.cc.u32 r1, alo, bhi, r1;\n\t"
"madc.hi.u32 r2, alo, bhi, 0;\n\t" "madc.hi.u32 r2, alo, bhi, 0;\n\t"
"mad.lo.cc.u32 r1, ahi, blo, r1;\n\t" "mad.lo.cc.u32 r1, ahi, blo, r1;\n\t"
"madc.hi.cc.u32 r2, ahi, blo, r2;\n\t" "madc.hi.cc.u32 r2, ahi, blo, r2;\n\t"
"madc.hi.u32 r3, ahi, bhi, 0;\n\t" "madc.hi.u32 r3, ahi, bhi, 0;\n\t"
"mad.lo.cc.u32 r2, ahi, bhi, r2;\n\t" "mad.lo.cc.u32 r2, ahi, bhi, r2;\n\t"
"addc.u32 r3, r3, 0; \n\t" "addc.u32 r3, r3, 0; \n\t"
"mov.b64 %0, {r0,r1}; \n\t" "mov.b64 %0, {r0,r1}; \n\t"
"mov.b64 %1, {r2,r3}; \n\t" "mov.b64 %1, {r2,r3}; \n\t"
"}" "}"
: "=l"(res.x), "=l"(res.y) : "=l"(res.x), "=l"(res.y)
: "l"(a), "l"(b)); : "l"(a), "l"(b));
return res; return res;
} }
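umul64wide computes the full 128-bit product of two 64-bit operands via 32-bit PTX partial products. A functionally equivalent sketch using the CUDA __umul64hi intrinsic (the generated instruction mix may differ) would be:

    __device__ __forceinline__
    ulonglong2 umul64wide_c(unsigned long long a, unsigned long long b)
    {
        ulonglong2 res;
        res.x = a * b;              // low 64 bits of the 128-bit product
        res.y = __umul64hi(a, b);   // high 64 bits
        return res;
    }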
#define umul_ppmm(h,l,m,n) \ #define umul_ppmm(h,l,m,n) \
@ -85,75 +85,76 @@ ulonglong2 umul64wide (unsigned long long int a,
} }
__device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n) __device__ __forceinline__
void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n)
{ {
asm ("{\n\t" asm ("{\n\t"
".reg .u32 o0, o1, o2, o3, o4; \n\t" ".reg .u32 o0, o1, o2, o3, o4; \n\t"
".reg .u32 o5, o6, o7, i8, i9; \n\t" ".reg .u32 o5, o6, o7, i8, i9; \n\t"
".reg .u32 i10, i11, i12, i13; \n\t" ".reg .u32 i10, i11, i12, i13; \n\t"
".reg .u32 i14, i15, i16, i17; \n\t" ".reg .u32 i14, i15, i16, i17; \n\t"
".reg .u32 i18, i19, i20, i21; \n\t" ".reg .u32 i18, i19, i20, i21; \n\t"
".reg .u32 i22, i23; \n\t" ".reg .u32 i22, i23; \n\t"
"mov.b64 { i8, i9}, %4; \n\t" "mov.b64 { i8, i9}, %4; \n\t"
"mov.b64 {i10,i11}, %5; \n\t" "mov.b64 {i10,i11}, %5; \n\t"
"mov.b64 {i12,i13}, %6; \n\t" "mov.b64 {i12,i13}, %6; \n\t"
"mov.b64 {i14,i15}, %7; \n\t" "mov.b64 {i14,i15}, %7; \n\t"
"mov.b64 {i16,i17}, %8; \n\t" "mov.b64 {i16,i17}, %8; \n\t"
"mov.b64 {i18,i19}, %9; \n\t" "mov.b64 {i18,i19}, %9; \n\t"
"mov.b64 {i20,i21},%10; \n\t" "mov.b64 {i20,i21},%10; \n\t"
"mov.b64 {i22,i23},%11; \n\t" "mov.b64 {i22,i23},%11; \n\t"
"mul.lo.u32 o0, i8, i16; \n\t" "mul.lo.u32 o0, i8, i16; \n\t"
"mul.hi.u32 o1, i8, i16; \n\t" "mul.hi.u32 o1, i8, i16; \n\t"
"mad.lo.cc.u32 o1, i8, i17, o1;\n\t" "mad.lo.cc.u32 o1, i8, i17, o1;\n\t"
"madc.hi.u32 o2, i8, i17, 0;\n\t" "madc.hi.u32 o2, i8, i17, 0;\n\t"
"mad.lo.cc.u32 o1, i9, i16, o1;\n\t" "mad.lo.cc.u32 o1, i9, i16, o1;\n\t"
"madc.hi.cc.u32 o2, i9, i16, o2;\n\t" "madc.hi.cc.u32 o2, i9, i16, o2;\n\t"
"madc.hi.u32 o3, i8, i18, 0;\n\t" "madc.hi.u32 o3, i8, i18, 0;\n\t"
"mad.lo.cc.u32 o2, i8, i18, o2;\n\t" "mad.lo.cc.u32 o2, i8, i18, o2;\n\t"
"madc.hi.cc.u32 o3, i9, i17, o3;\n\t" "madc.hi.cc.u32 o3, i9, i17, o3;\n\t"
"madc.hi.u32 o4, i8, i19, 0;\n\t" "madc.hi.u32 o4, i8, i19, 0;\n\t"
"mad.lo.cc.u32 o2, i9, i17, o2;\n\t" "mad.lo.cc.u32 o2, i9, i17, o2;\n\t"
"madc.hi.cc.u32 o3, i10, i16, o3;\n\t" "madc.hi.cc.u32 o3, i10, i16, o3;\n\t"
"madc.hi.cc.u32 o4, i9, i18, o4;\n\t" "madc.hi.cc.u32 o4, i9, i18, o4;\n\t"
"addc.u32 o5, 0, 0;\n\t" "addc.u32 o5, 0, 0;\n\t"
"mad.lo.cc.u32 o2, i10, i16, o2;\n\t" "mad.lo.cc.u32 o2, i10, i16, o2;\n\t"
"madc.lo.cc.u32 o3, i8, i19, o3;\n\t" "madc.lo.cc.u32 o3, i8, i19, o3;\n\t"
"madc.hi.cc.u32 o4, i10, i17, o4;\n\t" "madc.hi.cc.u32 o4, i10, i17, o4;\n\t"
"madc.hi.cc.u32 o5, i9, i19, o5;\n\t" "madc.hi.cc.u32 o5, i9, i19, o5;\n\t"
"addc.u32 o6, 0, 0;\n\t" "addc.u32 o6, 0, 0;\n\t"
"mad.lo.cc.u32 o3, i9, i18, o3;\n\t" "mad.lo.cc.u32 o3, i9, i18, o3;\n\t"
"madc.hi.cc.u32 o4, i11, i16, o4;\n\t" "madc.hi.cc.u32 o4, i11, i16, o4;\n\t"
"madc.hi.cc.u32 o5, i10, i18, o5;\n\t" "madc.hi.cc.u32 o5, i10, i18, o5;\n\t"
"addc.u32 o6, 0, o6;\n\t" "addc.u32 o6, 0, o6;\n\t"
"mad.lo.cc.u32 o3, i10, i17, o3;\n\t" "mad.lo.cc.u32 o3, i10, i17, o3;\n\t"
"addc.u32 o4, 0, o4;\n\t" "addc.u32 o4, 0, o4;\n\t"
"mad.hi.cc.u32 o5, i11, i17, o5;\n\t" "mad.hi.cc.u32 o5, i11, i17, o5;\n\t"
"madc.hi.cc.u32 o6, i10, i19, o6;\n\t" "madc.hi.cc.u32 o6, i10, i19, o6;\n\t"
"addc.u32 o7, 0, 0;\n\t" "addc.u32 o7, 0, 0;\n\t"
"mad.lo.cc.u32 o3, i11, i16, o3;\n\t" "mad.lo.cc.u32 o3, i11, i16, o3;\n\t"
"madc.lo.cc.u32 o4, i9, i19, o4;\n\t" "madc.lo.cc.u32 o4, i9, i19, o4;\n\t"
"addc.u32 o5, 0, o5;\n\t" "addc.u32 o5, 0, o5;\n\t"
"mad.hi.cc.u32 o6, i11, i18, o6;\n\t" "mad.hi.cc.u32 o6, i11, i18, o6;\n\t"
"addc.u32 o7, 0, o7;\n\t" "addc.u32 o7, 0, o7;\n\t"
"mad.lo.cc.u32 o4, i10, i18, o4;\n\t" "mad.lo.cc.u32 o4, i10, i18, o4;\n\t"
"addc.u32 o5, 0, o5;\n\t" "addc.u32 o5, 0, o5;\n\t"
"mad.hi.u32 o7, i11, i19, o7;\n\t" "mad.hi.u32 o7, i11, i19, o7;\n\t"
"mad.lo.cc.u32 o4, i11, i17, o4;\n\t" "mad.lo.cc.u32 o4, i11, i17, o4;\n\t"
"addc.u32 o5, 0, o5;\n\t" "addc.u32 o5, 0, o5;\n\t"
"mad.lo.cc.u32 o5, i10, i19, o5;\n\t" "mad.lo.cc.u32 o5, i10, i19, o5;\n\t"
"addc.u32 o6, 0, o6;\n\t" "addc.u32 o6, 0, o6;\n\t"
"mad.lo.cc.u32 o5, i11, i18, o5;\n\t" "mad.lo.cc.u32 o5, i11, i18, o5;\n\t"
"addc.u32 o6, 0, o6;\n\t" "addc.u32 o6, 0, o6;\n\t"
"mad.lo.cc.u32 o6, i11, i19, o6;\n\t" "mad.lo.cc.u32 o6, i11, i19, o6;\n\t"
"addc.u32 o7, 0, o7;\n\t" "addc.u32 o7, 0, o7;\n\t"
"mov.b64 %0, {o0,o1}; \n\t" "mov.b64 %0, {o0,o1}; \n\t"
"mov.b64 %1, {o2,o3}; \n\t" "mov.b64 %1, {o2,o3}; \n\t"
"mov.b64 %2, {o4,o5}; \n\t" "mov.b64 %2, {o4,o5}; \n\t"
"mov.b64 %3, {o6,o7}; \n\t" "mov.b64 %3, {o6,o7}; \n\t"
"}" "}"
: "=l"(l->low), "=l"(l->high), "=l"(h->low), "=l"(h->high) : "=l"(l->low), "=l"(l->high), "=l"(h->low), "=l"(h->high)
: "l"(m.low), "l"(m.high), "l"(0ULL), "l"(0ULL), : "l"(m.low), "l"(m.high), "l"(0ULL), "l"(0ULL),
"l"(n.low), "l"(n.high), "l"(0ULL), "l"(0ULL)); "l"(n.low), "l"(n.high), "l"(0ULL), "l"(0ULL));
} }
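umul_ppmmT4 is a 128 x 128 -> 256-bit multiply over the two 64-bit halves of t4_t, hand-scheduled in 32-bit PTX limbs. The same result can be expressed with the 64-bit helper above; this sketch (hypothetical _c suffix) is for readability rather than a drop-in replacement for the tuned asm:

    __device__ __forceinline__
    void umul_ppmmT4_c(t4_t *h, t4_t *l, t4_t m, t4_t n)
    {
        ulonglong2 p0 = umul64wide(m.low,  n.low);   // bits   0..127
        ulonglong2 p1 = umul64wide(m.low,  n.high);  // bits  64..191
        ulonglong2 p2 = umul64wide(m.high, n.low);   // bits  64..191
        ulonglong2 p3 = umul64wide(m.high, n.high);  // bits 128..255

        uint64_t r1 = p0.y, c1 = 0;
        r1 += p1.x; c1 += (r1 < p1.x);
        r1 += p2.x; c1 += (r1 < p2.x);

        uint64_t r2 = p1.y, c2 = 0;
        r2 += p2.y; c2 += (r2 < p2.y);
        r2 += p3.x; c2 += (r2 < p3.x);
        r2 += c1;   c2 += (r2 < c1);

        l->low  = p0.x;  l->high = r1;
        h->low  = r2;    h->high = p3.y + c2;   // the full product fits in 256 bits, no carry out
    }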
#if 0 #if 0
@ -187,55 +188,60 @@ __device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n){
#endif #endif
__device__ __forceinline__ t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){ __device__ __forceinline__
t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){
t4_t ret; t4_t ret;
ret.high = g[(idx*2 + 1)*threads + thread]; ret.high = g[(idx*2 + 1)*threads + thread];
ret.low = g[(idx*2)*threads + thread]; ret.low = g[(idx*2)*threads + thread];
if(thread==0){ if(thread==0){
// cuPrintf("Load Idx: %d %8.8X %8.8X %8.8X %8.8X\n", idx, ret.high>>32, ret.high, ret.low>>32, ret.low); // cuPrintf("Load Idx: %d %8.8X %8.8X %8.8X %8.8X\n", idx, ret.high>>32, ret.high, ret.low>>32, ret.low);
} }
return ret; return ret;
} }
__device__ __forceinline__ void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){ __device__ __forceinline__
void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){
g[(idx*2 + 1)*threads + thread]=val.high; g[(idx*2 + 1)*threads + thread]=val.high;
g[(idx*2)*threads + thread]=val.low; g[(idx*2)*threads + thread]=val.low;
if(thread==0){ if(thread==0){
// cuPrintf("Store Idx: %d %8.8X %8.8X %8.8X %8.8X\n", idx, val.high>>32, val.high, val.low>>32, val.low); // cuPrintf("Store Idx: %d %8.8X %8.8X %8.8X %8.8X\n", idx, val.high>>32, val.high, val.low>>32, val.low);
} }
} }
__device__ __forceinline__ void T4_set(t4_t *d, uint64_t v){ __device__ __forceinline__
void T4_set(t4_t *d, uint64_t v){
d->high = 0; d->high = 0;
d->low = v; d->low = v;
} }
__device__ __forceinline__ t4_t T4_add(t4_t a, t4_t b){ __device__ __forceinline__
t4_t T4_add(t4_t a, t4_t b){
t4_t ret; t4_t ret;
uint32_t c=0; uint32_t c=0;
ret.low = a.low + b.low; ret.low = a.low + b.low;
if(ret.low < a.low) if(ret.low < a.low)
c=1; c=1;
ret.high = a.high + b.high + c; ret.high = a.high + b.high + c;
return ret; return ret;
} }
__device__ __forceinline__ t4_t T4_add(uint64_t a, t4_t b){ __device__ __forceinline__
t4_t T4_add(uint64_t a, t4_t b){
t4_t ret; t4_t ret;
uint32_t c=0; uint32_t c=0;
ret.low = a + b.low; ret.low = a + b.low;
if(ret.low < a) if(ret.low < a)
c=1; c=1;
ret.high = b.high + c; ret.high = b.high + c;
return ret; return ret;
} }
__device__ __forceinline__
__device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){ uint32_t T4_lt(t4_t a, t4_t b){
if(a.high < b.high) if(a.high < b.high)
return 1; return 1;
if(a.high == b.high && a.low < b.low) if(a.high == b.high && a.low < b.low)
@ -243,7 +249,8 @@ __device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){
return 0; return 0;
} }
__device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){ __device__ __forceinline__
uint32_t T4_gt(t4_t a, uint64_t b){
if(a.high) if(a.high)
return 1; return 1;
if(a.low > b) if(a.low > b)
@ -252,217 +259,213 @@ __device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){
} }
__device__ void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){ __device__
t4_t ul, cl, hpl, lpl; void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){
uint32_t i; t4_t ul, cl, hpl, lpl;
T4_set(&cl,0); uint32_t i;
for(i=0; i < len; i++) { T4_set(&cl,0);
ul = T4(thread,threads,i,g_v); for(i=0; i < len; i++) {
umul_ppmmT4 (&hpl, &lpl, ul, sml); ul = T4(thread,threads,i,g_v);
umul_ppmmT4 (&hpl, &lpl, ul, sml);
lpl = T4_add(lpl,cl); lpl = T4_add(lpl,cl);
cl = T4_add(T4_lt(lpl,cl),hpl); cl = T4_add(T4_lt(lpl,cl),hpl);
T4_store(thread,threads,i,g_p,lpl); T4_store(thread,threads,i,g_p,lpl);
} }
T4_store(thread,threads,len,g_p,cl); T4_store(thread,threads,len,g_p,cl);
*size = len + T4_gt(cl,0); *size = len + T4_gt(cl,0);
} }
__device__ void mulScalar(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, uint64_t sml, uint32_t *size){ __device__ void mulScalar(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, uint64_t sml, uint32_t *size){
uint64_t ul, cl, hpl, lpl; uint64_t ul, cl, hpl, lpl;
uint32_t i; uint32_t i;
cl = 0; cl = 0;
for(i=0; i < len; i++) { for(i=0; i < len; i++) {
ul = g_v[i*threads + thread]; ul = g_v[i*threads + thread];
umul_ppmm (hpl, lpl, ul, sml); umul_ppmm (hpl, lpl, ul, sml);
lpl += cl; lpl += cl;
cl = (lpl < cl) + hpl; cl = (lpl < cl) + hpl;
g_p[i*threads + thread] = lpl; g_p[i*threads + thread] = lpl;
} }
g_p[len*threads + thread] = cl; g_p[len*threads + thread] = cl;
*size = len + (cl != 0); *size = len + (cl != 0);
} }
uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a){ uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a)
{
uint64_t carry=0; uint64_t carry=0;
uint32_t i; uint32_t i;
uint64_t ul,lpl,hpl,rl; uint64_t ul,lpl,hpl,rl;
for(i=0; i < xsz; i++){ for(i=0; i < xsz; i++)
{
ul = x[i*threads + thread];
umul_ppmm (hpl, lpl, ul, a);
ul = x[i*threads + thread]; lpl += carry;
umul_ppmm (hpl, lpl, ul, a); carry = (lpl < carry) + hpl;
lpl += carry; rl = sum[(i+sofst) * threads + thread];
carry = (lpl < carry) + hpl; lpl = rl + lpl;
carry += lpl < rl;
rl = sum[(i+sofst) * threads + thread]; sum[(i+sofst)*threads + thread] = lpl;
lpl = rl + lpl; }
carry += lpl < rl;
sum[(i+sofst)*threads + thread] = lpl;
}
return carry; return carry;
} }
t4_t __device__ addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a){ __device__
t4_t addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a)
{
t4_t carry; t4_t carry;
uint32_t i; uint32_t i;
t4_t ul,lpl,hpl,rl; t4_t ul,lpl,hpl,rl;
T4_set(&carry,0); T4_set(&carry,0);
for(i=0; i < xsz; i++){ for(i=0; i < xsz; i++)
{
ul = T4(thread,threads,i,x); ul = T4(thread,threads,i,x);
umul_ppmmT4 (&hpl, &lpl, ul, a); umul_ppmmT4 (&hpl, &lpl, ul, a);
lpl = T4_add(lpl,carry); lpl = T4_add(lpl,carry);
carry = T4_add(T4_lt(lpl,carry), hpl); carry = T4_add(T4_lt(lpl,carry), hpl);
rl = T4(thread,threads,i+sofst,sum); rl = T4(thread,threads,i+sofst,sum);
lpl = T4_add(rl,lpl); lpl = T4_add(rl,lpl);
carry = T4_add(T4_lt(lpl,rl),carry); carry = T4_add(T4_lt(lpl,rl),carry);
T4_store(thread,threads,i+sofst,sum,lpl); T4_store(thread,threads,i+sofst,sum,lpl);
} }
return carry; return carry;
} }
__global__
void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
__global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
if(ulegs < vlegs){ if(ulegs < vlegs) {
uint64_t t1=ulegs; uint64_t t1=ulegs;
ulegs = vlegs; ulegs = vlegs;
vlegs = t1; vlegs = t1;
uint64_t *t2 = g_u; uint64_t *t2 = g_u;
g_u = g_v; g_u = g_v;
g_v = t2; g_v = t2;
} }
uint32_t vofst=1,rofst=1,psize=0; uint32_t vofst=1,rofst=1,psize=0;
mulScalar(thread,threads,ulegs,g_p,g_u,g_v[thread],&psize); mulScalar(thread,threads,ulegs,g_p,g_u,g_v[thread],&psize);
#if 1 #if 1
while (vofst < vlegs) {
while (vofst < vlegs) {
//clear high word //TODO: right //clear high word //TODO: right
// printf("Size: %d\n", rp->size[tid]); // printf("Size: %d\n", rp->size[tid]);
g_p[(psize+0)*threads+thread] = 0; g_p[(psize+0)*threads + thread] = 0;
g_p[(ulegs+rofst)*threads + thread] = addmul_1g (thread, threads, g_p ,rofst , g_u, ulegs, g_v[vofst*threads+thread]);
vofst++; rofst++;
psize++;
}
// if(D_REF(rp->d,up->size[tid] + vp->size[tid] - 1,tid) != (uint64_t)0) g_p[(ulegs+rofst)*threads + thread] = addmul_1g (thread, threads, g_p ,rofst , g_u, ulegs, g_v[vofst*threads+thread]);
// rp->size[tid]++;
vofst++; rofst++;
psize++;
}
// if(D_REF(rp->d,up->size[tid] + vp->size[tid] - 1,tid) != (uint64_t)0)
// rp->size[tid]++;
#endif #endif
} }
} }
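gpu_mul is per-thread schoolbook multiplication over strided 64-bit limbs: the first partial product is formed with mulScalar, then each further limb of g_v is accumulated with addmul_1g after clearing the next high limb. The same algorithm on a contiguous array, as a host-side reference (assumes <stdint.h> and a compiler with unsigned __int128):

    static void mpn_mul_ref(uint64_t *p, const uint64_t *u, int ulen,
                            const uint64_t *v, int vlen)
    {
        for (int i = 0; i < ulen + vlen; i++) p[i] = 0;
        for (int j = 0; j < vlen; j++) {
            unsigned __int128 carry = 0;
            for (int i = 0; i < ulen; i++) {
                unsigned __int128 t = (unsigned __int128)u[i] * v[j] + p[i + j] + carry;
                p[i + j] = (uint64_t)t;       // low limb of the running sum
                carry    = t >> 64;           // propagate the high limb
            }
            p[j + ulen] = (uint64_t)carry;
        }
    }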
__global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p) __global__
void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
if(ulegs < vlegs) { // everything written the other way around... are you kidding me ?!
if(ulegs < vlegs){ ///everything written the other way around... are you kidding me ?! uint64_t t1=ulegs;
uint64_t t1=ulegs; ulegs = vlegs;
ulegs = vlegs; vlegs = t1;
vlegs = t1;
uint64_t *t2 = g_u;
g_u = g_v;
g_v = t2;
}
ulegs >>= 1; vlegs >>= 1; uint64_t *t2 = g_u;
g_u = g_v;
g_v = t2;
}
if(thread == 0){ ulegs >>= 1; vlegs >>= 1;
// cuPrintf("U: %d V: %d\n", ulegs, vlegs);
}
if(thread == 0) {
// cuPrintf("U: %d V: %d\n", ulegs, vlegs);
}
uint32_t vofst=1,rofst=1,psize=0;
uint32_t vofst=1,rofst=1,psize=0; mulScalarT4(thread,threads,ulegs,g_p,g_u,T4(thread,threads,0,g_v),&psize);
mulScalarT4(thread,threads,ulegs,g_p,g_u,T4(thread,threads,0,g_v),&psize);
#if 1 #if 1
t4_t zero; t4_t zero;
T4_set(&zero,0); T4_set(&zero,0);
// while (vofst < vlegs) {
// while (vofst < vlegs) { #pragma unroll
for (vofst=1;vofst<vlegs;vofst++)
#pragma unroll {
for (vofst=1;vofst<vlegs;vofst++) { T4_store(thread,threads,psize,g_p,zero);
T4_store(thread,threads,psize,g_p,zero); T4_store(thread,threads,ulegs+rofst,g_p, addmul_1gT4(thread, threads, g_p, rofst, g_u, ulegs, T4(thread,threads,vofst,g_v)));
// vofst++;
T4_store(thread,threads,ulegs+rofst,g_p,addmul_1gT4 (thread, threads, g_p ,rofst , g_u, ulegs,T4(thread,threads,vofst,g_v)));
// vofst++;
rofst++; rofst++;
psize++; psize++;
} }
#endif #endif
} }
} }
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{ {
if (code != cudaSuccess) if (code != cudaSuccess)
{ {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code); if (abort) exit(code);
} }
} }
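gpuErrchk/gpuAssert are defined but not exercised in this hunk; typical usage wraps CUDA runtime calls so failures report file and line (d_buf below is illustrative). Note that __FILE__ is a const char*, so a stricter compiler will want gpuAssert's file parameter declared as const char *:

    uint64_t *d_buf = NULL;
    gpuErrchk(cudaMalloc((void**)&d_buf, 256 * sizeof(uint64_t)));
    gpuErrchk(cudaMemset(d_buf, 0, 256 * sizeof(uint64_t)));
    gpuErrchk(cudaFree(d_buf));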
__host__
__host__ void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p,int order) void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p,int order)
{ {
const int threadsperblock = 512; // alignment with mixtab size. DO NOT CHANGE const int threadsperblock = 512; // alignment with mixtab size. DO NOT CHANGE
// compute how many thread blocks we need // compute how many thread blocks we need
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
size_t shared_size =0; size_t shared_size = 0;
gpu_mul<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
gpu_mul<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
} }
__host__ void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order) __host__
void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order)
{ {
const int threadsperblock = 256; // better occupancy (for both 780 and 750 ti's) const int threadsperblock = 256; // better occupancy (for both 780 and 750 ti's)
// compute how many thread blocks we need // compute how many thread blocks we need
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
size_t shared_size =0; size_t shared_size = 0;
//gpu_mulT4<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
//gpu_mulT4<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
gpu_mulT4<<<grid, block, shared_size>>>(threads, blegs, alegs, g_b, g_a, g_p) ; gpu_mulT4<<<grid, block, shared_size>>>(threads, blegs, alegs, g_b, g_a, g_p) ;
} }
__host__ void mul_init(){ __host__
void mul_init()
{
} }

172
m7/cuda_ripemd160.cu

@ -48,7 +48,7 @@
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
static __constant__ uint32_t gpu_IV[5]; static __constant__ uint32_t gpu_IV[5];
static __constant__ uint32_t bufo[5]; static __constant__ uint32_t bufo[5];
static const uint32_t IV[5] = { static const uint32_t IV[5] = {
@ -282,118 +282,116 @@ static const uint32_t IV[5] = {
(h)[0] = tmp; \ (h)[0] = tmp; \
} }
__global__
__global__ void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) xor3(x,y,z)
#define F2(x, y, z) xandx(x,y,z)
#define F3(x, y, z) xornot64(x,y,z)
#define F4(x, y, z) xandx(z,x,y)
#define F5(x, y, z) xornt64(x,y,z)
uint32_t in2[16],in3[16];
uint32_t in[16],buf[5];
#pragma unroll 16
for (int i=0;i<16;i++) {
if ((i+16) < 29)
in2[i] = c_PaddedMessage80[i+16];
else if ((i+16)==29)
in2[i] = nounce;
else if ((i+16)==30)
in2[i] = c_PaddedMessage80[i+16];
else
in2[i] = 0;
}
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread ;
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) xor3(x,y,z)
#define F2(x, y, z) xandx(x,y,z)
#define F3(x, y, z) xornot64(x,y,z)
#define F4(x, y, z) xandx(z,x,y)
#define F5(x, y, z) xornt64(x,y,z)
uint32_t in2[16],in3[16];
uint32_t in[16],buf[5];
// #pragma unroll 16
// for (int i=0;i<16;i++) {in[i]= c_PaddedMessage80[i];}
#pragma unroll 16
for (int i=0;i<16;i++) {if ((i+16)<29) {in2[i]= c_PaddedMessage80[i+16];}
else if ((i+16)==29) {in2[i]= nounce;}
else if ((i+16)==30) {in2[i]= c_PaddedMessage80[i+16];}
else {in2[i]= 0;}}
#pragma unroll 16 #pragma unroll 16
for (int i=0;i<16;i++) {in3[i]=0;} for (int i=0;i<16;i++)
in3[14]=0x3d0; in3[i]=0;
// #pragma unroll 5 in3[14]=0x3d0;
// for (int i=0;i<5;i++) {buf[i]=gpu_IV[i];}
#pragma unroll 5 #pragma unroll 5
for (int i=0;i<5;i++) {buf[i]=bufo[i];} for (int i=0;i<5;i++)
// RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved) buf[i]=bufo[i];
RIPEMD160_ROUND_BODY(in2, buf);
RIPEMD160_ROUND_BODY(in3, buf); RIPEMD160_ROUND_BODY(in2, buf);
RIPEMD160_ROUND_BODY(in3, buf);
hash.h4[5]=0; hash.h4[5]=0;
#pragma unroll 5 #pragma unroll 5
for (int i=0;i<5;i++) for (int i=0; i<5; i++)
{hash.h4[i]=buf[i]; hash.h4[i]=buf[i];
}
//uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; #pragma unroll 3
//#pragma unroll 3 for (int i=0;i<3;i++) {
//for (int i=0;i<3;i++) {outHash[i]=hash.h8[i];} outputHash[i*threads+thread] = hash.h8[i];
#pragma unroll 3 }
for (int i=0;i<3;i++) {outputHash[i*threads+thread]=hash.h8[i];} }
//#pragma unroll 8
//for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=hash.h8[i];} else {outputHash[i*threads+thread]=0;}}
}
} }
void ripemd160_cpu_init(int thr_id, int threads) void ripemd160_cpu_init(int thr_id, int threads)
{ {
cudaMemcpyToSymbol(gpu_IV,IV,sizeof(IV),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(gpu_IV,IV,sizeof(IV),0, cudaMemcpyHostToDevice);
} }
__host__ void ripemd160_setBlock_120(void *pdata) __host__
void ripemd160_setBlock_120(void *pdata)
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
uint8_t ending =0x80; uint8_t ending =0x80;
memcpy(PaddedMessage, pdata, 122); memcpy(PaddedMessage, pdata, 122);
memset(PaddedMessage+122,ending,1); memset(PaddedMessage+122,ending,1);
memset(PaddedMessage+123, 0, 5); //useless memset(PaddedMessage+123, 0, 5); //useless
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) ((x) ^ (y) ^ (z))
#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define F3(x, y, z) (((x) | ~(y)) ^ (z))
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) ((x) ^ (y) ^ (z))
#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define F3(x, y, z) (((x) | ~(y)) ^ (z))
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
uint32_t* alt_data =(uint32_t*)pdata; uint32_t* alt_data =(uint32_t*)pdata;
uint32_t in[16],buf[5]; uint32_t in[16],buf[5];
for (int i=0;i<16;i++)
in[i]= alt_data[i];
for (int i=0;i<16;i++) {in[i]= alt_data[i];} for (int i=0;i<5;i++)
buf[i]=IV[i];
RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved)
for (int i=0;i<5;i++) {buf[i]=IV[i];}
RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved)
cudaMemcpyToSymbol(bufo, buf, 5*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(bufo, buf, 5*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
} }
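ripemd160_setBlock_120 compresses the nonce-free first 64-byte block once on the host and uploads the resulting 5-word chaining value to bufo, so each GPU thread only processes the two remaining blocks. Per nonce, the kernel above effectively does:

    // buf = bufo;                        // chaining value after block 0 (bytes 0..63, nonce-free)
    // RIPEMD160_ROUND_BODY(in2, buf);    // block 1: bytes 64..121, the nonce, and the 0x80 pad byte
    // RIPEMD160_ROUND_BODY(in3, buf);    // block 2: zero words + bit length 0x3d0 (= 122 * 8) in in3[14]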
__host__ void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) __host__
void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
const int threadsperblock = 256;
const int threadsperblock = 256; // alignment with mixtab size. DO NOT CHANGE dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
//dim3 grid(1);
//dim3 block(1);
size_t shared_size =0; size_t shared_size =0;
m7_ripemd160_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); m7_ripemd160_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);

141
m7/cuda_tiger192.cu

@ -50,11 +50,13 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
__constant__ uint64_t bufo[3]; __constant__ uint64_t bufo[3];
static __constant__ uint64_t gpu_III[3]; static __constant__ uint64_t gpu_III[3];
static __constant__ uint64_t T1[256]; static __constant__ uint64_t T1[256];
static __constant__ uint64_t T2[256]; static __constant__ uint64_t T2[256];
static __constant__ uint64_t T3[256]; static __constant__ uint64_t T3[256];
static __constant__ uint64_t T4[256]; static __constant__ uint64_t T4[256];
static const uint64_t III[3] = { static const uint64_t III[3] = {
SPH_C64(0x0123456789ABCDEF),SPH_C64(0xFEDCBA9876543210),SPH_C64(0xF096A5B4C3B2E187) SPH_C64(0x0123456789ABCDEF),SPH_C64(0xFEDCBA9876543210),SPH_C64(0xF096A5B4C3B2E187)
}; };
@ -583,16 +585,16 @@ static const uint64_t cpu_T4[256] = {
SPH_C64(0xC83223F1720AEF96), SPH_C64(0xC3A0396F7363A51F) SPH_C64(0xC83223F1720AEF96), SPH_C64(0xC3A0396F7363A51F)
}; };
#define PASS(a, b, c, mul) { \ #define PASS(a, b, c, mul) { \
ROUND(a, b, c, X0, mul); \ ROUND(a, b, c, X0, mul); \
ROUND(b, c, a, X1, mul); \ ROUND(b, c, a, X1, mul); \
ROUND(c, a, b, X2, mul); \ ROUND(c, a, b, X2, mul); \
ROUND(a, b, c, X3, mul); \ ROUND(a, b, c, X3, mul); \
ROUND(b, c, a, X4, mul); \ ROUND(b, c, a, X4, mul); \
ROUND(c, a, b, X5, mul); \ ROUND(c, a, b, X5, mul); \
ROUND(a, b, c, X6, mul); \ ROUND(a, b, c, X6, mul); \
ROUND(b, c, a, X7, mul); \ ROUND(b, c, a, X7, mul); \
} }
#define MUL5(x) SPH_T64((x) * SPH_C64(5)) #define MUL5(x) SPH_T64((x) * SPH_C64(5))
#define MUL7(x) SPH_T64((x) * SPH_C64(7)) #define MUL7(x) SPH_T64((x) * SPH_C64(7))
@ -649,29 +651,24 @@ static const uint64_t cpu_T4[256] = {
(r)[2] = SPH_T64(C + (r)[2]); \ (r)[2] = SPH_T64(C + (r)[2]); \
} }
__global__
__global__ void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{ {
__shared__ uint64_t sharedMem[1024];
__shared__ uint64_t sharedMem[1024]; if(threadIdx.x < 256) {
if(threadIdx.x < 256)
{
sharedMem[threadIdx.x] = T1[threadIdx.x]; sharedMem[threadIdx.x] = T1[threadIdx.x];
sharedMem[threadIdx.x+256] = T2[threadIdx.x]; sharedMem[threadIdx.x+256] = T2[threadIdx.x];
sharedMem[threadIdx.x+512] = T3[threadIdx.x]; sharedMem[threadIdx.x+512] = T3[threadIdx.x];
sharedMem[threadIdx.x+768] = T4[threadIdx.x]; sharedMem[threadIdx.x+768] = T4[threadIdx.x];
} }
__syncthreads(); __syncthreads();
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
/* /*
#undef MUL5 #undef MUL5
#undef MUL7 #undef MUL7
@ -680,7 +677,7 @@ uint64_t h8[8];
#define MUL7(x) mul(x,7) #define MUL7(x) mul(x,7)
#define MUL9(x) mul(x,9) #define MUL9(x) mul(x,9)
*/ */
#define PASS(a, b, c, mul) { \ #define PASS(a, b, c, mul) { \
ROUND(a, b, c, X0, mul); \ ROUND(a, b, c, X0, mul); \
ROUND(b, c, a, X1, mul); \ ROUND(b, c, a, X1, mul); \
ROUND(c, a, b, X2, mul); \ ROUND(c, a, b, X2, mul); \
@ -691,89 +688,95 @@ uint64_t h8[8];
ROUND(b, c, a, X7, mul); \ ROUND(b, c, a, X7, mul); \
} }
#define ROUND(a, b, c, x, mul) { \
#define ROUND(a, b, c, x, mul) { \
c ^= x; \ c ^= x; \
a = SPH_T64(a - (sharedMem[c & 0xFF] ^ sharedMem[((c >> 16) & 0xFF)+256] \ a = SPH_T64(a - (sharedMem[c & 0xFF] ^ sharedMem[((c >> 16) & 0xFF)+256] \
^ sharedMem[((c >> 32) & 0xFF)+512] ^ sharedMem[((c >> 48) & 0xFF)+768])); \ ^ sharedMem[((c >> 32) & 0xFF)+512] ^ sharedMem[((c >> 48) & 0xFF)+768])); \
b = SPH_T64(b + (sharedMem[((c >> 8) & 0xFF)+768] ^ sharedMem[((c >> 24) & 0xFF)+512] \ b = SPH_T64(b + (sharedMem[((c >> 8) & 0xFF)+768] ^ sharedMem[((c >> 24) & 0xFF)+512] \
^ sharedMem[((c >> 40) & 0xFF)+256] ^ sharedMem[(c >> 56) & 0xFF])); \ ^ sharedMem[((c >> 40) & 0xFF)+256] ^ sharedMem[(c >> 56) & 0xFF])); \
b = mul(b); \ b = mul(b); \
} }
uint64_t in2[8];
uint64_t in[8],buf[3];
uint64_t in2[8],in3[8];
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {in2[i]= c_PaddedMessage80[i+8];} for (int i=0; i<8; i++)
in2[i] = c_PaddedMessage80[i+8];
uint32_t* Mess = (uint32_t*)in2; uint32_t* Mess = (uint32_t*)in2;
Mess[13]=nounce; Mess[13] = nounce;
uint64_t in3[8];
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {in3[i]=0;} for (int i=0; i<8; i++)
in3[i]=0;
in3[7]=0x3d0; in3[7]=0x3d0;
#pragma unroll 3
for (int i=0;i<3;i++) {buf[i]=bufo[i];} uint64_t buf[3];
#pragma unroll 3
for (int i=0; i<3; i++)
buf[i]=bufo[i];
TIGER_ROUND_BODY(in2, buf); TIGER_ROUND_BODY(in2, buf);
TIGER_ROUND_BODY(in3, buf); TIGER_ROUND_BODY(in3, buf);
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=buf[i];} else {outputHash[i*threads+thread]=0;}} for (int i=0;i<8;i++) {
} //// threads if (i<3) {
outputHash[i*threads+thread] = buf[i];
} else {
outputHash[i*threads+thread] = 0;
}
}
} // thread
} }
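The kernel stages the four Tiger S-box tables in shared memory; the cost and the launch assumption are worth spelling out:

    // Static shared memory per block: 1024 entries * sizeof(uint64_t) = 8 KiB.
    // The copy loop fills every slot only if blockDim.x >= 256; the 640-thread
    // launch in m7_tiger192_cpu_hash_120 below satisfies this before __syncthreads().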
__host__
void tiger192_cpu_init(int thr_id, int threads) void tiger192_cpu_init(int thr_id, int threads)
{ {
cudaMemcpyToSymbol(gpu_III,III,sizeof(III),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(gpu_III,III,sizeof(III),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T1,cpu_T1,sizeof(cpu_T1),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T1,cpu_T1,sizeof(cpu_T1),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T2,cpu_T2,sizeof(cpu_T2),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T2,cpu_T2,sizeof(cpu_T2),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T3,cpu_T3,sizeof(cpu_T3),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T3,cpu_T3,sizeof(cpu_T3),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T4,cpu_T4,sizeof(cpu_T4),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T4,cpu_T4,sizeof(cpu_T4),0, cudaMemcpyHostToDevice);
} }
__host__ void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) __host__
void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
const int threadsperblock = 640; // 256
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
const int threadsperblock = 640; // alignment with mixtab size. DO NOT CHANGE size_t shared_size = 0;
// const int threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
//dim3 grid(1);
//dim3 block(1);
size_t shared_size =0;
m7_tiger192_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); m7_tiger192_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }
__host__
__host__ void tiger192_setBlock_120(void *pdata) void tiger192_setBlock_120(void *pdata)
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
uint8_t ending =0x01; uint8_t ending =0x01;
memcpy(PaddedMessage, pdata, 122); memcpy(PaddedMessage, pdata, 122);
memset(PaddedMessage+122,ending,1); memset(PaddedMessage+122,ending,1);
memset(PaddedMessage+123, 0, 5); //useless memset(PaddedMessage+123, 0, 5); //useless
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
#undef ROUND #undef ROUND
#undef MUL5 #undef MUL5
#undef MUL7 #undef MUL7
#undef MUL9 #undef MUL9
#define MUL5(x) ((x) * SPH_C64(5)) #define MUL5(x) ((x) * SPH_C64(5))
#define MUL7(x) ((x) * SPH_C64(7)) #define MUL7(x) ((x) * SPH_C64(7))
#define MUL9(x) ((x) * SPH_C64(9)) #define MUL9(x) ((x) * SPH_C64(9))
#define ROUND(a, b, c, x, mul) { \ #define ROUND(a, b, c, x, mul) { \
c ^= x; \ c ^= x; \
a = SPH_T64(a - (cpu_T1[c & 0xFF] ^ cpu_T2[(c >> 16) & 0xFF] \ a = SPH_T64(a - (cpu_T1[c & 0xFF] ^ cpu_T2[(c >> 16) & 0xFF] \
^ cpu_T3[(c >> 32) & 0xFF] ^ cpu_T4[(c >> 48) & 0xFF])); \ ^ cpu_T3[(c >> 32) & 0xFF] ^ cpu_T4[(c >> 48) & 0xFF])); \
@ -782,14 +785,16 @@ __host__ void tiger192_setBlock_120(void *pdata)
b = mul(b); \ b = mul(b); \
} }
uint64_t* alt_data = (uint64_t*) pdata; uint64_t* alt_data = (uint64_t*) pdata;
uint64_t in[8],buf[3]; uint64_t in[8],buf[3];
for (int i=0;i<8;i++) {in[i]= alt_data[i];}
for (int i=0;i<3;i++) {buf[i]=III[i];} for (int i=0;i<8;i++)
in[i] = alt_data[i];
TIGER_ROUND_BODY(in, buf) for (int i=0;i<3;i++)
cudaMemcpyToSymbol( bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); buf[i] = III[i];
TIGER_ROUND_BODY(in, buf)
cudaMemcpyToSymbol(bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
} }

40
m7/m7.cu

@ -22,7 +22,7 @@ extern "C"
extern int device_map[8]; extern int device_map[8];
extern bool opt_benchmark; extern bool opt_benchmark;
static uint64_t *d_hash[8]; //static uint64_t *d_hash[8];
static uint64_t *FinalHash[8]; static uint64_t *FinalHash[8];
static uint64_t *KeccakH[8]; static uint64_t *KeccakH[8];
static uint64_t *WhirlpoolH[8]; static uint64_t *WhirlpoolH[8];
@ -112,11 +112,9 @@ extern "C" void m7_hash(void *state, const void *input,uint32_t TheNonce, int de
{ {
// sha256(sha256*sha512*keccak512*ripemd160*haval*tiger1*whirlpool) // sha256(sha256*sha512*keccak512*ripemd160*haval*tiger1*whirlpool)
char data_str[245], hash_str[65], target_str[65];
uint8_t *bdata = 0; uint8_t *bdata = 0;
mpz_t bns[7]; mpz_t bns[7];
mpz_t product; mpz_t product;
int rc = 0;
for(int i=0; i < 7; i++) { for(int i=0; i < 7; i++) {
mpz_init(bns[i]); mpz_init(bns[i]);
@ -292,44 +290,42 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata,
tiger192_setBlock_120((void*)pdata); tiger192_setBlock_120((void*)pdata);
cuda_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
uint32_t TheNonce = pdata[29];
do { do {
int order = 0; int order = 0;
uint32_t foundNonce; uint32_t foundNonce;
m7_sha256_cpu_hash_120(thr_id, throughput, pdata[29], Sha256H[thr_id], order++); m7_sha256_cpu_hash_120(thr_id, throughput, pdata[29], Sha256H[thr_id], order++);
m7_sha512_cpu_hash_120(thr_id, throughput, pdata[29], Sha512H[thr_id], order++); m7_sha512_cpu_hash_120(thr_id, throughput, pdata[29], Sha512H[thr_id], order++);
m7_keccak512_cpu_hash(thr_id, throughput, pdata[29], KeccakH[thr_id], order++); m7_keccak512_cpu_hash(thr_id, throughput, pdata[29], KeccakH[thr_id], order++);
m7_haval256_cpu_hash_120(thr_id, throughput, pdata[29], HavalH[thr_id], order++); m7_haval256_cpu_hash_120(thr_id, throughput, pdata[29], HavalH[thr_id], order++);
m7_tiger192_cpu_hash_120(thr_id, throughput, pdata[29], TigerH[thr_id], order++); m7_tiger192_cpu_hash_120(thr_id, throughput, pdata[29], TigerH[thr_id], order++);
m7_ripemd160_cpu_hash_120(thr_id, throughput, pdata[29], RipemdH[thr_id], order++); m7_ripemd160_cpu_hash_120(thr_id, throughput, pdata[29], RipemdH[thr_id], order++);
m7_whirlpool512_cpu_hash_120(thr_id, throughput, pdata[29], WhirlpoolH[thr_id], order++); m7_whirlpool512_cpu_hash_120(thr_id, throughput, pdata[29], WhirlpoolH[thr_id], order++);
cpu_mulT4(0, throughput, 8, 8, Sha512H[thr_id], KeccakH[thr_id], d_prod0[thr_id],order); //64 cpu_mulT4(0, throughput, 8, 8, Sha512H[thr_id], KeccakH[thr_id], d_prod0[thr_id],order); //64
MyStreamSynchronize(0,order++,thr_id); MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput,8, 16, WhirlpoolH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //128 cpu_mulT4(0, throughput,8, 16, WhirlpoolH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //128
MyStreamSynchronize(0,order++,thr_id); MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput, 4, 24, Sha256H[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); //96 cpu_mulT4(0, throughput, 4, 24, Sha256H[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); //96
MyStreamSynchronize(0,order++,thr_id); MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput, 4, 28, HavalH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //112 cpu_mulT4(0, throughput, 4, 28, HavalH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //112
MyStreamSynchronize(0,order++,thr_id); MyStreamSynchronize(0,order++,thr_id);
m7_bigmul_unroll1_cpu(0, throughput, TigerH[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); m7_bigmul_unroll1_cpu(0, throughput, TigerH[thr_id], d_prod1[thr_id], d_prod0[thr_id],order);
MyStreamSynchronize(0,order++,thr_id); MyStreamSynchronize(0,order++,thr_id);
m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order);
MyStreamSynchronize(0,order++,thr_id);
MyStreamSynchronize(0,order++,thr_id);
foundNonce = m7_sha256_cpu_hash_300(thr_id, throughput, pdata[29], NULL, d_prod1[thr_id], order); foundNonce = m7_sha256_cpu_hash_300(thr_id, throughput, pdata[29], NULL, d_prod1[thr_id], order);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
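
The kernel chain above is the GPU version of the combining step described by the comment in m7_hash: the seven digests are multiplied together as big integers (cpu_mulT4 plus the two unrolled bigmul kernels accumulate the product in d_prod0/d_prod1) and the product is hashed once more with SHA-256 in m7_sha256_cpu_hash_300. A hedged host-side sketch of that combination using GMP, with OpenSSL's SHA256 standing in for the final hash; the digest widths and the little-endian import order are assumptions, not taken from this code:

#include <gmp.h>
#include <openssl/sha.h>
#include <cstddef>

// Multiply seven digests (as little-endian big integers) and SHA-256 the product.
void m7_combine_sketch(const unsigned char *digests[7], const size_t lens[7],
                       unsigned char out[32])
{
	mpz_t product, tmp;
	mpz_init_set_ui(product, 1);
	mpz_init(tmp);

	for (int i = 0; i < 7; i++) {
		mpz_import(tmp, lens[i], -1, 1, 0, 0, digests[i]);  // bytes -> big integer
		mpz_mul(product, product, tmp);
	}

	size_t count = 0;
	size_t bytes = (mpz_sizeinbase(product, 2) + 7) / 8;
	unsigned char *buf = new unsigned char[bytes ? bytes : 1];
	mpz_export(buf, &count, -1, 1, 0, 0, product);          // big integer -> bytes
	SHA256(buf, count, out);                                // final SHA-256 of the product

	delete[] buf;
	mpz_clear(tmp);
	mpz_clear(product);
}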

54
m7/m7_keccak512.cu

@ -5,6 +5,8 @@
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
static __constant__ uint64_t stateo[25]; static __constant__ uint64_t stateo[25];
static __constant__ uint64_t RC[24]; static __constant__ uint64_t RC[24];
static const uint64_t cpu_RC[24] = { static const uint64_t cpu_RC[24] = {
@ -22,7 +24,9 @@ static const uint64_t cpu_RC[24] = {
0x0000000080000001ull, 0x8000000080008008ull 0x0000000080000001ull, 0x8000000080008008ull
}; };
static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) { __device__ __forceinline__
static void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i; size_t i;
uint64_t t[5], u[5], v, w; uint64_t t[5], u[5], v, w;
@ -136,8 +140,9 @@ static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t
} }
} }
__host__ __forceinline__
static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants) { static void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i; size_t i;
uint64_t t[5], u[5], v, w; uint64_t t[5], u[5], v, w;
@ -204,25 +209,18 @@ static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *kecca
} }
} }
__global__ /* __launch_bounds__(256, 2) */
void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
__global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
uint64_t state[25]; uint64_t state[25];
#pragma unroll 16 #pragma unroll 16
for (int i=9;i<25;i++) {state[i]=stateo[i];} for (int i=9;i<25;i++) {state[i]=stateo[i];}
state[0] = xor1(stateo[0],c_PaddedMessage80[9]); state[0] = xor1(stateo[0],c_PaddedMessage80[9]);
state[1] = xor1(stateo[1],c_PaddedMessage80[10]); state[1] = xor1(stateo[1],c_PaddedMessage80[10]);
@ -236,39 +234,37 @@ __global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uin
keccak_block(state,RC); keccak_block(state,RC);
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {outputHash[i*threads+thread]=state[i];} for (int i=0;i<8;i++) {
outputHash[i*threads+thread] = state[i];
}
} //thread } //thread
} }
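
The output loop above writes word i of thread t to outputHash[i*threads + thread], i.e. a structure-of-arrays layout: for a fixed i, neighbouring threads in a warp store to neighbouring 64-bit slots, so the writes coalesce. A small self-contained sketch of the same layout (kernel name and payload are placeholders):

#include <stdint.h>

// Store 8 words per thread in the strided layout used by the m7 kernels:
// word i of thread t lives at out[i*threads + t].
__global__ void write_soa_example(int threads, uint64_t *out)
{
	int thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads) {
		uint64_t state[8];
		for (int i = 0; i < 8; i++)
			state[i] = (uint64_t)thread + i;       // placeholder payload
		#pragma unroll 8
		for (int i = 0; i < 8; i++)
			out[i * threads + thread] = state[i];  // coalesced across the warp
	}
}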
void m7_keccak512_cpu_init(int thr_id, int threads) void m7_keccak512_cpu_init(int thr_id, int threads)
{ {
cudaMemcpyToSymbol( RC,cpu_RC,sizeof(cpu_RC),0,cudaMemcpyHostToDevice); cudaMemcpyToSymbol( RC,cpu_RC,sizeof(cpu_RC),0,cudaMemcpyHostToDevice);
} }
__host__ void m7_keccak512_setBlock_120(void *pdata) __host__ void m7_keccak512_setBlock_120(void *pdata)
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
uint8_t ending =0x01; uint8_t ending =0x01;
memcpy(PaddedMessage, pdata, 122); memcpy(PaddedMessage, pdata, 122);
memset(PaddedMessage+122,ending,1); memset(PaddedMessage+122,ending,1);
memset(PaddedMessage+123, 0, 5); memset(PaddedMessage+123, 0, 5);
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
uint64_t* alt_data = (uint64_t*) pdata; uint64_t* alt_data = (uint64_t*) pdata;
uint64_t state[25]; uint64_t state[25];
for(int i=0;i<25;i++) {state[i]=0;} for(int i=0;i<9;i++)
state[i] = alt_data[i];
for(int i=9;i<25;i++)
for (int i=0;i<9;i++) {state[i] ^= alt_data[i];} state[i] = 0;
keccak_block_host(state,cpu_RC); keccak_block_host(state,cpu_RC);
cudaMemcpyToSymbol(stateo, state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(stateo, state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
} }
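
m7_keccak512_setBlock_120 does the nonce-independent half of the work on the host: Keccak-512 absorbs 72-byte (9-word) blocks, and the first 9 words of the padded header never change during the scan, so they are absorbed once with keccak_block_host and the resulting 25-word state is uploaded to stateo. Each GPU thread then only XORs in the remaining header words (which carry the nonce) and runs one more permutation, as the kernel above shows. A hedged host-side sketch of that midstate step, with keccakf1600 as a placeholder for the permutation that keccak_block_host implements:

#include <stdint.h>
#include <string.h>

void keccakf1600(uint64_t state[25]);   // assumed Keccak-f[1600] permutation

// Absorb the nonce-independent first rate block (9 x 64-bit words = 72 bytes).
void keccak512_midstate(const uint64_t header_words[9], uint64_t midstate[25])
{
	memset(midstate, 0, 25 * sizeof(uint64_t));
	for (int i = 0; i < 9; i++)
		midstate[i] ^= header_words[i];  // XOR into the all-zero initial state
	keccakf1600(midstate);
	// midstate can now be copied to constant memory; the device XORs the
	// remaining padded-header words (including the nonce) and permutes once.
}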
