Browse Source

m7: forgot to indent some cuda files

and remove unused variables...
Tanguy Pruvot 10 years ago
parent
commit
fcd381cda2
  1. 320
      m7/cuda_m7_sha256.cu
  2. 101
      m7/cuda_mul2.cu
  3. 140
      m7/cuda_ripemd160.cu
  4. 111
      m7/cuda_tiger192.cu
  5. 6
      m7/m7.cu
  6. 42
      m7/m7_keccak512.cu

320
m7/cuda_m7_sha256.cu

@ -1,10 +1,10 @@
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include "cuda_helper.h"
#include "sph/sph_types.h" #include "sph/sph_types.h"
#include "cuda_helper.h"
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
//#define SPH_C64(x) ((uint64_t)(x ## ULL)) //#define SPH_C64(x) ((uint64_t)(x ## ULL))
@ -67,28 +67,35 @@ static const uint32_t cpu_K[64] = {
}; };
static __device__ __forceinline__ uint32_t bsg2_0(uint32_t x) __device__ __forceinline__
static uint32_t bsg2_0(uint32_t x)
{ {
uint32_t r1 = SPH_ROTR32(x,2); uint32_t r1 = SPH_ROTR32(x,2);
uint32_t r2 = SPH_ROTR32(x,13); uint32_t r2 = SPH_ROTR32(x,13);
uint32_t r3 = SPH_ROTR32(x,22); uint32_t r3 = SPH_ROTR32(x,22);
return xor3b(r1,r2,r3); return xor3b(r1,r2,r3);
} }
static __device__ __forceinline__ uint32_t bsg2_1(uint32_t x)
__device__ __forceinline__
static uint32_t bsg2_1(uint32_t x)
{ {
uint32_t r1 = SPH_ROTR32(x,6); uint32_t r1 = SPH_ROTR32(x,6);
uint32_t r2 = SPH_ROTR32(x,11); uint32_t r2 = SPH_ROTR32(x,11);
uint32_t r3 = SPH_ROTR32(x,25); uint32_t r3 = SPH_ROTR32(x,25);
return xor3b(r1,r2,r3); return xor3b(r1,r2,r3);
} }
static __device__ __forceinline__ uint32_t ssg2_0(uint32_t x)
__device__ __forceinline__
static uint32_t ssg2_0(uint32_t x)
{ {
uint64_t r1 = SPH_ROTR32(x,7); uint64_t r1 = SPH_ROTR32(x,7);
uint64_t r2 = SPH_ROTR32(x,18); uint64_t r2 = SPH_ROTR32(x,18);
uint64_t r3 = shr_t32(x,3); uint64_t r3 = shr_t32(x,3);
return xor3b(r1,r2,r3); return xor3b(r1,r2,r3);
} }
static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x)
__device__ __forceinline__
static uint32_t ssg2_1(uint32_t x)
{ {
uint64_t r1 = SPH_ROTR32(x,17); uint64_t r1 = SPH_ROTR32(x,17);
uint64_t r2 = SPH_ROTR32(x,19); uint64_t r2 = SPH_ROTR32(x,19);
@ -96,104 +103,101 @@ static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x)
return xor3b(r1,r2,r3); return xor3b(r1,r2,r3);
} }
static __device__ __forceinline__ void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, __device__ __forceinline__
static void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t in,const uint32_t Kshared) uint32_t in,const uint32_t Kshared)
{ {
uint32_t t1,t2; uint32_t t1,t2;
uint32_t vxandx = xandx(e, f, g); uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 =bsg2_1(e); uint32_t bsg21 =bsg2_1(e);
uint32_t bsg20 =bsg2_0(a); uint32_t bsg20 =bsg2_0(a);
uint32_t andorv =andor32(a,b,c); uint32_t andorv =andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in; t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv; t2 = bsg20 + andorv;
d = d + t1; d = d + t1;
h = t1 + t2; h = t1 + t2;
} }
static __forceinline__ void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, __host__ __forceinline__
static void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t in,const uint32_t Kshared) uint32_t in,const uint32_t Kshared)
{ {
uint32_t t1,t2;
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t t1,t2; uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); t1 = h + bsg21 + vxandx + Kshared + in;
uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); t2 = bsg20 + andorv;
d = d + t1;
t1 = h + bsg21 + vxandx + Kshared + in; h = t1 + t2;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
} }
static __device__ __forceinline__ void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, __device__ __forceinline__
static void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t* in,uint32_t pc,const uint32_t Kshared) uint32_t* in,uint32_t pc,const uint32_t Kshared)
{ {
uint32_t t1,t2; uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF; int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF; int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF; int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc]; uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1]; uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2]; uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3]; uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ssg2_1(inx1); uint32_t ssg21 = ssg2_1(inx1);
uint32_t ssg20 = ssg2_0(inx3); uint32_t ssg20 = ssg2_0(inx3);
uint32_t vxandx = xandx(e, f, g); uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 =bsg2_1(e); uint32_t bsg21 =bsg2_1(e);
uint32_t bsg20 =bsg2_0(a); uint32_t bsg20 =bsg2_0(a);
uint32_t andorv =andor32(a,b,c); uint32_t andorv =andor32(a,b,c);
in[pc] = ssg21+inx2+ssg20+inx0; in[pc] = ssg21+inx2+ssg20+inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc]; t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv; t2 = bsg20 + andorv;
d = d + t1; d = d + t1;
h = t1 + t2; h = t1 + t2;
} }
static __forceinline__ void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, __host__ __forceinline__
static void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t* in,uint32_t pc,const uint32_t Kshared) uint32_t* in,uint32_t pc,const uint32_t Kshared)
{ {
uint32_t t1,t2; uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF; int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF; int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF; int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc]; uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1]; uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2]; uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3]; uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
uint32_t andorv =((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
in[pc] = ssg21+inx2+ssg20+inx0;
in[pc] = ssg21+inx2+ssg20+inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t1 = h + bsg21 + vxandx + Kshared + in[pc]; t2 = bsg20 + andorv;
t2 = bsg20 + andorv; d = d + t1;
d = d + t1; h = t1 + t2;
h = t1 + t2;
} }
static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared) __device__ __forceinline__
static void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
{ {
uint32_t a=r[0]; uint32_t a=r[0];
uint32_t b=r[1]; uint32_t b=r[1];
uint32_t c=r[2]; uint32_t c=r[2];
@ -220,9 +224,9 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]); sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]); sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
#pragma unroll 3 #pragma unroll 3
for (int i=0;i<3;i++) { for (int i=0;i<3;i++)
{
sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
@ -239,11 +243,8 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
} }
r[0] = r[0] + a; r[0] = r[0] + a;
r[1] = r[1] + b; r[1] = r[1] + b;
r[2] = r[2] + c; r[2] = r[2] + c;
@ -254,10 +255,9 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
r[7] = r[7] + h; r[7] = r[7] + h;
} }
static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared) __forceinline__
static void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
{ {
uint32_t a=r[0]; uint32_t a=r[0];
uint32_t b=r[1]; uint32_t b=r[1];
uint32_t c=r[2]; uint32_t c=r[2];
@ -284,9 +284,9 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
sha2_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]); sha2_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]); sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
#pragma unroll 3 #pragma unroll 3
for (int i=0;i<3;i++) { for (int i=0;i<3;i++)
{
sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
@ -303,7 +303,6 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
} }
r[0] = r[0] + a; r[0] = r[0] + a;
@ -316,27 +315,12 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
r[7] = r[7] + h; r[7] = r[7] + h;
} }
__global__
__global__ void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{ {
/*
__shared__ uint32_t Kshared[64];
if (threadIdx.x < 64) {
Kshared[threadIdx.x]=K[threadIdx.x];
}
__syncthreads();
*/
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
//uint32_t buf[8];
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread ; // original implementation uint32_t nounce = startNounce + thread ; // original implementation
uint32_t buf[8]; uint32_t buf[8];
@ -344,56 +328,36 @@ uint64_t h8[8];
uint32_t in3[16]={0}; uint32_t in3[16]={0};
#pragma unroll 13 #pragma unroll 13
for (int i=0;i<13;i++) {in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);} for (int i=0; i<13; i++)
in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);
in2[13]=cuda_swab32(nounce); in2[13]=cuda_swab32(nounce);
in2[14]=cuda_swab32(c_PaddedMessage80[30]); in2[14]=cuda_swab32(c_PaddedMessage80[30]);
in3[15]=0x3d0; in3[15]=0x3d0;
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {buf[i]= pbuf[i];} for (int i=0; i<8; i++)
buf[i] = pbuf[i];
sha2_round_body(in2,buf,K); sha2_round_body(in2,buf,K);
sha2_round_body(in3,buf,K); sha2_round_body(in3,buf,K);
//#pragma unroll 8
//for (int i=0;i<8;i++) {hash.h4[i]=cuda_swab32(buf[i]);}
#pragma unroll 4
for (int i=0;i<4;i++) {outputHash[i*threads+thread]=cuda_swab32ll(((uint64_t*)buf)[i]);}
//////////////////////////////////////////////////////////////////////////////////////////////////
} // threads
#pragma unroll 4
for (int i=0; i<4; i++) {
outputHash[i*threads+thread] = cuda_swab32ll(((uint64_t*)buf)[i]);
}
} // thread
} }
__global__
__global__ void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce) void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce)
{ {
/*
__shared__ uint32_t Kshared[64];
if (threadIdx.x < 64) {
Kshared[threadIdx.x]=K[threadIdx.x];
}
__syncthreads();
*/
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
union {
uint8_t h1[304];
uint32_t h4[76];
uint64_t h8[38];
} hash;
uint32_t in[16],buf[8]; uint32_t in[16],buf[8];
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);} for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);}
#pragma unroll 8 #pragma unroll 8
@ -415,96 +379,87 @@ uint64_t h8[38];
#pragma unroll 5 #pragma unroll 5
for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);} for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);}
((uint64_t*)in)[5]= g_hash1[threads*(5+32)+thread];
((uint64_t*)in)[5] = g_hash1[threads*(5+32)+thread];
in[11]=0; in[11]=0;
in[12]=0; in[12]=0;
in[13]=0; in[13]=0;
in[14]=0; in[14]=0;
in[15]=0x968; in[15]=0x968;
int it=0; int it=0;
do { do {
in[15]-=8; in[15]-=8;
it++; it++;
} while (((uint8_t*)in)[44-it]==0); } while (((uint8_t*)in)[44-it]==0);
((uint8_t*)in)[44-it+1]=0x80; ((uint8_t*)in)[44-it+1]=0x80;
((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]); ((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]);
sha2_round_body(in,buf,K); sha2_round_body(in,buf,K);
uint32_t nounce = startNounce +thread; uint32_t nounce = startNounce +thread;
bool rc = false; bool rc = false;
#pragma unroll 4 #pragma unroll 4
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
{ {
if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) { if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) {
if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i]) {rc = true;} else {rc = false;} if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i])
// if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;} rc = true;
else
rc = false;
//if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;}
} }
} }
if (rc && resNounce[0] > nounce)
if(rc == true)
{
if(resNounce[0] > nounce)
resNounce[0] = nounce; resNounce[0] = nounce;
} // thread
}
////
} // threads
} }
__host__
void m7_sha256_cpu_init(int thr_id, int threads)
__host__ void m7_sha256_cpu_init(int thr_id, int threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( H256,cpu_H256,sizeof(cpu_H256),0, cudaMemcpyHostToDevice ); cudaMemcpyToSymbol( H256,cpu_H256,sizeof(cpu_H256),0, cudaMemcpyHostToDevice );
cudaMemcpyToSymbol( K,cpu_K,sizeof(cpu_K),0, cudaMemcpyHostToDevice ); cudaMemcpyToSymbol( K,cpu_K,sizeof(cpu_K),0, cudaMemcpyHostToDevice );
cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t)); cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t));
cudaMallocHost(&d_mnounce[thr_id], 1*sizeof(uint32_t)); cudaMallocHost(&d_mnounce[thr_id], 1*sizeof(uint32_t));
} }
__host__
__host__ uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order) uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order)
{ {
const int threadsperblock = 384;
uint32_t result = 0xffffffff; uint32_t result = 0xffffffff;
cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 384; // Alignment mit mixtob Grösse. NICHT ÄNDERN
cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
size_t shared_size = 0; size_t shared_size = 0;
m7_sha256_gpu_hash_300<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]); m7_sha256_gpu_hash_300<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]);
cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
result = *d_mnounce[thr_id]; result = *d_mnounce[thr_id];
return result; return result;
} }
__host__
__host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
const int threadsperblock = 512;
const int threadsperblock = 512; // Alignment mit mixtob Grösse. NICHT ÄNDERN
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// dim3 grid(1);
// dim3 block(1);
size_t shared_size = 0; size_t shared_size = 0;
m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
@ -512,7 +467,8 @@ __host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNoun
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }
__host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful __host__
void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
uint8_t ending =0x80; uint8_t ending =0x80;

101
m7/cuda_mul2.cu

@ -52,7 +52,7 @@ typedef struct t4_t{
} t4_t; } t4_t;
__device__ __forceinline__ __device__ __forceinline__
ulonglong2 umul64wide (unsigned long long int a, ulonglong2 umul64wide(unsigned long long int a,
unsigned long long int b) unsigned long long int b)
{ {
ulonglong2 res; ulonglong2 res;
@ -85,7 +85,8 @@ ulonglong2 umul64wide (unsigned long long int a,
} }
__device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n) __device__ __forceinline__
void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n)
{ {
asm ("{\n\t" asm ("{\n\t"
".reg .u32 o0, o1, o2, o3, o4; \n\t" ".reg .u32 o0, o1, o2, o3, o4; \n\t"
@ -187,7 +188,8 @@ __device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n){
#endif #endif
__device__ __forceinline__ t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){ __device__ __forceinline__
t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){
t4_t ret; t4_t ret;
ret.high = g[(idx*2 + 1)*threads + thread]; ret.high = g[(idx*2 + 1)*threads + thread];
ret.low = g[(idx*2)*threads + thread]; ret.low = g[(idx*2)*threads + thread];
@ -199,7 +201,8 @@ __device__ __forceinline__ t4_t T4(uint32_t thread, uint32_t threads, uint32_t i
return ret; return ret;
} }
__device__ __forceinline__ void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){ __device__ __forceinline__
void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){
g[(idx*2 + 1)*threads + thread]=val.high; g[(idx*2 + 1)*threads + thread]=val.high;
g[(idx*2)*threads + thread]=val.low; g[(idx*2)*threads + thread]=val.low;
@ -209,12 +212,14 @@ __device__ __forceinline__ void T4_store(uint32_t thread, uint32_t threads, uint
} }
__device__ __forceinline__ void T4_set(t4_t *d, uint64_t v){ __device__ __forceinline__
void T4_set(t4_t *d, uint64_t v){
d->high = 0; d->high = 0;
d->low = v; d->low = v;
} }
__device__ __forceinline__ t4_t T4_add(t4_t a, t4_t b){ __device__ __forceinline__
t4_t T4_add(t4_t a, t4_t b){
t4_t ret; t4_t ret;
uint32_t c=0; uint32_t c=0;
ret.low = a.low + b.low; ret.low = a.low + b.low;
@ -224,7 +229,8 @@ __device__ __forceinline__ t4_t T4_add(t4_t a, t4_t b){
return ret; return ret;
} }
__device__ __forceinline__ t4_t T4_add(uint64_t a, t4_t b){ __device__ __forceinline__
t4_t T4_add(uint64_t a, t4_t b){
t4_t ret; t4_t ret;
uint32_t c=0; uint32_t c=0;
ret.low = a + b.low; ret.low = a + b.low;
@ -234,8 +240,8 @@ __device__ __forceinline__ t4_t T4_add(uint64_t a, t4_t b){
return ret; return ret;
} }
__device__ __forceinline__
__device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){ uint32_t T4_lt(t4_t a, t4_t b){
if(a.high < b.high) if(a.high < b.high)
return 1; return 1;
if(a.high == b.high && a.low < b.low) if(a.high == b.high && a.low < b.low)
@ -243,7 +249,8 @@ __device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){
return 0; return 0;
} }
__device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){ __device__ __forceinline__
uint32_t T4_gt(t4_t a, uint64_t b){
if(a.high) if(a.high)
return 1; return 1;
if(a.low > b) if(a.low > b)
@ -252,7 +259,8 @@ __device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){
} }
__device__ void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){ __device__
void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){
t4_t ul, cl, hpl, lpl; t4_t ul, cl, hpl, lpl;
uint32_t i; uint32_t i;
T4_set(&cl,0); T4_set(&cl,0);
@ -289,13 +297,14 @@ __device__ void mulScalar(uint32_t thread, uint32_t threads, uint32_t len, uint6
*size = len + (cl != 0); *size = len + (cl != 0);
} }
uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a){ uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a)
{
uint64_t carry=0; uint64_t carry=0;
uint32_t i; uint32_t i;
uint64_t ul,lpl,hpl,rl; uint64_t ul,lpl,hpl,rl;
for(i=0; i < xsz; i++){ for(i=0; i < xsz; i++)
{
ul = x[i*threads + thread]; ul = x[i*threads + thread];
umul_ppmm (hpl, lpl, ul, a); umul_ppmm (hpl, lpl, ul, a);
@ -311,13 +320,15 @@ uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum,
return carry; return carry;
} }
t4_t __device__ addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a){ __device__
t4_t addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a)
{
t4_t carry; t4_t carry;
uint32_t i; uint32_t i;
t4_t ul,lpl,hpl,rl; t4_t ul,lpl,hpl,rl;
T4_set(&carry,0); T4_set(&carry,0);
for(i=0; i < xsz; i++){ for(i=0; i < xsz; i++)
{
ul = T4(thread,threads,i,x); ul = T4(thread,threads,i,x);
umul_ppmmT4 (&hpl, &lpl, ul, a); umul_ppmmT4 (&hpl, &lpl, ul, a);
@ -333,14 +344,13 @@ t4_t __device__ addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, u
return carry; return carry;
} }
__global__
void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
__global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
if(ulegs < vlegs){ if(ulegs < vlegs) {
uint64_t t1=ulegs; uint64_t t1=ulegs;
ulegs = vlegs; ulegs = vlegs;
vlegs = t1; vlegs = t1;
@ -354,11 +364,10 @@ __global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g
mulScalar(thread,threads,ulegs,g_p,g_u,g_v[thread],&psize); mulScalar(thread,threads,ulegs,g_p,g_u,g_v[thread],&psize);
#if 1 #if 1
while (vofst < vlegs) { while (vofst < vlegs) {
//clear high word //TODO: right //clear high word //TODO: right
// printf("Size: %d\n", rp->size[tid]); // printf("Size: %d\n", rp->size[tid]);
g_p[(psize+0)*threads+thread] = 0; g_p[(psize+0)*threads + thread] = 0;
g_p[(ulegs+rofst)*threads + thread] = addmul_1g (thread, threads, g_p ,rofst , g_u, ulegs, g_v[vofst*threads+thread]); g_p[(ulegs+rofst)*threads + thread] = addmul_1g (thread, threads, g_p ,rofst , g_u, ulegs, g_v[vofst*threads+thread]);
@ -368,19 +377,17 @@ __global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g
// if(D_REF(rp->d,up->size[tid] + vp->size[tid] - 1,tid) != (uint64_t)0) // if(D_REF(rp->d,up->size[tid] + vp->size[tid] - 1,tid) != (uint64_t)0)
// rp->size[tid]++; // rp->size[tid]++;
#endif #endif
} }
} }
__global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p) __global__
void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
if(ulegs < vlegs) { // everything written the other way around... are you kidding me ?!
if(ulegs < vlegs){ ///everything written the other way around... are you kidding me ?!
uint64_t t1=ulegs; uint64_t t1=ulegs;
ulegs = vlegs; ulegs = vlegs;
vlegs = t1; vlegs = t1;
@ -392,12 +399,10 @@ __global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t
ulegs >>= 1; vlegs >>= 1; ulegs >>= 1; vlegs >>= 1;
if(thread == 0){ if(thread == 0) {
// cuPrintf("U: %d V: %d\n", ulegs, vlegs); // cuPrintf("U: %d V: %d\n", ulegs, vlegs);
} }
uint32_t vofst=1,rofst=1,psize=0; uint32_t vofst=1,rofst=1,psize=0;
mulScalarT4(thread,threads,ulegs,g_p,g_u,T4(thread,threads,0,g_v),&psize); mulScalarT4(thread,threads,ulegs,g_p,g_u,T4(thread,threads,0,g_v),&psize);
@ -405,21 +410,17 @@ __global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t
t4_t zero; t4_t zero;
T4_set(&zero,0); T4_set(&zero,0);
// while (vofst < vlegs) { // while (vofst < vlegs) {
#pragma unroll #pragma unroll
for (vofst=1;vofst<vlegs;vofst++) { for (vofst=1;vofst<vlegs;vofst++)
{
T4_store(thread,threads,psize,g_p,zero); T4_store(thread,threads,psize,g_p,zero);
T4_store(thread,threads,ulegs+rofst,g_p, addmul_1gT4(thread, threads, g_p, rofst, g_u, ulegs, T4(thread,threads,vofst,g_v)));
T4_store(thread,threads,ulegs+rofst,g_p,addmul_1gT4 (thread, threads, g_p ,rofst , g_u, ulegs,T4(thread,threads,vofst,g_v))); // vofst++;
// vofst++;
rofst++; rofst++;
psize++; psize++;
} }
#endif #endif
} }
} }
@ -434,35 +435,37 @@ inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
} }
} }
__host__
__host__ void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p,int order) void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p,int order)
{ {
const int threadsperblock = 512; // Alignment mit mixtab Gr\F6sse. NICHT \C4NDERN const int threadsperblock = 512; // Alignment mit mixtab Gr\F6sse. NICHT \C4NDERN
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
size_t shared_size =0; size_t shared_size = 0;
gpu_mul<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
gpu_mul<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
} }
__host__ void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order) __host__
void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order)
{ {
const int threadsperblock = 256; // better occupancy (for both 780 and 750 ti's) const int threadsperblock = 256; // better occupancy (for both 780 and 750 ti's)
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
size_t shared_size =0; size_t shared_size = 0;
//gpu_mulT4<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ; //gpu_mulT4<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
gpu_mulT4<<<grid, block, shared_size>>>(threads, blegs, alegs, g_b, g_a, g_p) ; gpu_mulT4<<<grid, block, shared_size>>>(threads, blegs, alegs, g_b, g_a, g_p) ;
} }
__host__ void mul_init(){ __host__
void mul_init()
{
} }

140
m7/cuda_ripemd160.cu

@ -48,7 +48,7 @@
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
static __constant__ uint32_t gpu_IV[5]; static __constant__ uint32_t gpu_IV[5];
static __constant__ uint32_t bufo[5]; static __constant__ uint32_t bufo[5];
static const uint32_t IV[5] = { static const uint32_t IV[5] = {
@ -282,118 +282,116 @@ static const uint32_t IV[5] = {
(h)[0] = tmp; \ (h)[0] = tmp; \
} }
__global__
__global__ void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread;
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) xor3(x,y,z)
#define F2(x, y, z) xandx(x,y,z)
#define F3(x, y, z) xornot64(x,y,z)
#define F4(x, y, z) xandx(z,x,y)
#define F5(x, y, z) xornt64(x,y,z)
uint32_t nounce = startNounce + thread ;
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) xor3(x,y,z)
#define F2(x, y, z) xandx(x,y,z)
#define F3(x, y, z) xornot64(x,y,z)
#define F4(x, y, z) xandx(z,x,y)
#define F5(x, y, z) xornt64(x,y,z)
uint32_t in2[16],in3[16]; uint32_t in2[16],in3[16];
uint32_t in[16],buf[5]; uint32_t in[16],buf[5];
// #pragma unroll 16
// for (int i=0;i<16;i++) {in[i]= c_PaddedMessage80[i];}
#pragma unroll 16 #pragma unroll 16
for (int i=0;i<16;i++) {if ((i+16)<29) {in2[i]= c_PaddedMessage80[i+16];} for (int i=0;i<16;i++) {
else if ((i+16)==29) {in2[i]= nounce;} if ((i+16) < 29)
else if ((i+16)==30) {in2[i]= c_PaddedMessage80[i+16];} in2[i] = c_PaddedMessage80[i+16];
else {in2[i]= 0;}} else if ((i+16)==29)
in2[i] = nounce;
else if ((i+16)==30)
in2[i] = c_PaddedMessage80[i+16];
else
in2[i] = 0;
}
#pragma unroll 16 #pragma unroll 16
for (int i=0;i<16;i++) {in3[i]=0;} for (int i=0;i<16;i++)
in3[i]=0;
in3[14]=0x3d0; in3[14]=0x3d0;
// #pragma unroll 5
// for (int i=0;i<5;i++) {buf[i]=gpu_IV[i];}
#pragma unroll 5 #pragma unroll 5
for (int i=0;i<5;i++) {buf[i]=bufo[i];} for (int i=0;i<5;i++)
// RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved) buf[i]=bufo[i];
RIPEMD160_ROUND_BODY(in2, buf); RIPEMD160_ROUND_BODY(in2, buf);
RIPEMD160_ROUND_BODY(in3, buf); RIPEMD160_ROUND_BODY(in3, buf);
hash.h4[5]=0;
#pragma unroll 5
for (int i=0; i<5; i++)
hash.h4[i]=buf[i];
hash.h4[5]=0; #pragma unroll 3
#pragma unroll 5 for (int i=0;i<3;i++) {
for (int i=0;i<5;i++) outputHash[i*threads+thread] = hash.h8[i];
{hash.h4[i]=buf[i]; }
}
//uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
//#pragma unroll 3
//for (int i=0;i<3;i++) {outHash[i]=hash.h8[i];}
#pragma unroll 3
for (int i=0;i<3;i++) {outputHash[i*threads+thread]=hash.h8[i];}
//#pragma unroll 8
//for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=hash.h8[i];} else {outputHash[i*threads+thread]=0;}}
} }
} }
void ripemd160_cpu_init(int thr_id, int threads) void ripemd160_cpu_init(int thr_id, int threads)
{ {
cudaMemcpyToSymbol(gpu_IV,IV,sizeof(IV),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(gpu_IV,IV,sizeof(IV),0, cudaMemcpyHostToDevice);
} }
__host__ void ripemd160_setBlock_120(void *pdata) __host__
void ripemd160_setBlock_120(void *pdata)
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
uint8_t ending =0x80; uint8_t ending =0x80;
memcpy(PaddedMessage, pdata, 122); memcpy(PaddedMessage, pdata, 122);
memset(PaddedMessage+122,ending,1); memset(PaddedMessage+122,ending,1);
memset(PaddedMessage+123, 0, 5); //useless memset(PaddedMessage+123, 0, 5); //useless
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) ((x) ^ (y) ^ (z))
#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define F3(x, y, z) (((x) | ~(y)) ^ (z))
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) ((x) ^ (y) ^ (z))
#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define F3(x, y, z) (((x) | ~(y)) ^ (z))
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
uint32_t* alt_data =(uint32_t*)pdata; uint32_t* alt_data =(uint32_t*)pdata;
uint32_t in[16],buf[5]; uint32_t in[16],buf[5];
for (int i=0;i<16;i++)
in[i]= alt_data[i];
for (int i=0;i<16;i++) {in[i]= alt_data[i];} for (int i=0;i<5;i++)
buf[i]=IV[i];
for (int i=0;i<5;i++) {buf[i]=IV[i];}
RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved) RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved)
cudaMemcpyToSymbol(bufo, buf, 5*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(bufo, buf, 5*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
} }
__host__ void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) __host__
void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
const int threadsperblock = 256;
const int threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
//dim3 grid(1);
//dim3 block(1);
size_t shared_size =0; size_t shared_size =0;
m7_ripemd160_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); m7_ripemd160_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);

111
m7/cuda_tiger192.cu

@ -50,11 +50,13 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
__constant__ uint64_t bufo[3]; __constant__ uint64_t bufo[3];
static __constant__ uint64_t gpu_III[3]; static __constant__ uint64_t gpu_III[3];
static __constant__ uint64_t T1[256]; static __constant__ uint64_t T1[256];
static __constant__ uint64_t T2[256]; static __constant__ uint64_t T2[256];
static __constant__ uint64_t T3[256]; static __constant__ uint64_t T3[256];
static __constant__ uint64_t T4[256]; static __constant__ uint64_t T4[256];
static const uint64_t III[3] = { static const uint64_t III[3] = {
SPH_C64(0x0123456789ABCDEF),SPH_C64(0xFEDCBA9876543210),SPH_C64(0xF096A5B4C3B2E187) SPH_C64(0x0123456789ABCDEF),SPH_C64(0xFEDCBA9876543210),SPH_C64(0xF096A5B4C3B2E187)
}; };
@ -592,7 +594,7 @@ static const uint64_t cpu_T4[256] = {
ROUND(c, a, b, X5, mul); \ ROUND(c, a, b, X5, mul); \
ROUND(a, b, c, X6, mul); \ ROUND(a, b, c, X6, mul); \
ROUND(b, c, a, X7, mul); \ ROUND(b, c, a, X7, mul); \
} }
#define MUL5(x) SPH_T64((x) * SPH_C64(5)) #define MUL5(x) SPH_T64((x) * SPH_C64(5))
#define MUL7(x) SPH_T64((x) * SPH_C64(7)) #define MUL7(x) SPH_T64((x) * SPH_C64(7))
@ -649,29 +651,24 @@ static const uint64_t cpu_T4[256] = {
(r)[2] = SPH_T64(C + (r)[2]); \ (r)[2] = SPH_T64(C + (r)[2]); \
} }
__global__
__global__ void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{ {
__shared__ uint64_t sharedMem[1024]; __shared__ uint64_t sharedMem[1024];
if(threadIdx.x < 256)
{ if(threadIdx.x < 256) {
sharedMem[threadIdx.x] = T1[threadIdx.x]; sharedMem[threadIdx.x] = T1[threadIdx.x];
sharedMem[threadIdx.x+256] = T2[threadIdx.x]; sharedMem[threadIdx.x+256] = T2[threadIdx.x];
sharedMem[threadIdx.x+512] = T3[threadIdx.x]; sharedMem[threadIdx.x+512] = T3[threadIdx.x];
sharedMem[threadIdx.x+768] = T4[threadIdx.x]; sharedMem[threadIdx.x+768] = T4[threadIdx.x];
} }
__syncthreads(); __syncthreads();
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
/* /*
#undef MUL5 #undef MUL5
#undef MUL7 #undef MUL7
@ -691,8 +688,6 @@ uint64_t h8[8];
ROUND(b, c, a, X7, mul); \ ROUND(b, c, a, X7, mul); \
} }
#define ROUND(a, b, c, x, mul) { \ #define ROUND(a, b, c, x, mul) { \
c ^= x; \ c ^= x; \
a = SPH_T64(a - (sharedMem[c & 0xFF] ^ sharedMem[((c >> 16) & 0xFF)+256] \ a = SPH_T64(a - (sharedMem[c & 0xFF] ^ sharedMem[((c >> 16) & 0xFF)+256] \
@ -702,78 +697,86 @@ uint64_t h8[8];
b = mul(b); \ b = mul(b); \
} }
uint64_t in2[8];
uint64_t in[8],buf[3];
uint64_t in2[8],in3[8];
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {in2[i]= c_PaddedMessage80[i+8];} for (int i=0; i<8; i++)
in2[i] = c_PaddedMessage80[i+8];
uint32_t* Mess = (uint32_t*)in2; uint32_t* Mess = (uint32_t*)in2;
Mess[13]=nounce; Mess[13] = nounce;
uint64_t in3[8];
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {in3[i]=0;} for (int i=0; i<8; i++)
in3[i]=0;
in3[7]=0x3d0; in3[7]=0x3d0;
#pragma unroll 3
for (int i=0;i<3;i++) {buf[i]=bufo[i];} uint64_t buf[3];
#pragma unroll 3
for (int i=0; i<3; i++)
buf[i]=bufo[i];
TIGER_ROUND_BODY(in2, buf); TIGER_ROUND_BODY(in2, buf);
TIGER_ROUND_BODY(in3, buf); TIGER_ROUND_BODY(in3, buf);
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=buf[i];} else {outputHash[i*threads+thread]=0;}} for (int i=0;i<8;i++) {
} //// threads if (i<3) {
outputHash[i*threads+thread] = buf[i];
} else {
outputHash[i*threads+thread] = 0;
}
}
} // thread
} }
__host__
void tiger192_cpu_init(int thr_id, int threads) void tiger192_cpu_init(int thr_id, int threads)
{ {
cudaMemcpyToSymbol(gpu_III,III,sizeof(III),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(gpu_III,III,sizeof(III),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T1,cpu_T1,sizeof(cpu_T1),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T1,cpu_T1,sizeof(cpu_T1),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T2,cpu_T2,sizeof(cpu_T2),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T2,cpu_T2,sizeof(cpu_T2),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T3,cpu_T3,sizeof(cpu_T3),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T3,cpu_T3,sizeof(cpu_T3),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T4,cpu_T4,sizeof(cpu_T4),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(T4,cpu_T4,sizeof(cpu_T4),0, cudaMemcpyHostToDevice);
} }
__host__ void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) __host__
void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
const int threadsperblock = 640; // 256
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
const int threadsperblock = 640; // Alignment mit mixtab Grösse. NICHT ÄNDERN size_t shared_size = 0;
// const int threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
//dim3 grid(1);
//dim3 block(1);
size_t shared_size =0;
m7_tiger192_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); m7_tiger192_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }
__host__
__host__ void tiger192_setBlock_120(void *pdata) void tiger192_setBlock_120(void *pdata)
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
uint8_t ending =0x01; uint8_t ending =0x01;
memcpy(PaddedMessage, pdata, 122); memcpy(PaddedMessage, pdata, 122);
memset(PaddedMessage+122,ending,1); memset(PaddedMessage+122,ending,1);
memset(PaddedMessage+123, 0, 5); //useless memset(PaddedMessage+123, 0, 5); //useless
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
#undef ROUND #undef ROUND
#undef MUL5 #undef MUL5
#undef MUL7 #undef MUL7
#undef MUL9 #undef MUL9
#define MUL5(x) ((x) * SPH_C64(5)) #define MUL5(x) ((x) * SPH_C64(5))
#define MUL7(x) ((x) * SPH_C64(7)) #define MUL7(x) ((x) * SPH_C64(7))
#define MUL9(x) ((x) * SPH_C64(9)) #define MUL9(x) ((x) * SPH_C64(9))
#define ROUND(a, b, c, x, mul) { \ #define ROUND(a, b, c, x, mul) { \
c ^= x; \ c ^= x; \
a = SPH_T64(a - (cpu_T1[c & 0xFF] ^ cpu_T2[(c >> 16) & 0xFF] \ a = SPH_T64(a - (cpu_T1[c & 0xFF] ^ cpu_T2[(c >> 16) & 0xFF] \
^ cpu_T3[(c >> 32) & 0xFF] ^ cpu_T4[(c >> 48) & 0xFF])); \ ^ cpu_T3[(c >> 32) & 0xFF] ^ cpu_T4[(c >> 48) & 0xFF])); \
@ -782,14 +785,16 @@ __host__ void tiger192_setBlock_120(void *pdata)
b = mul(b); \ b = mul(b); \
} }
uint64_t* alt_data = (uint64_t*) pdata; uint64_t* alt_data = (uint64_t*) pdata;
uint64_t in[8],buf[3]; uint64_t in[8],buf[3];
for (int i=0;i<8;i++) {in[i]= alt_data[i];}
for (int i=0;i<3;i++) {buf[i]=III[i];}
TIGER_ROUND_BODY(in, buf) for (int i=0;i<8;i++)
cudaMemcpyToSymbol( bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); in[i] = alt_data[i];
for (int i=0;i<3;i++)
buf[i] = III[i];
TIGER_ROUND_BODY(in, buf)
cudaMemcpyToSymbol(bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
} }

6
m7/m7.cu

@ -22,7 +22,7 @@ extern "C"
extern int device_map[8]; extern int device_map[8];
extern bool opt_benchmark; extern bool opt_benchmark;
static uint64_t *d_hash[8]; //static uint64_t *d_hash[8];
static uint64_t *FinalHash[8]; static uint64_t *FinalHash[8];
static uint64_t *KeccakH[8]; static uint64_t *KeccakH[8];
static uint64_t *WhirlpoolH[8]; static uint64_t *WhirlpoolH[8];
@ -112,11 +112,9 @@ extern "C" void m7_hash(void *state, const void *input,uint32_t TheNonce, int de
{ {
// sha256(sha256*sha512*keccak512*ripemd160*haval*tiger1*whirlpool) // sha256(sha256*sha512*keccak512*ripemd160*haval*tiger1*whirlpool)
char data_str[245], hash_str[65], target_str[65];
uint8_t *bdata = 0; uint8_t *bdata = 0;
mpz_t bns[7]; mpz_t bns[7];
mpz_t product; mpz_t product;
int rc = 0;
for(int i=0; i < 7; i++) { for(int i=0; i < 7; i++) {
mpz_init(bns[i]); mpz_init(bns[i]);
@ -292,7 +290,6 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata,
tiger192_setBlock_120((void*)pdata); tiger192_setBlock_120((void*)pdata);
cuda_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
uint32_t TheNonce = pdata[29];
do { do {
int order = 0; int order = 0;
@ -328,7 +325,6 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata,
MyStreamSynchronize(0,order++,thr_id); MyStreamSynchronize(0,order++,thr_id);
m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order);
MyStreamSynchronize(0,order++,thr_id); MyStreamSynchronize(0,order++,thr_id);
foundNonce = m7_sha256_cpu_hash_300(thr_id, throughput, pdata[29], NULL, d_prod1[thr_id], order); foundNonce = m7_sha256_cpu_hash_300(thr_id, throughput, pdata[29], NULL, d_prod1[thr_id], order);

42
m7/m7_keccak512.cu

@ -5,6 +5,8 @@
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
static __constant__ uint64_t stateo[25]; static __constant__ uint64_t stateo[25];
static __constant__ uint64_t RC[24]; static __constant__ uint64_t RC[24];
static const uint64_t cpu_RC[24] = { static const uint64_t cpu_RC[24] = {
@ -22,7 +24,9 @@ static const uint64_t cpu_RC[24] = {
0x0000000080000001ull, 0x8000000080008008ull 0x0000000080000001ull, 0x8000000080008008ull
}; };
static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) { __device__ __forceinline__
static void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i; size_t i;
uint64_t t[5], u[5], v, w; uint64_t t[5], u[5], v, w;
@ -136,8 +140,9 @@ static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t
} }
} }
__host__ __forceinline__
static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants) { static void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i; size_t i;
uint64_t t[5], u[5], v, w; uint64_t t[5], u[5], v, w;
@ -204,19 +209,12 @@ static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *kecca
} }
} }
__global__ /* __launch_bounds__(256, 2) */
void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
__global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
uint64_t state[25]; uint64_t state[25];
@ -236,39 +234,37 @@ __global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uin
keccak_block(state,RC); keccak_block(state,RC);
#pragma unroll 8 #pragma unroll 8
for (int i=0;i<8;i++) {outputHash[i*threads+thread]=state[i];} for (int i=0;i<8;i++) {
outputHash[i*threads+thread] = state[i];
}
} //thread } //thread
} }
void m7_keccak512_cpu_init(int thr_id, int threads) void m7_keccak512_cpu_init(int thr_id, int threads)
{ {
cudaMemcpyToSymbol( RC,cpu_RC,sizeof(cpu_RC),0,cudaMemcpyHostToDevice); cudaMemcpyToSymbol( RC,cpu_RC,sizeof(cpu_RC),0,cudaMemcpyHostToDevice);
} }
__host__ void m7_keccak512_setBlock_120(void *pdata) __host__ void m7_keccak512_setBlock_120(void *pdata)
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
uint8_t ending =0x01; uint8_t ending =0x01;
memcpy(PaddedMessage, pdata, 122); memcpy(PaddedMessage, pdata, 122);
memset(PaddedMessage+122,ending,1); memset(PaddedMessage+122,ending,1);
memset(PaddedMessage+123, 0, 5); memset(PaddedMessage+123, 0, 5);
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
uint64_t* alt_data = (uint64_t*) pdata; uint64_t* alt_data = (uint64_t*) pdata;
uint64_t state[25]; uint64_t state[25];
for(int i=0;i<25;i++) {state[i]=0;} for(int i=0;i<9;i++)
state[i] = alt_data[i];
for(int i=10;i<25;i++)
for (int i=0;i<9;i++) {state[i] ^= alt_data[i];} state[i] = 0;
keccak_block_host(state,cpu_RC); keccak_block_host(state,cpu_RC);
cudaMemcpyToSymbol(stateo, state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(stateo, state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
} }

Loading…
Cancel
Save