Browse Source

m7: forgot to indent some cuda files

and remove unused variables...
Tanguy Pruvot 10 years ago
parent
commit
fcd381cda2
  1. 518
      m7/cuda_m7_sha256.cu
  2. 479
      m7/cuda_mul2.cu
  3. 172
      m7/cuda_ripemd160.cu
  4. 141
      m7/cuda_tiger192.cu
  5. 40
      m7/m7.cu
  6. 54
      m7/m7_keccak512.cu

518
m7/cuda_m7_sha256.cu

@ -1,10 +1,10 @@ @@ -1,10 +1,10 @@
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
#include "sph/sph_types.h"
#include "cuda_helper.h"
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
//#define SPH_C64(x) ((uint64_t)(x ## ULL))
@ -67,28 +67,35 @@ static const uint32_t cpu_K[64] = { @@ -67,28 +67,35 @@ static const uint32_t cpu_K[64] = {
};
__device__ __forceinline__
static uint32_t bsg2_0(uint32_t x)
{
	// SHA-256 Sigma0(x) = ROTR(x,2) ^ ROTR(x,13) ^ ROTR(x,22)
	uint32_t r1 = SPH_ROTR32(x, 2);
	uint32_t r2 = SPH_ROTR32(x, 13);
	uint32_t r3 = SPH_ROTR32(x, 22);
	return xor3b(r1, r2, r3);
}
__device__ __forceinline__
static uint32_t bsg2_1(uint32_t x)
{
	// SHA-256 Sigma1(x) = ROTR(x,6) ^ ROTR(x,11) ^ ROTR(x,25)
	uint32_t r1 = SPH_ROTR32(x, 6);
	uint32_t r2 = SPH_ROTR32(x, 11);
	uint32_t r3 = SPH_ROTR32(x, 25);
	return xor3b(r1, r2, r3);
}
__device__ __forceinline__
static uint32_t ssg2_0(uint32_t x)
{
	// SHA-256 sigma0(x) = ROTR(x,7) ^ ROTR(x,18) ^ SHR(x,3)
	// (temporaries were uint64_t in the original — needlessly wide for 32-bit rotations)
	uint32_t r1 = SPH_ROTR32(x, 7);
	uint32_t r2 = SPH_ROTR32(x, 18);
	uint32_t r3 = shr_t32(x, 3);
	return xor3b(r1, r2, r3);
}
__device__ __forceinline__
static uint32_t ssg2_1(uint32_t x)
{
	// SHA-256 sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)
	// NOTE(review): the SHR(x,10) term was lost in the diff rendering; restored
	// from FIPS 180-4 and the matching host macro (ssg2_1 expansion) — verify upstream.
	uint32_t r1 = SPH_ROTR32(x, 17);
	uint32_t r2 = SPH_ROTR32(x, 19);
	uint32_t r3 = shr_t32(x, 10);
	return xor3b(r1, r2, r3);
}
__device__ __forceinline__
static void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
	uint32_t in,const uint32_t Kshared)
{
	// One SHA-256 round for the first 16 rounds (message word used as-is).
	// Only d and h are written; the caller rotates the a..h roles per round.
	uint32_t t1, t2;
	uint32_t vxandx = xandx(e, f, g);   // Ch(e,f,g)
	uint32_t bsg21 = bsg2_1(e);         // Sigma1(e)
	uint32_t bsg20 = bsg2_0(a);         // Sigma0(a)
	uint32_t andorv = andor32(a,b,c);   // Maj(a,b,c)
	t1 = h + bsg21 + vxandx + Kshared + in;
	t2 = bsg20 + andorv;
	d = d + t1;
	h = t1 + t2;
}
__host__ __forceinline__
static void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
	uint32_t in,const uint32_t Kshared)
{
	// Host mirror of sha2_step1 with the device helpers expanded as plain C.
	uint32_t t1, t2;
	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g);                // xandx(e, f, g)
	uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25);    // bsg2_1(e)
	uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22);    // bsg2_0(a)
	uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a));        // andor32(a,b,c)
	t1 = h + bsg21 + vxandx + Kshared + in;
	t2 = bsg20 + andorv;
	d = d + t1;
	h = t1 + t2;
}
__device__ __forceinline__
static void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
	uint32_t* in,uint32_t pc,const uint32_t Kshared)
{
	// One SHA-256 round for rounds 16..63: expands the message schedule in
	// place inside the rolling 16-word window `in` (W[t] = s1(W[t-2]) +
	// W[t-7] + s0(W[t-15]) + W[t-16]), then applies the compression step.
	uint32_t t1, t2;
	int pcidx1 = (pc-2) & 0xF;
	int pcidx2 = (pc-7) & 0xF;
	int pcidx3 = (pc-15) & 0xF;

	uint32_t inx0 = in[pc];
	uint32_t inx1 = in[pcidx1];
	uint32_t inx2 = in[pcidx2];
	uint32_t inx3 = in[pcidx3];

	uint32_t ssg21 = ssg2_1(inx1);
	uint32_t ssg20 = ssg2_0(inx3);
	uint32_t vxandx = xandx(e, f, g);   // Ch(e,f,g)
	uint32_t bsg21 = bsg2_1(e);         // Sigma1(e)
	uint32_t bsg20 = bsg2_0(a);         // Sigma0(a)
	uint32_t andorv = andor32(a,b,c);   // Maj(a,b,c)

	in[pc] = ssg21+inx2+ssg20+inx0;     // next schedule word, written back

	t1 = h + bsg21 + vxandx + Kshared + in[pc];
	t2 = bsg20 + andorv;
	d = d + t1;
	h = t1 + t2;
}
__host__ __forceinline__
static void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
	uint32_t* in,uint32_t pc,const uint32_t Kshared)
{
	// Host mirror of sha2_step2 with the device helpers expanded as plain C.
	uint32_t t1, t2;
	int pcidx1 = (pc-2) & 0xF;
	int pcidx2 = (pc-7) & 0xF;
	int pcidx3 = (pc-15) & 0xF;

	uint32_t inx0 = in[pc];
	uint32_t inx1 = in[pcidx1];
	uint32_t inx2 = in[pcidx2];
	uint32_t inx3 = in[pcidx3];

	uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); // ssg2_1(inx1)
	uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3);   // ssg2_0(inx3)
	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g);                              // xandx(e, f, g)
	uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25);                  // bsg2_1(e)
	uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22);                  // bsg2_0(a)
	uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a));                      // andor32(a,b,c)

	in[pc] = ssg21+inx2+ssg20+inx0;

	t1 = h + bsg21 + vxandx + Kshared + in[pc];
	t2 = bsg20 + andorv;
	d = d + t1;
	h = t1 + t2;
}
__device__ __forceinline__
static void sha2_round_body(uint32_t* in, uint32_t* r, const uint32_t* Kshared)
{
	// Full SHA-256 compression (64 rounds) of the 16-word block `in`,
	// updating the 8-word chaining state `r` in place. `in` is clobbered
	// (reused as the rolling message schedule by sha2_step2).
	// NOTE(review): loop rounds 3..12 were lost in the diff rendering and are
	// restored from the fully determined a..h rotation / K-index pattern —
	// verify against the upstream file.
	uint32_t a = r[0];
	uint32_t b = r[1];
	uint32_t c = r[2];
	uint32_t d = r[3];
	uint32_t e = r[4];
	uint32_t f = r[5];
	uint32_t g = r[6];
	uint32_t h = r[7];

	sha2_step1(a,b,c,d,e,f,g,h,in[0], Kshared[0]);
	sha2_step1(h,a,b,c,d,e,f,g,in[1], Kshared[1]);
	sha2_step1(g,h,a,b,c,d,e,f,in[2], Kshared[2]);
	sha2_step1(f,g,h,a,b,c,d,e,in[3], Kshared[3]);
	sha2_step1(e,f,g,h,a,b,c,d,in[4], Kshared[4]);
	sha2_step1(d,e,f,g,h,a,b,c,in[5], Kshared[5]);
	sha2_step1(c,d,e,f,g,h,a,b,in[6], Kshared[6]);
	sha2_step1(b,c,d,e,f,g,h,a,in[7], Kshared[7]);
	sha2_step1(a,b,c,d,e,f,g,h,in[8], Kshared[8]);
	sha2_step1(h,a,b,c,d,e,f,g,in[9], Kshared[9]);
	sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
	sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
	sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
	sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
	sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
	sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);

	#pragma unroll 3
	for (int i=0; i<3; i++)
	{
		sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
		sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
		sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
		sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
		sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
		sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
		sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
		sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
		sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
		sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
		sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
		sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
		sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
		sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
		sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
		sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
	}

	r[0] = r[0] + a;
	r[1] = r[1] + b;
	r[2] = r[2] + c;
	r[3] = r[3] + d;
	r[4] = r[4] + e;
	r[5] = r[5] + f;
	r[6] = r[6] + g;
	r[7] = r[7] + h;
}
__forceinline__
static void sha2_round_body_host(uint32_t* in, uint32_t* r, const uint32_t* Kshared)
{
	// Host mirror of sha2_round_body: full SHA-256 compression of `in`
	// into the chaining state `r`; `in` is clobbered.
	// NOTE(review): loop rounds 3..12 were lost in the diff rendering and are
	// restored from the determined rotation / K-index pattern — verify upstream.
	uint32_t a = r[0];
	uint32_t b = r[1];
	uint32_t c = r[2];
	uint32_t d = r[3];
	uint32_t e = r[4];
	uint32_t f = r[5];
	uint32_t g = r[6];
	uint32_t h = r[7];

	sha2_step1_host(a,b,c,d,e,f,g,h,in[0], Kshared[0]);
	sha2_step1_host(h,a,b,c,d,e,f,g,in[1], Kshared[1]);
	sha2_step1_host(g,h,a,b,c,d,e,f,in[2], Kshared[2]);
	sha2_step1_host(f,g,h,a,b,c,d,e,in[3], Kshared[3]);
	sha2_step1_host(e,f,g,h,a,b,c,d,in[4], Kshared[4]);
	sha2_step1_host(d,e,f,g,h,a,b,c,in[5], Kshared[5]);
	sha2_step1_host(c,d,e,f,g,h,a,b,in[6], Kshared[6]);
	sha2_step1_host(b,c,d,e,f,g,h,a,in[7], Kshared[7]);
	sha2_step1_host(a,b,c,d,e,f,g,h,in[8], Kshared[8]);
	sha2_step1_host(h,a,b,c,d,e,f,g,in[9], Kshared[9]);
	sha2_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
	sha2_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
	sha2_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
	sha2_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
	sha2_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
	sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]);

	for (int i=0; i<3; i++)
	{
		sha2_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
		sha2_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
		sha2_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
		sha2_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
		sha2_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
		sha2_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
		sha2_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
		sha2_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
		sha2_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
		sha2_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
		sha2_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
		sha2_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
		sha2_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
		sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
		sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
		sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
	}

	r[0] = r[0] + a;
	r[1] = r[1] + b;
	r[2] = r[2] + c;
	r[3] = r[3] + d;
	r[4] = r[4] + e;
	r[5] = r[5] + f;
	r[6] = r[6] + g;
	r[7] = r[7] + h;
}
__global__
void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
	// One thread per nonce. The first 64-byte block of the M7 header was
	// pre-hashed on the host into `pbuf`; this kernel hashes block 1
	// (message words 16..30 + the nonce) and the final padding block.
	// (Unused `union hash` and commented-out shared-K code removed.)
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t nounce = startNounce + thread;
		uint32_t buf[8];
		uint32_t in2[16] = {0};
		uint32_t in3[16] = {0};

		#pragma unroll 13
		for (int i=0; i<13; i++)
			in2[i] = cuda_swab32(c_PaddedMessage80[i+16]);
		in2[13] = cuda_swab32(nounce);
		in2[14] = cuda_swab32(c_PaddedMessage80[30]);
		in3[15] = 0x3d0; // message bit length (0x3d0 = 976 bits = 122 bytes)

		#pragma unroll 8
		for (int i=0; i<8; i++)
			buf[i] = pbuf[i]; // midstate after block 0, precomputed on host

		sha2_round_body(in2, buf, K);
		sha2_round_body(in3, buf, K);

		// store the 256-bit digest as 4 byte-swapped u64 words, strided per thread
		#pragma unroll 4
		for (int i=0; i<4; i++) {
			outputHash[i*threads+thread] = cuda_swab32ll(((uint64_t*)buf)[i]);
		}
	} // thread
}
__global__ void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce)
__global__
void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce)
{
/*
__shared__ uint32_t Kshared[64];
if (threadIdx.x < 64) {
Kshared[threadIdx.x]=K[threadIdx.x];
}
__syncthreads();
*/
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
union {
uint8_t h1[304];
uint32_t h4[76];
uint64_t h8[38];
} hash;
uint32_t in[16],buf[8];
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t in[16],buf[8];
#pragma unroll 8
for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);}
#pragma unroll 8
#pragma unroll 8
for (int i=0;i<8;i++) {buf[i] = H256[i];}
sha2_round_body(in,buf,K);
@ -415,96 +379,87 @@ uint64_t h8[38]; @@ -415,96 +379,87 @@ uint64_t h8[38];
#pragma unroll 5
for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);}
((uint64_t*)in)[5]= g_hash1[threads*(5+32)+thread];
((uint64_t*)in)[5] = g_hash1[threads*(5+32)+thread];
in[11]=0;
in[12]=0;
in[13]=0;
in[14]=0;
in[15]=0x968;
in[15]=0x968;
int it=0;
int it=0;
do {
in[15]-=8;
it++;
} while (((uint8_t*)in)[44-it]==0);
((uint8_t*)in)[44-it+1]=0x80;
do {
in[15]-=8;
it++;
} while (((uint8_t*)in)[44-it]==0);
((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]);
((uint8_t*)in)[44-it+1]=0x80;
sha2_round_body(in,buf,K);
((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]);
uint32_t nounce = startNounce +thread;
bool rc = false;
sha2_round_body(in,buf,K);
uint32_t nounce = startNounce +thread;
bool rc = false;
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) {
if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i]) {rc = true;} else {rc = false;}
// if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;}
if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i])
rc = true;
else
rc = false;
//if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;}
}
}
if(rc == true)
{
if(resNounce[0] > nounce)
resNounce[0] = nounce;
}
////
} // threads
if (rc && resNounce[0] > nounce)
resNounce[0] = nounce;
} // thread
}
__host__
void m7_sha256_cpu_init(int thr_id, int threads)
{
	// Copy the SHA-256 constant tables to GPU memory and allocate the
	// per-thread result nonce (device word + pinned host mirror).
	// `threads` is currently unused here.
	cudaMemcpyToSymbol(H256, cpu_H256, sizeof(cpu_H256), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
	cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t));
	cudaMallocHost(&d_mnounce[thr_id], 1*sizeof(uint32_t));
}
__host__
uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector, uint64_t *d_hash, int order)
{
	// Launch the final-hash kernel and return the lowest matching nonce
	// found (0xffffffff if none).
	const int threadsperblock = 384; // alignment with mixtab size — do not change
	uint32_t result = 0xffffffff;

	cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));

	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);
	size_t shared_size = 0;

	m7_sha256_gpu_hash_300<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]);

	// blocking copy on the default stream also waits for the kernel above
	cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
	MyStreamSynchronize(NULL, order, thr_id);
	result = *d_mnounce[thr_id];
	return result;
}
__host__
void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
	// Launch the 120-byte-header hash kernel; results land in d_outputHash.
	const int threadsperblock = 512; // alignment with mixtab size — do not change

	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);
	size_t shared_size = 0;

	m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
	MyStreamSynchronize(NULL, order, thr_id);
}
__host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful
__host__
void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful
{
unsigned char PaddedMessage[128];
uint8_t ending =0x80;
@ -527,6 +483,6 @@ __host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not use @@ -527,6 +483,6 @@ __host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not use
uint32_t in[16],buf[8];
for (int i=0;i<16;i++) {in[i]= host_swab32(alt_data[i]);}
for (int i=0;i<8;i++) {buf[i]= cpu_H256[i];}
sha2_round_body_host(in,buf,cpu_K);
cudaMemcpyToSymbol( pbuf, buf, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
sha2_round_body_host(in,buf,cpu_K);
cudaMemcpyToSymbol( pbuf, buf, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
}

479
m7/cuda_mul2.cu

@ -52,29 +52,29 @@ typedef struct t4_t{ @@ -52,29 +52,29 @@ typedef struct t4_t{
} t4_t;
__device__ __forceinline__
ulonglong2 umul64wide(unsigned long long int a,
                      unsigned long long int b)
{
	// Full 64x64 -> 128-bit unsigned multiply via 32-bit PTX mad/madc chains:
	// returns {x = low 64 bits, y = high 64 bits}.
	ulonglong2 res;
	asm ("{\n\t"
		".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t"
		"mov.b64 {alo,ahi}, %2; \n\t"
		"mov.b64 {blo,bhi}, %3; \n\t"
		"mul.lo.u32 r0, alo, blo; \n\t"
		"mul.hi.u32 r1, alo, blo; \n\t"
		"mad.lo.cc.u32 r1, alo, bhi, r1;\n\t"
		"madc.hi.u32 r2, alo, bhi, 0;\n\t"
		"mad.lo.cc.u32 r1, ahi, blo, r1;\n\t"
		"madc.hi.cc.u32 r2, ahi, blo, r2;\n\t"
		"madc.hi.u32 r3, ahi, bhi, 0;\n\t"
		"mad.lo.cc.u32 r2, ahi, bhi, r2;\n\t"
		"addc.u32 r3, r3, 0; \n\t"
		"mov.b64 %0, {r0,r1}; \n\t"
		"mov.b64 %1, {r2,r3}; \n\t"
		"}"
		: "=l"(res.x), "=l"(res.y)
		: "l"(a), "l"(b));
	return res;
}
#define umul_ppmm(h,l,m,n) \
@ -85,75 +85,76 @@ ulonglong2 umul64wide (unsigned long long int a, @@ -85,75 +85,76 @@ ulonglong2 umul64wide (unsigned long long int a,
}
__device__ __forceinline__
void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n)
{
	// 128x128 -> 256-bit unsigned multiply: {*h,*l} = m * n, built from
	// 32-bit PTX mad/madc carry chains. Inputs i12..i15 / i20..i23 are the
	// (zero) upper limbs of the operands and do not contribute.
	asm ("{\n\t"
		".reg .u32 o0, o1, o2, o3, o4; \n\t"
		".reg .u32 o5, o6, o7, i8, i9; \n\t"
		".reg .u32 i10, i11, i12, i13; \n\t"
		".reg .u32 i14, i15, i16, i17; \n\t"
		".reg .u32 i18, i19, i20, i21; \n\t"
		".reg .u32 i22, i23; \n\t"
		"mov.b64 { i8, i9}, %4; \n\t"
		"mov.b64 {i10,i11}, %5; \n\t"
		"mov.b64 {i12,i13}, %6; \n\t"
		"mov.b64 {i14,i15}, %7; \n\t"
		"mov.b64 {i16,i17}, %8; \n\t"
		"mov.b64 {i18,i19}, %9; \n\t"
		"mov.b64 {i20,i21},%10; \n\t"
		"mov.b64 {i22,i23},%11; \n\t"
		"mul.lo.u32 o0, i8, i16; \n\t"
		"mul.hi.u32 o1, i8, i16; \n\t"
		"mad.lo.cc.u32 o1, i8, i17, o1;\n\t"
		"madc.hi.u32 o2, i8, i17, 0;\n\t"
		"mad.lo.cc.u32 o1, i9, i16, o1;\n\t"
		"madc.hi.cc.u32 o2, i9, i16, o2;\n\t"
		"madc.hi.u32 o3, i8, i18, 0;\n\t"
		"mad.lo.cc.u32 o2, i8, i18, o2;\n\t"
		"madc.hi.cc.u32 o3, i9, i17, o3;\n\t"
		"madc.hi.u32 o4, i8, i19, 0;\n\t"
		"mad.lo.cc.u32 o2, i9, i17, o2;\n\t"
		"madc.hi.cc.u32 o3, i10, i16, o3;\n\t"
		"madc.hi.cc.u32 o4, i9, i18, o4;\n\t"
		"addc.u32 o5, 0, 0;\n\t"
		"mad.lo.cc.u32 o2, i10, i16, o2;\n\t"
		"madc.lo.cc.u32 o3, i8, i19, o3;\n\t"
		"madc.hi.cc.u32 o4, i10, i17, o4;\n\t"
		"madc.hi.cc.u32 o5, i9, i19, o5;\n\t"
		"addc.u32 o6, 0, 0;\n\t"
		"mad.lo.cc.u32 o3, i9, i18, o3;\n\t"
		"madc.hi.cc.u32 o4, i11, i16, o4;\n\t"
		"madc.hi.cc.u32 o5, i10, i18, o5;\n\t"
		"addc.u32 o6, 0, o6;\n\t"
		"mad.lo.cc.u32 o3, i10, i17, o3;\n\t"
		"addc.u32 o4, 0, o4;\n\t"
		"mad.hi.cc.u32 o5, i11, i17, o5;\n\t"
		"madc.hi.cc.u32 o6, i10, i19, o6;\n\t"
		"addc.u32 o7, 0, 0;\n\t"
		"mad.lo.cc.u32 o3, i11, i16, o3;\n\t"
		"madc.lo.cc.u32 o4, i9, i19, o4;\n\t"
		"addc.u32 o5, 0, o5;\n\t"
		"mad.hi.cc.u32 o6, i11, i18, o6;\n\t"
		"addc.u32 o7, 0, o7;\n\t"
		"mad.lo.cc.u32 o4, i10, i18, o4;\n\t"
		"addc.u32 o5, 0, o5;\n\t"
		"mad.hi.u32 o7, i11, i19, o7;\n\t"
		"mad.lo.cc.u32 o4, i11, i17, o4;\n\t"
		"addc.u32 o5, 0, o5;\n\t"
		"mad.lo.cc.u32 o5, i10, i19, o5;\n\t"
		"addc.u32 o6, 0, o6;\n\t"
		"mad.lo.cc.u32 o5, i11, i18, o5;\n\t"
		"addc.u32 o6, 0, o6;\n\t"
		"mad.lo.cc.u32 o6, i11, i19, o6;\n\t"
		"addc.u32 o7, 0, o7;\n\t"
		"mov.b64 %0, {o0,o1}; \n\t"
		"mov.b64 %1, {o2,o3}; \n\t"
		"mov.b64 %2, {o4,o5}; \n\t"
		"mov.b64 %3, {o6,o7}; \n\t"
		"}"
		: "=l"(l->low), "=l"(l->high), "=l"(h->low), "=l"(h->high)
		: "l"(m.low), "l"(m.high), "l"(0ULL), "l"(0ULL),
		  "l"(n.low), "l"(n.high), "l"(0ULL), "l"(0ULL));
}
#if 0
@ -187,55 +188,60 @@ __device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n){ @@ -187,55 +188,60 @@ __device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n){
#endif
__device__ __forceinline__
t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){
	// Load the idx-th 128-bit limb of this thread's number from the
	// thread-strided global array `g`. (Dead cuPrintf debug block removed.)
	t4_t ret;
	ret.high = g[(idx*2 + 1)*threads + thread];
	ret.low  = g[(idx*2)*threads + thread];
	return ret;
}
__device__ __forceinline__
void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){
	// Store a 128-bit limb back into the thread-strided global array `g`.
	// (Dead cuPrintf debug block removed.)
	g[(idx*2 + 1)*threads + thread] = val.high;
	g[(idx*2)*threads + thread] = val.low;
}
__device__ __forceinline__
void T4_set(t4_t *d, uint64_t v){
	// Initialize a 128-bit value from a 64-bit one (high limb zeroed).
	d->high = 0;
	d->low = v;
}
__device__ __forceinline__
t4_t T4_add(t4_t a, t4_t b){
	// 128-bit addition with carry between limbs; overflow out of the high
	// limb is discarded (mod 2^128).
	t4_t ret;
	uint32_t c = 0;
	ret.low = a.low + b.low;
	if(ret.low < a.low)
		c = 1; // carry out of the low limb
	ret.high = a.high + b.high + c;
	return ret;
}
__device__ __forceinline__
t4_t T4_add(uint64_t a, t4_t b){
	// 128-bit += 64-bit addition (overload): only the low limb of `a` exists.
	t4_t ret;
	uint32_t c = 0;
	ret.low = a + b.low;
	if(ret.low < a)
		c = 1; // carry out of the low limb
	ret.high = b.high + c;
	return ret;
}
__device__ __forceinline__
uint32_t T4_lt(t4_t a, t4_t b){
	// Unsigned 128-bit compare: returns 1 if a < b, else 0.
	// NOTE(review): the `return 1;` after the second test was lost in the
	// diff rendering; restored (the only value consistent with a < b).
	if(a.high < b.high)
		return 1;
	if(a.high == b.high && a.low < b.low)
		return 1;
	return 0;
}
__device__ __forceinline__
uint32_t T4_gt(t4_t a, uint64_t b){
	// Returns 1 if the 128-bit value a > the 64-bit value b, else 0.
	// NOTE(review): the function tail was lost in the diff rendering;
	// restored from the visible tests (any nonzero high limb wins).
	if(a.high)
		return 1;
	if(a.low > b)
		return 1;
	return 0;
}
__device__
void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){
	// Multiply the len-limb (128-bit limbs) number g_v by the scalar sml,
	// writing the product into g_p and its limb count into *size.
	t4_t ul, cl, hpl, lpl;
	uint32_t i;

	T4_set(&cl, 0);
	for(i=0; i < len; i++) {
		ul = T4(thread, threads, i, g_v);
		umul_ppmmT4(&hpl, &lpl, ul, sml);
		lpl = T4_add(lpl, cl);
		cl = T4_add(T4_lt(lpl, cl), hpl); // carry = overflow flag + high product
		T4_store(thread, threads, i, g_p, lpl);
	}
	T4_store(thread, threads, len, g_p, cl); // final carry limb
	*size = len + T4_gt(cl, 0);
}
__device__ void mulScalar(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, uint64_t sml, uint32_t *size){
	// 64-bit-limb variant of mulScalarT4: g_p = g_v * sml, *size = limb count.
	uint64_t ul, cl, hpl, lpl;
	uint32_t i;

	cl = 0;
	for(i=0; i < len; i++) {
		ul = g_v[i*threads + thread];
		umul_ppmm(hpl, lpl, ul, sml);
		lpl += cl;
		cl = (lpl < cl) + hpl; // carry = overflow flag + high product
		g_p[i*threads + thread] = lpl;
	}
	g_p[len*threads + thread] = cl; // final carry limb
	*size = len + (cl != 0);
}
uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a)
{
	// sum[sofst..sofst+xsz-1] += x[0..xsz-1] * a (64-bit limbs);
	// returns the final carry out of the top limb.
	uint64_t carry = 0;
	uint32_t i;
	uint64_t ul, lpl, hpl, rl;

	for(i=0; i < xsz; i++)
	{
		ul = x[i*threads + thread];
		umul_ppmm(hpl, lpl, ul, a);

		lpl += carry;
		carry = (lpl < carry) + hpl;

		rl = sum[(i+sofst) * threads + thread];
		lpl = rl + lpl;
		carry += lpl < rl;
		sum[(i+sofst)*threads + thread] = lpl;
	}
	return carry;
}
__device__
t4_t addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a)
{
	// 128-bit-limb variant of addmul_1g: sum[sofst..] += x[] * a;
	// returns the final 128-bit carry.
	t4_t carry;
	uint32_t i;
	t4_t ul, lpl, hpl, rl;

	T4_set(&carry, 0);
	for(i=0; i < xsz; i++)
	{
		ul = T4(thread, threads, i, x);
		umul_ppmmT4(&hpl, &lpl, ul, a);

		lpl = T4_add(lpl, carry);
		carry = T4_add(T4_lt(lpl, carry), hpl);

		rl = T4(thread, threads, i+sofst, sum);
		lpl = T4_add(rl, lpl);
		carry = T4_add(T4_lt(lpl, rl), carry);
		T4_store(thread, threads, i+sofst, sum, lpl);
	}
	return carry;
}
__global__
void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
{
	// Schoolbook multi-precision multiply (64-bit limbs), one big-number
	// product per thread: g_p = g_u * g_v, operands thread-strided.
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// ensure u is the longer operand
		if(ulegs < vlegs) {
			uint64_t t1 = ulegs;
			ulegs = vlegs;
			vlegs = t1;

			uint64_t *t2 = g_u;
			g_u = g_v;
			g_v = t2;
		}

		uint32_t vofst=1, rofst=1, psize=0;
		// first partial product initializes g_p
		mulScalar(thread, threads, ulegs, g_p, g_u, g_v[thread], &psize);
#if 1
		// accumulate the remaining partial products, one v-limb at a time
		while (vofst < vlegs) {
			g_p[(psize+0)*threads + thread] = 0; // clear the next high word
			g_p[(ulegs+rofst)*threads + thread] = addmul_1g(thread, threads, g_p, rofst, g_u, ulegs, g_v[vofst*threads+thread]);
			vofst++; rofst++;
			psize++;
		}
		// if(D_REF(rp->d,up->size[tid] + vp->size[tid] - 1,tid) != (uint64_t)0)
		//	rp->size[tid]++;
#endif
	}
}
/* Vectorized (t4_t) schoolbook multiplication: g_p = g_u * g_v, one full product per
 * thread, legs stored strided. Leg counts are given in 64-bit legs and halved below
 * for the packed t4_t representation. */
__global__
void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// make u the longer operand
		if (ulegs < vlegs) {
			uint64_t t1 = ulegs;
			ulegs = vlegs;
			vlegs = t1;

			uint64_t *t2 = g_u;
			g_u = g_v;
			g_v = t2;
		}

		// convert 64-bit leg counts to packed t4_t group counts
		ulegs >>= 1; vlegs >>= 1;

		uint32_t vofst, rofst = 1, psize = 0;

		// first partial product initializes g_p
		mulScalarT4(thread, threads, ulegs, g_p, g_u, T4(thread, threads, 0, g_v), &psize);

		t4_t zero;
		T4_set(&zero, 0);

		#pragma unroll
		for (vofst = 1; vofst < vlegs; vofst++)
		{
			T4_store(thread, threads, psize, g_p, zero);  // clear the high group
			T4_store(thread, threads, ulegs+rofst, g_p,
				addmul_1gT4(thread, threads, g_p, rofst, g_u, ulegs, T4(thread, threads, vofst, g_v)));
			rofst++;
			psize++;
		}
	}
}
// Wrap any CUDA runtime call: gpuErrchk(cudaMalloc(...));
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Print a readable diagnostic and (by default) abort when a CUDA API call fails.
// `file` is const-qualified: it always receives the __FILE__ string literal.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}
/* Host launcher for gpu_mul: one thread per big-number product.
 * alegs/blegs are the 64-bit leg counts of the two operands. */
__host__
void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order)
{
	const int threadsperblock = 512; // alignment with mixtab size - do not change

	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);

	size_t shared_size = 0;
	gpu_mul<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p);
}
/* Host launcher for the vectorized gpu_mulT4 kernel.
 * Note: operands are passed swapped (b first); the kernel re-swaps if needed. */
__host__
void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order)
{
	const int threadsperblock = 256; // better occupancy (for both 780 and 750 ti's)

	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);

	size_t shared_size = 0;
	gpu_mulT4<<<grid, block, shared_size>>>(threads, blegs, alegs, g_b, g_a, g_p);
}
// No device-side constants to upload for the multiply kernels (kept for API symmetry).
__host__
void mul_init()
{
}

172
m7/cuda_ripemd160.cu

@ -48,7 +48,7 @@ @@ -48,7 +48,7 @@
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
static __constant__ uint32_t gpu_IV[5];
static __constant__ uint32_t bufo[5];
static const uint32_t IV[5] = {
@ -282,118 +282,116 @@ static const uint32_t IV[5] = { @@ -282,118 +282,116 @@ static const uint32_t IV[5] = {
(h)[0] = tmp; \
}
/* RIPEMD-160 of the 122-byte (+nonce) M7 header. The first 64-byte block is
 * precomputed on the host into `bufo`; each thread patches its nonce into the
 * second block, runs the remaining two compression rounds, and writes a
 * 3x64-bit result, strided, into outputHash. */
__global__
void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t nounce = startNounce + thread;

		union {
			uint8_t h1[64];
			uint32_t h4[16];
			uint64_t h8[8];
		} hash;

#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) xor3(x,y,z)
#define F2(x, y, z) xandx(x,y,z)
#define F3(x, y, z) xornot64(x,y,z)
#define F4(x, y, z) xandx(z,x,y)
#define F5(x, y, z) xornt64(x,y,z)

		uint32_t in2[16], in3[16];
		uint32_t buf[5];

		// second message block (words 16..31): word 29 carries the nonce,
		// everything past word 30 is padding zeros
		#pragma unroll 16
		for (int i = 0; i < 16; i++) {
			if ((i+16) < 29)
				in2[i] = c_PaddedMessage80[i+16];
			else if ((i+16) == 29)
				in2[i] = nounce;
			else if ((i+16) == 30)
				in2[i] = c_PaddedMessage80[i+16];
			else
				in2[i] = 0;
		}

		// final block: only the message bit-length, 0x3d0 = 976 bits = 122 bytes
		#pragma unroll 16
		for (int i = 0; i < 16; i++)
			in3[i] = 0;
		in3[14] = 0x3d0;

		// resume from the host-precomputed midstate of block 1
		#pragma unroll 5
		for (int i = 0; i < 5; i++)
			buf[i] = bufo[i];

		RIPEMD160_ROUND_BODY(in2, buf);
		RIPEMD160_ROUND_BODY(in3, buf);

		hash.h4[5] = 0;
		#pragma unroll 5
		for (int i = 0; i < 5; i++)
			hash.h4[i] = buf[i];

		#pragma unroll 3
		for (int i = 0; i < 3; i++) {
			outputHash[i*threads + thread] = hash.h8[i];
		}
	}
}
// Upload the RIPEMD-160 initial chaining values to device constant memory.
void ripemd160_cpu_init(int thr_id, int threads)
{
	cudaMemcpyToSymbol(gpu_IV, IV, sizeof(IV), 0, cudaMemcpyHostToDevice);
}
/* Upload the padded 122-byte header to the device and precompute (on the host,
 * with the reference F1..F5 boolean functions) the midstate of the first
 * 64-byte RIPEMD-160 block into the `bufo` constant. */
__host__
void ripemd160_setBlock_120(void *pdata)
{
	unsigned char PaddedMessage[128];
	uint8_t ending = 0x80;  // RIPEMD/MD-style padding byte
	memcpy(PaddedMessage, pdata, 122);
	memset(PaddedMessage+122, ending, 1);
	memset(PaddedMessage+123, 0, 5);
	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);

#undef F1
#undef F2
#undef F3
#undef F4
#undef F5
#define F1(x, y, z) ((x) ^ (y) ^ (z))
#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define F3(x, y, z) (((x) | ~(y)) ^ (z))
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))

	uint32_t* alt_data = (uint32_t*)pdata;
	uint32_t in[16], buf[5];

	for (int i = 0; i < 16; i++)
		in[i] = alt_data[i];
	for (int i = 0; i < 5; i++)
		buf[i] = IV[i];

	// first block is nonce-independent: compute it once here instead of per thread
	RIPEMD160_ROUND_BODY(in, buf);

	cudaMemcpyToSymbol(bufo, buf, 5*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
}
__host__ void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
__host__
void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
const int threadsperblock = 256;
const int threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
//dim3 grid(1);
//dim3 block(1);
size_t shared_size =0;
m7_ripemd160_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);

141
m7/cuda_tiger192.cu

@ -50,11 +50,13 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t @@ -50,11 +50,13 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
__constant__ uint64_t bufo[3];
static __constant__ uint64_t gpu_III[3];
static __constant__ uint64_t T1[256];
static __constant__ uint64_t T2[256];
static __constant__ uint64_t T3[256];
static __constant__ uint64_t T4[256];
static const uint64_t III[3] = {
SPH_C64(0x0123456789ABCDEF),SPH_C64(0xFEDCBA9876543210),SPH_C64(0xF096A5B4C3B2E187)
};
@ -583,16 +585,16 @@ static const uint64_t cpu_T4[256] = { @@ -583,16 +585,16 @@ static const uint64_t cpu_T4[256] = {
SPH_C64(0xC83223F1720AEF96), SPH_C64(0xC3A0396F7363A51F)
};
// One Tiger pass: eight ROUNDs cycling the (a,b,c) registers over words X0..X7.
#define PASS(a, b, c, mul) { \
		ROUND(a, b, c, X0, mul); \
		ROUND(b, c, a, X1, mul); \
		ROUND(c, a, b, X2, mul); \
		ROUND(a, b, c, X3, mul); \
		ROUND(b, c, a, X4, mul); \
		ROUND(c, a, b, X5, mul); \
		ROUND(a, b, c, X6, mul); \
		ROUND(b, c, a, X7, mul); \
	}
#define MUL5(x) SPH_T64((x) * SPH_C64(5))
#define MUL7(x) SPH_T64((x) * SPH_C64(7))
@ -649,29 +651,24 @@ static const uint64_t cpu_T4[256] = { @@ -649,29 +651,24 @@ static const uint64_t cpu_T4[256] = {
(r)[2] = SPH_T64(C + (r)[2]); \
}
__global__ void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
__global__
void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
__shared__ uint64_t sharedMem[1024];
__shared__ uint64_t sharedMem[1024];
if(threadIdx.x < 256)
{
if(threadIdx.x < 256) {
sharedMem[threadIdx.x] = T1[threadIdx.x];
sharedMem[threadIdx.x+256] = T2[threadIdx.x];
sharedMem[threadIdx.x+512] = T3[threadIdx.x];
sharedMem[threadIdx.x+768] = T4[threadIdx.x];
}
__syncthreads();
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
/*
#undef MUL5
#undef MUL7
@ -680,7 +677,7 @@ uint64_t h8[8]; @@ -680,7 +677,7 @@ uint64_t h8[8];
#define MUL7(x) mul(x,7)
#define MUL9(x) mul(x,9)
*/
#define PASS(a, b, c, mul) { \
#define PASS(a, b, c, mul) { \
ROUND(a, b, c, X0, mul); \
ROUND(b, c, a, X1, mul); \
ROUND(c, a, b, X2, mul); \
@ -691,89 +688,95 @@ uint64_t h8[8]; @@ -691,89 +688,95 @@ uint64_t h8[8];
ROUND(b, c, a, X7, mul); \
}
#define ROUND(a, b, c, x, mul) { \
#define ROUND(a, b, c, x, mul) { \
c ^= x; \
a = SPH_T64(a - (sharedMem[c & 0xFF] ^ sharedMem[((c >> 16) & 0xFF)+256] \
^ sharedMem[((c >> 32) & 0xFF)+512] ^ sharedMem[((c >> 48) & 0xFF)+768])); \
^ sharedMem[((c >> 32) & 0xFF)+512] ^ sharedMem[((c >> 48) & 0xFF)+768])); \
b = SPH_T64(b + (sharedMem[((c >> 8) & 0xFF)+768] ^ sharedMem[((c >> 24) & 0xFF)+512] \
^ sharedMem[((c >> 40) & 0xFF)+256] ^ sharedMem[(c >> 56) & 0xFF])); \
b = mul(b); \
}
uint64_t in[8],buf[3];
uint64_t in2[8],in3[8];
uint64_t in2[8];
#pragma unroll 8
for (int i=0;i<8;i++) {in2[i]= c_PaddedMessage80[i+8];}
for (int i=0; i<8; i++)
in2[i] = c_PaddedMessage80[i+8];
uint32_t* Mess = (uint32_t*)in2;
Mess[13]=nounce;
Mess[13] = nounce;
uint64_t in3[8];
#pragma unroll 8
for (int i=0;i<8;i++) {in3[i]=0;}
for (int i=0; i<8; i++)
in3[i]=0;
in3[7]=0x3d0;
#pragma unroll 3
for (int i=0;i<3;i++) {buf[i]=bufo[i];}
uint64_t buf[3];
#pragma unroll 3
for (int i=0; i<3; i++)
buf[i]=bufo[i];
TIGER_ROUND_BODY(in2, buf);
TIGER_ROUND_BODY(in3, buf);
TIGER_ROUND_BODY(in2, buf);
TIGER_ROUND_BODY(in3, buf);
#pragma unroll 8
for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=buf[i];} else {outputHash[i*threads+thread]=0;}}
} //// threads
#pragma unroll 8
for (int i=0;i<8;i++) {
if (i<3) {
outputHash[i*threads+thread] = buf[i];
} else {
outputHash[i*threads+thread] = 0;
}
}
} // thread
}
// Upload the Tiger-192 initial state (III) and the four 256-entry S-box
// tables to device constant memory. Must run once per device before any
// m7_tiger192_cpu_hash_120 call.
__host__
void tiger192_cpu_init(int thr_id, int threads)
{
	cudaMemcpyToSymbol(gpu_III,III,sizeof(III),0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(T1,cpu_T1,sizeof(cpu_T1),0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(T2,cpu_T2,sizeof(cpu_T2),0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(T3,cpu_T3,sizeof(cpu_T3),0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(T4,cpu_T4,sizeof(cpu_T4),0, cudaMemcpyHostToDevice);
}
/* Host launcher for the Tiger-192 kernel over `threads` nonces, then waits on
 * the stream so later pipeline stages see the results. */
__host__
void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
	const int threadsperblock = 640; // kernel assumes blockDim.x >= 256 for S-box preload

	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);

	size_t shared_size = 0;
	m7_tiger192_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
	MyStreamSynchronize(NULL, order, thr_id);
}
__host__ void tiger192_setBlock_120(void *pdata)
__host__
void tiger192_setBlock_120(void *pdata)
{
unsigned char PaddedMessage[128];
uint8_t ending =0x01;
memcpy(PaddedMessage, pdata, 122);
memset(PaddedMessage+122,ending,1);
memset(PaddedMessage+123, 0, 5); //useless
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
#undef ROUND
#undef MUL5
#undef MUL7
#undef MUL9
#define MUL5(x) ((x) * SPH_C64(5))
#define MUL7(x) ((x) * SPH_C64(7))
#define MUL9(x) ((x) * SPH_C64(9))
#undef ROUND
#undef MUL5
#undef MUL7
#undef MUL9
#define MUL5(x) ((x) * SPH_C64(5))
#define MUL7(x) ((x) * SPH_C64(7))
#define MUL9(x) ((x) * SPH_C64(9))
#define ROUND(a, b, c, x, mul) { \
#define ROUND(a, b, c, x, mul) { \
c ^= x; \
a = SPH_T64(a - (cpu_T1[c & 0xFF] ^ cpu_T2[(c >> 16) & 0xFF] \
^ cpu_T3[(c >> 32) & 0xFF] ^ cpu_T4[(c >> 48) & 0xFF])); \
@ -782,14 +785,16 @@ __host__ void tiger192_setBlock_120(void *pdata) @@ -782,14 +785,16 @@ __host__ void tiger192_setBlock_120(void *pdata)
b = mul(b); \
}
uint64_t* alt_data = (uint64_t*) pdata;
uint64_t in[8],buf[3];
for (int i=0;i<8;i++) {in[i]= alt_data[i];}
for (int i=0;i<3;i++) {buf[i]=III[i];}
uint64_t in[8],buf[3];
for (int i=0;i<8;i++)
in[i] = alt_data[i];
TIGER_ROUND_BODY(in, buf)
cudaMemcpyToSymbol( bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
for (int i=0;i<3;i++)
buf[i] = III[i];
TIGER_ROUND_BODY(in, buf)
cudaMemcpyToSymbol(bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
}

40
m7/m7.cu

@ -22,7 +22,7 @@ extern "C" @@ -22,7 +22,7 @@ extern "C"
extern int device_map[8];
extern bool opt_benchmark;
static uint64_t *d_hash[8];
//static uint64_t *d_hash[8];
static uint64_t *FinalHash[8];
static uint64_t *KeccakH[8];
static uint64_t *WhirlpoolH[8];
@ -112,11 +112,9 @@ extern "C" void m7_hash(void *state, const void *input,uint32_t TheNonce, int de @@ -112,11 +112,9 @@ extern "C" void m7_hash(void *state, const void *input,uint32_t TheNonce, int de
{
// sha256(sha256*sha512*keccak512*ripemd160*haval*tiger1*whirlpool)
char data_str[245], hash_str[65], target_str[65];
uint8_t *bdata = 0;
mpz_t bns[7];
mpz_t product;
int rc = 0;
for(int i=0; i < 7; i++) {
mpz_init(bns[i]);
@ -292,44 +290,42 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata, @@ -292,44 +290,42 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata,
tiger192_setBlock_120((void*)pdata);
cuda_check_cpu_setTarget(ptarget);
uint32_t TheNonce = pdata[29];
do {
int order = 0;
uint32_t foundNonce;
m7_sha256_cpu_hash_120(thr_id, throughput, pdata[29], Sha256H[thr_id], order++);
m7_sha256_cpu_hash_120(thr_id, throughput, pdata[29], Sha256H[thr_id], order++);
m7_sha512_cpu_hash_120(thr_id, throughput, pdata[29], Sha512H[thr_id], order++);
m7_sha512_cpu_hash_120(thr_id, throughput, pdata[29], Sha512H[thr_id], order++);
m7_keccak512_cpu_hash(thr_id, throughput, pdata[29], KeccakH[thr_id], order++);
m7_keccak512_cpu_hash(thr_id, throughput, pdata[29], KeccakH[thr_id], order++);
m7_haval256_cpu_hash_120(thr_id, throughput, pdata[29], HavalH[thr_id], order++);
m7_tiger192_cpu_hash_120(thr_id, throughput, pdata[29], TigerH[thr_id], order++);
m7_ripemd160_cpu_hash_120(thr_id, throughput, pdata[29], RipemdH[thr_id], order++);
m7_ripemd160_cpu_hash_120(thr_id, throughput, pdata[29], RipemdH[thr_id], order++);
m7_whirlpool512_cpu_hash_120(thr_id, throughput, pdata[29], WhirlpoolH[thr_id], order++);
m7_whirlpool512_cpu_hash_120(thr_id, throughput, pdata[29], WhirlpoolH[thr_id], order++);
cpu_mulT4(0, throughput, 8, 8, Sha512H[thr_id], KeccakH[thr_id], d_prod0[thr_id],order); //64
MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput, 8, 8, Sha512H[thr_id], KeccakH[thr_id], d_prod0[thr_id],order); //64
MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput,8, 16, WhirlpoolH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //128
MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput,8, 16, WhirlpoolH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //128
MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput, 4, 24, Sha256H[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); //96
MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput, 4, 24, Sha256H[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); //96
MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput, 4, 28, HavalH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //112
MyStreamSynchronize(0,order++,thr_id);
cpu_mulT4(0, throughput, 4, 28, HavalH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //112
MyStreamSynchronize(0,order++,thr_id);
m7_bigmul_unroll1_cpu(0, throughput, TigerH[thr_id], d_prod1[thr_id], d_prod0[thr_id],order);
MyStreamSynchronize(0,order++,thr_id);
m7_bigmul_unroll1_cpu(0, throughput, TigerH[thr_id], d_prod1[thr_id], d_prod0[thr_id],order);
MyStreamSynchronize(0,order++,thr_id);
m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order);
MyStreamSynchronize(0,order++,thr_id);
m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order);
MyStreamSynchronize(0,order++,thr_id);
foundNonce = m7_sha256_cpu_hash_300(thr_id, throughput, pdata[29], NULL, d_prod1[thr_id], order);
if (foundNonce != 0xffffffff)

54
m7/m7_keccak512.cu

@ -5,6 +5,8 @@ @@ -5,6 +5,8 @@
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
static __constant__ uint64_t stateo[25];
static __constant__ uint64_t RC[24];
static const uint64_t cpu_RC[24] = {
@ -22,7 +24,9 @@ static const uint64_t cpu_RC[24] = { @@ -22,7 +24,9 @@ static const uint64_t cpu_RC[24] = {
0x0000000080000001ull, 0x8000000080008008ull
};
static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) {
__device__ __forceinline__
static void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint64_t t[5], u[5], v, w;
@ -136,8 +140,9 @@ static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t @@ -136,8 +140,9 @@ static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t
}
}
static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants) {
__host__ __forceinline__
static void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint64_t t[5], u[5], v, w;
@ -204,25 +209,18 @@ static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *kecca @@ -204,25 +209,18 @@ static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *kecca
}
}
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
__global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
__global__ /* __launch_bounds__(256, 2) */
void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
uint64_t state[25];
uint64_t state[25];
#pragma unroll 16
for (int i=9;i<25;i++) {state[i]=stateo[i];}
for (int i=9;i<25;i++) {state[i]=stateo[i];}
state[0] = xor1(stateo[0],c_PaddedMessage80[9]);
state[1] = xor1(stateo[1],c_PaddedMessage80[10]);
@ -236,39 +234,37 @@ __global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uin @@ -236,39 +234,37 @@ __global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uin
keccak_block(state,RC);
#pragma unroll 8
for (int i=0;i<8;i++) {outputHash[i*threads+thread]=state[i];}
#pragma unroll 8
for (int i=0;i<8;i++) {
outputHash[i*threads+thread] = state[i];
}
} //thread
}
// Upload the 24 Keccak round constants to device constant memory.
// Must run once per device before any m7_keccak512 kernel launch.
void m7_keccak512_cpu_init(int thr_id, int threads)
{
	cudaMemcpyToSymbol( RC,cpu_RC,sizeof(cpu_RC),0,cudaMemcpyHostToDevice);
}
/* Upload the padded 122-byte header and precompute, on the host, the Keccak
 * state after absorbing the first rate block (Keccak-512 rate = 72 bytes =
 * 9 x 64-bit words). The GPU kernel resumes from `stateo`. */
__host__ void m7_keccak512_setBlock_120(void *pdata)
{
	unsigned char PaddedMessage[128];
	uint8_t ending = 0x01;  // Keccak (pre-SHA-3) padding byte
	memcpy(PaddedMessage, pdata, 122);
	memset(PaddedMessage+122, ending, 1);
	memset(PaddedMessage+123, 0, 5);
	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);

	uint64_t* alt_data = (uint64_t*)pdata;
	uint64_t state[25];
	// absorbing into an all-zero state: state[i] ^= data[i] reduces to assignment
	for (int i = 0; i < 9; i++)
		state[i] = alt_data[i];
	// FIX: start at 9, not 10 — word 9 (first capacity word) must be zeroed,
	// otherwise the midstate depends on uninitialized stack memory
	for (int i = 9; i < 25; i++)
		state[i] = 0;
	keccak_block_host(state, cpu_RC);
	cudaMemcpyToSymbol(stateo, state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
}

Loading…
Cancel
Save