|
|
@ -1,10 +1,10 @@ |
|
|
|
#include <stdio.h> |
|
|
|
#include <stdio.h> |
|
|
|
#include <memory.h> |
|
|
|
#include <memory.h> |
|
|
|
|
|
|
|
|
|
|
|
#include "cuda_helper.h" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "sph/sph_types.h" |
|
|
|
#include "sph/sph_types.h" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "cuda_helper.h" |
|
|
|
|
|
|
|
|
|
|
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
|
|
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
|
|
|
|
|
|
|
|
|
|
|
//#define SPH_C64(x) ((uint64_t)(x ## ULL)) |
|
|
|
//#define SPH_C64(x) ((uint64_t)(x ## ULL)) |
|
|
@ -67,28 +67,35 @@ static const uint32_t cpu_K[64] = { |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__
static uint32_t bsg2_0(uint32_t x)
{
	// SHA-256 big Sigma0: ROTR(x,2) ^ ROTR(x,13) ^ ROTR(x,22),
	// folded through the 3-input xor helper (maps to LOP3 on SM50+).
	return xor3b(SPH_ROTR32(x, 2), SPH_ROTR32(x, 13), SPH_ROTR32(x, 22));
}
|
|
|
__device__ __forceinline__
static uint32_t bsg2_1(uint32_t x)
{
	// SHA-256 big Sigma1: ROTR(x,6) ^ ROTR(x,11) ^ ROTR(x,25),
	// folded through the 3-input xor helper (maps to LOP3 on SM50+).
	return xor3b(SPH_ROTR32(x, 6), SPH_ROTR32(x, 11), SPH_ROTR32(x, 25));
}
|
|
|
__device__ __forceinline__
static uint32_t ssg2_0(uint32_t x)
{
	// SHA-256 small sigma0: ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3).
	// Fix: the intermediates were declared uint64_t even though every
	// operand and the return type are 32-bit (the sibling bsg2_* helpers
	// use uint32_t); 32-bit temporaries avoid needless register pairs.
	uint32_t r1 = SPH_ROTR32(x, 7);
	uint32_t r2 = SPH_ROTR32(x, 18);
	uint32_t r3 = shr_t32(x, 3);	// logical shift right, truncated to 32 bits
	return xor3b(r1, r2, r3);
}
|
|
|
__device__ __forceinline__
static uint32_t ssg2_1(uint32_t x)
{
	// SHA-256 small sigma1: ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10).
	// NOTE(review): the r3 line was lost in this chunk; reconstructed from
	// the host-side mirror in sha2_step2_host, which computes
	// ROTR(inx1,17) ^ ROTR(inx1,19) ^ SPH_T32(inx1 >> 10) as "ssg2_1".
	// Also fixed: intermediates were uint64_t in a fully 32-bit function.
	uint32_t r1 = SPH_ROTR32(x, 17);
	uint32_t r2 = SPH_ROTR32(x, 19);
	uint32_t r3 = shr_t32(x, 10);
	return xor3b(r1, r2, r3);
}
|
|
|
|
|
|
|
|
|
|
|
static __device__ __forceinline__ void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, |
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
static void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, |
|
|
|
uint32_t in,const uint32_t Kshared) |
|
|
|
uint32_t in,const uint32_t Kshared) |
|
|
|
{ |
|
|
|
{ |
|
|
|
uint32_t t1,t2; |
|
|
|
uint32_t t1,t2; |
|
|
@ -111,12 +119,10 @@ d = d + t1; |
|
|
|
h = t1 + t2; |
|
|
|
h = t1 + t2; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
static __forceinline__ void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, |
|
|
|
__host__ __forceinline__ |
|
|
|
|
|
|
|
static void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, |
|
|
|
uint32_t in,const uint32_t Kshared) |
|
|
|
uint32_t in,const uint32_t Kshared) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t t1,t2; |
|
|
|
uint32_t t1,t2; |
|
|
|
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); |
|
|
|
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); |
|
|
|
uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); |
|
|
|
uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); |
|
|
@ -129,7 +135,8 @@ d = d + t1; |
|
|
|
h = t1 + t2; |
|
|
|
h = t1 + t2; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
static __device__ __forceinline__ void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, |
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
static void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, |
|
|
|
uint32_t* in,uint32_t pc,const uint32_t Kshared) |
|
|
|
uint32_t* in,uint32_t pc,const uint32_t Kshared) |
|
|
|
{ |
|
|
|
{ |
|
|
|
uint32_t t1,t2; |
|
|
|
uint32_t t1,t2; |
|
|
@ -156,10 +163,10 @@ t1 = h + bsg21 + vxandx + Kshared + in[pc]; |
|
|
|
t2 = bsg20 + andorv; |
|
|
|
t2 = bsg20 + andorv; |
|
|
|
d = d + t1; |
|
|
|
d = d + t1; |
|
|
|
h = t1 + t2; |
|
|
|
h = t1 + t2; |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
static __forceinline__ void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, |
|
|
|
__host__ __forceinline__ |
|
|
|
|
|
|
|
static void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h, |
|
|
|
uint32_t* in,uint32_t pc,const uint32_t Kshared) |
|
|
|
uint32_t* in,uint32_t pc,const uint32_t Kshared) |
|
|
|
{ |
|
|
|
{ |
|
|
|
uint32_t t1,t2; |
|
|
|
uint32_t t1,t2; |
|
|
@ -172,7 +179,6 @@ uint32_t inx1 = in[pcidx1]; |
|
|
|
uint32_t inx2 = in[pcidx2]; |
|
|
|
uint32_t inx2 = in[pcidx2]; |
|
|
|
uint32_t inx3 = in[pcidx3]; |
|
|
|
uint32_t inx3 = in[pcidx3]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); |
|
|
|
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); |
|
|
|
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); |
|
|
|
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); |
|
|
|
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); |
|
|
|
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); |
|
|
@ -186,14 +192,12 @@ t1 = h + bsg21 + vxandx + Kshared + in[pc]; |
|
|
|
t2 = bsg20 + andorv; |
|
|
|
t2 = bsg20 + andorv; |
|
|
|
d = d + t1; |
|
|
|
d = d + t1; |
|
|
|
h = t1 + t2; |
|
|
|
h = t1 + t2; |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared) |
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
static void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t a=r[0]; |
|
|
|
uint32_t a=r[0]; |
|
|
|
uint32_t b=r[1]; |
|
|
|
uint32_t b=r[1]; |
|
|
|
uint32_t c=r[2]; |
|
|
|
uint32_t c=r[2]; |
|
|
@ -221,8 +225,8 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r |
|
|
|
sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]); |
|
|
|
sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]); |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 3 |
|
|
|
#pragma unroll 3 |
|
|
|
for (int i=0;i<3;i++) { |
|
|
|
for (int i=0;i<3;i++) |
|
|
|
|
|
|
|
{ |
|
|
|
sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); |
|
|
|
sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); |
|
|
|
sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); |
|
|
|
sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); |
|
|
|
sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); |
|
|
|
sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); |
|
|
@ -239,11 +243,8 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r |
|
|
|
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); |
|
|
|
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); |
|
|
|
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); |
|
|
|
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); |
|
|
|
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); |
|
|
|
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
r[0] = r[0] + a; |
|
|
|
r[0] = r[0] + a; |
|
|
|
r[1] = r[1] + b; |
|
|
|
r[1] = r[1] + b; |
|
|
|
r[2] = r[2] + c; |
|
|
|
r[2] = r[2] + c; |
|
|
@ -254,10 +255,9 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r |
|
|
|
r[7] = r[7] + h; |
|
|
|
r[7] = r[7] + h; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared) |
|
|
|
__forceinline__ |
|
|
|
|
|
|
|
static void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t a=r[0]; |
|
|
|
uint32_t a=r[0]; |
|
|
|
uint32_t b=r[1]; |
|
|
|
uint32_t b=r[1]; |
|
|
|
uint32_t c=r[2]; |
|
|
|
uint32_t c=r[2]; |
|
|
@ -285,8 +285,8 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const |
|
|
|
sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]); |
|
|
|
sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]); |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 3 |
|
|
|
#pragma unroll 3 |
|
|
|
for (int i=0;i<3;i++) { |
|
|
|
for (int i=0;i<3;i++) |
|
|
|
|
|
|
|
{ |
|
|
|
sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); |
|
|
|
sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]); |
|
|
|
sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); |
|
|
|
sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]); |
|
|
|
sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); |
|
|
|
sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]); |
|
|
@ -303,7 +303,6 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const |
|
|
|
sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); |
|
|
|
sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); |
|
|
|
sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); |
|
|
|
sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); |
|
|
|
sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); |
|
|
|
sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
r[0] = r[0] + a; |
|
|
|
r[0] = r[0] + a; |
|
|
@ -316,27 +315,12 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const |
|
|
|
r[7] = r[7] + h; |
|
|
|
r[7] = r[7] + h; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__global__ |
|
|
|
__global__ void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) |
|
|
|
void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash) |
|
|
|
{ |
|
|
|
{ |
|
|
|
/* |
|
|
|
|
|
|
|
__shared__ uint32_t Kshared[64]; |
|
|
|
|
|
|
|
if (threadIdx.x < 64) { |
|
|
|
|
|
|
|
Kshared[threadIdx.x]=K[threadIdx.x]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
union { |
|
|
|
|
|
|
|
uint8_t h1[64]; |
|
|
|
|
|
|
|
uint32_t h4[16]; |
|
|
|
|
|
|
|
uint64_t h8[8]; |
|
|
|
|
|
|
|
} hash; |
|
|
|
|
|
|
|
//uint32_t buf[8]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
if (thread < threads) |
|
|
|
if (thread < threads) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
uint32_t nounce = startNounce + thread ; // original implementation |
|
|
|
uint32_t nounce = startNounce + thread ; // original implementation |
|
|
|
|
|
|
|
|
|
|
|
uint32_t buf[8]; |
|
|
|
uint32_t buf[8]; |
|
|
@ -344,56 +328,36 @@ uint64_t h8[8]; |
|
|
|
uint32_t in3[16]={0}; |
|
|
|
uint32_t in3[16]={0}; |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 13 |
|
|
|
#pragma unroll 13 |
|
|
|
for (int i=0;i<13;i++) {in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);} |
|
|
|
for (int i=0; i<13; i++) |
|
|
|
|
|
|
|
in2[i]= cuda_swab32(c_PaddedMessage80[i+16]); |
|
|
|
|
|
|
|
|
|
|
|
in2[13]=cuda_swab32(nounce); |
|
|
|
in2[13]=cuda_swab32(nounce); |
|
|
|
in2[14]=cuda_swab32(c_PaddedMessage80[30]); |
|
|
|
in2[14]=cuda_swab32(c_PaddedMessage80[30]); |
|
|
|
|
|
|
|
|
|
|
|
in3[15]=0x3d0; |
|
|
|
in3[15]=0x3d0; |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
#pragma unroll 8 |
|
|
|
for (int i=0;i<8;i++) {buf[i]= pbuf[i];} |
|
|
|
for (int i=0; i<8; i++) |
|
|
|
|
|
|
|
buf[i] = pbuf[i]; |
|
|
|
|
|
|
|
|
|
|
|
sha2_round_body(in2,buf,K); |
|
|
|
sha2_round_body(in2,buf,K); |
|
|
|
sha2_round_body(in3,buf,K); |
|
|
|
sha2_round_body(in3,buf,K); |
|
|
|
//#pragma unroll 8 |
|
|
|
|
|
|
|
//for (int i=0;i<8;i++) {hash.h4[i]=cuda_swab32(buf[i]);} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 4 |
|
|
|
#pragma unroll 4 |
|
|
|
for (int i=0;i<4;i++) {outputHash[i*threads+thread]=cuda_swab32ll(((uint64_t*)buf)[i]);} |
|
|
|
for (int i=0; i<4; i++) { |
|
|
|
|
|
|
|
outputHash[i*threads+thread] = cuda_swab32ll(((uint64_t*)buf)[i]); |
|
|
|
|
|
|
|
} |
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////// |
|
|
|
} // thread |
|
|
|
} // threads |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__global__ |
|
|
|
__global__ void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce) |
|
|
|
void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce) |
|
|
|
{ |
|
|
|
{ |
|
|
|
/* |
|
|
|
|
|
|
|
__shared__ uint32_t Kshared[64]; |
|
|
|
|
|
|
|
if (threadIdx.x < 64) { |
|
|
|
|
|
|
|
Kshared[threadIdx.x]=K[threadIdx.x]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
if (thread < threads) |
|
|
|
if (thread < threads) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
union { |
|
|
|
|
|
|
|
uint8_t h1[304]; |
|
|
|
|
|
|
|
uint32_t h4[76]; |
|
|
|
|
|
|
|
uint64_t h8[38]; |
|
|
|
|
|
|
|
} hash; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t in[16],buf[8]; |
|
|
|
uint32_t in[16],buf[8]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
#pragma unroll 8 |
|
|
|
for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);} |
|
|
|
for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);} |
|
|
|
#pragma unroll 8 |
|
|
|
#pragma unroll 8 |
|
|
@ -415,20 +379,22 @@ uint64_t h8[38]; |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 5 |
|
|
|
#pragma unroll 5 |
|
|
|
for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);} |
|
|
|
for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);} |
|
|
|
|
|
|
|
|
|
|
|
((uint64_t*)in)[5] = g_hash1[threads*(5+32)+thread]; |
|
|
|
((uint64_t*)in)[5] = g_hash1[threads*(5+32)+thread]; |
|
|
|
in[11]=0; |
|
|
|
in[11]=0; |
|
|
|
in[12]=0; |
|
|
|
in[12]=0; |
|
|
|
in[13]=0; |
|
|
|
in[13]=0; |
|
|
|
in[14]=0; |
|
|
|
in[14]=0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
in[15]=0x968; |
|
|
|
in[15]=0x968; |
|
|
|
|
|
|
|
|
|
|
|
int it=0; |
|
|
|
int it=0; |
|
|
|
|
|
|
|
|
|
|
|
do { |
|
|
|
do { |
|
|
|
in[15]-=8; |
|
|
|
in[15]-=8; |
|
|
|
it++; |
|
|
|
it++; |
|
|
|
} while (((uint8_t*)in)[44-it]==0); |
|
|
|
} while (((uint8_t*)in)[44-it]==0); |
|
|
|
|
|
|
|
|
|
|
|
((uint8_t*)in)[44-it+1]=0x80; |
|
|
|
((uint8_t*)in)[44-it+1]=0x80; |
|
|
|
|
|
|
|
|
|
|
|
((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]); |
|
|
|
((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]); |
|
|
@ -438,73 +404,62 @@ uint64_t h8[38]; |
|
|
|
uint32_t nounce = startNounce +thread; |
|
|
|
uint32_t nounce = startNounce +thread; |
|
|
|
bool rc = false; |
|
|
|
bool rc = false; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 4 |
|
|
|
#pragma unroll 4 |
|
|
|
for (int i = 0; i < 4; i++) |
|
|
|
for (int i = 0; i < 4; i++) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) { |
|
|
|
if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) { |
|
|
|
if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i]) {rc = true;} else {rc = false;} |
|
|
|
if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i]) |
|
|
|
|
|
|
|
rc = true; |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
rc = false; |
|
|
|
//if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;} |
|
|
|
//if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (rc && resNounce[0] > nounce) |
|
|
|
if(rc == true) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
if(resNounce[0] > nounce) |
|
|
|
|
|
|
|
resNounce[0] = nounce; |
|
|
|
resNounce[0] = nounce; |
|
|
|
|
|
|
|
} // thread |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//// |
|
|
|
|
|
|
|
} // threads |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__host__
void m7_sha256_cpu_init(int thr_id, int threads)
{
	// Copy the SHA-256 initial state and round constants into the
	// device-side __constant__ symbols used by the kernels.
	cudaMemcpyToSymbol(H256, cpu_H256, sizeof(cpu_H256), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);

	// One device word per GPU thread slot for the winning nonce, plus a
	// pinned host mirror so the result copy-back is cheap.
	// NOTE(review): return codes of these CUDA calls are ignored here,
	// matching the surrounding file's style — an allocation failure will
	// only surface at the first kernel launch.
	cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t));
	cudaMallocHost(&d_mnounce[thr_id], 1 * sizeof(uint32_t));
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__host__
uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector, uint64_t *d_hash, int order)
{
	// Launch the 300-byte SHA-256 search kernel and return the smallest
	// matching nonce found (0xffffffff when no thread found one).
	const int threadsperblock = 384;
	uint32_t result = 0xffffffff;

	// Reset the device-side result word to "no nonce found".
	cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));

	dim3 grid((threads + threadsperblock - 1) / threadsperblock);	// ceil-div
	dim3 block(threadsperblock);
	size_t shared_size = 0;	// kernel uses no dynamic shared memory

	m7_sha256_gpu_hash_300<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]);

	// Synchronous copy into the pinned host word, then the project's
	// stream-synchronize hook before reading it back.
	// NOTE(review): no cudaGetLastError() after the launch — a bad launch
	// config would go unnoticed here; confirm against project policy.
	cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
	MyStreamSynchronize(NULL, order, thr_id);

	result = *d_mnounce[thr_id];
	return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__host__ |
|
|
|
__host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) |
|
|
|
void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
const int threadsperblock = 512; |
|
|
|
|
|
|
|
|
|
|
|
const int threadsperblock = 512; // Alignment mit mixtob Grösse. NICHT ÄNDERN |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// berechne wie viele Thread Blocks wir brauchen |
|
|
|
|
|
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
|
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
|
|
|
dim3 block(threadsperblock); |
|
|
|
dim3 block(threadsperblock); |
|
|
|
// dim3 grid(1); |
|
|
|
|
|
|
|
// dim3 block(1); |
|
|
|
|
|
|
|
size_t shared_size = 0; |
|
|
|
size_t shared_size = 0; |
|
|
|
|
|
|
|
|
|
|
|
m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); |
|
|
|
m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); |
|
|
@ -512,7 +467,8 @@ __host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNoun |
|
|
|
MyStreamSynchronize(NULL, order, thr_id); |
|
|
|
MyStreamSynchronize(NULL, order, thr_id); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful |
|
|
|
__host__ |
|
|
|
|
|
|
|
void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful |
|
|
|
{ |
|
|
|
{ |
|
|
|
unsigned char PaddedMessage[128]; |
|
|
|
unsigned char PaddedMessage[128]; |
|
|
|
uint8_t ending =0x80; |
|
|
|
uint8_t ending =0x80; |
|
|
|