
m7: forgot to indent some cuda files

and remove unused variables...
Tanguy Pruvot, 10 years ago · commit fcd381cda2
  1. m7/cuda_m7_sha256.cu (174 changed lines)
  2. m7/cuda_mul2.cu (83 changed lines)
  3. m7/cuda_ripemd160.cu (74 changed lines)
  4. m7/cuda_tiger192.cu (85 changed lines)
  5. m7/m7.cu (6 changed lines)
  6. m7/m7_keccak512.cu (40 changed lines)

m7/cuda_m7_sha256.cu (174 changed lines)

@@ -1,10 +1,10 @@
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
#include "sph/sph_types.h"
#include "cuda_helper.h"
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
//#define SPH_C64(x) ((uint64_t)(x ## ULL))
@@ -67,28 +67,35 @@ static const uint32_t cpu_K[64] = {
};
static __device__ __forceinline__ uint32_t bsg2_0(uint32_t x)
__device__ __forceinline__
static uint32_t bsg2_0(uint32_t x)
{
uint32_t r1 = SPH_ROTR32(x,2);
uint32_t r2 = SPH_ROTR32(x,13);
uint32_t r3 = SPH_ROTR32(x,22);
return xor3b(r1,r2,r3);
}
static __device__ __forceinline__ uint32_t bsg2_1(uint32_t x)
__device__ __forceinline__
static uint32_t bsg2_1(uint32_t x)
{
uint32_t r1 = SPH_ROTR32(x,6);
uint32_t r2 = SPH_ROTR32(x,11);
uint32_t r3 = SPH_ROTR32(x,25);
return xor3b(r1,r2,r3);
}
static __device__ __forceinline__ uint32_t ssg2_0(uint32_t x)
__device__ __forceinline__
static uint32_t ssg2_0(uint32_t x)
{
uint64_t r1 = SPH_ROTR32(x,7);
uint64_t r2 = SPH_ROTR32(x,18);
uint64_t r3 = shr_t32(x,3);
return xor3b(r1,r2,r3);
}
static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x)
__device__ __forceinline__
static uint32_t ssg2_1(uint32_t x)
{
uint64_t r1 = SPH_ROTR32(x,17);
uint64_t r2 = SPH_ROTR32(x,19);
@@ -96,7 +103,8 @@ static __device__ __forceinline__ uint32_t ssg2_1(uint32_t x)
return xor3b(r1,r2,r3);
}
static __device__ __forceinline__ void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
__device__ __forceinline__
static void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t in,const uint32_t Kshared)
{
uint32_t t1,t2;
@@ -111,12 +119,10 @@ d = d + t1;
h = t1 + t2;
}
static __forceinline__ void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
__host__ __forceinline__
static void sha2_step1_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t in,const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
@@ -129,7 +135,8 @@ d = d + t1;
h = t1 + t2;
}
static __device__ __forceinline__ void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
__device__ __forceinline__
static void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t* in,uint32_t pc,const uint32_t Kshared)
{
uint32_t t1,t2;
@@ -156,10 +163,10 @@ t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
static __forceinline__ void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
__host__ __forceinline__
static void sha2_step2_host(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
uint32_t* in,uint32_t pc,const uint32_t Kshared)
{
uint32_t t1,t2;
@@ -172,7 +179,6 @@ uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
@@ -186,14 +192,12 @@ t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
__device__ __forceinline__
static void sha2_round_body(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
{
uint32_t a=r[0];
uint32_t b=r[1];
uint32_t c=r[2];
@@ -221,8 +225,8 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
#pragma unroll 3
for (int i=0;i<3;i++) {
for (int i=0;i<3;i++)
{
sha2_step2(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
@@ -239,11 +243,8 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
r[0] = r[0] + a;
r[1] = r[1] + b;
r[2] = r[2] + c;
@@ -254,10 +255,9 @@ static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r
r[7] = r[7] + h;
}
static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
__forceinline__
static void sha2_round_body_host(uint32_t* in, uint32_t* r,const uint32_t* Kshared)
{
uint32_t a=r[0];
uint32_t b=r[1];
uint32_t c=r[2];
@@ -285,8 +285,8 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
sha2_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
#pragma unroll 3
for (int i=0;i<3;i++) {
for (int i=0;i<3;i++)
{
sha2_step2_host(a,b,c,d,e,f,g,h,in,0,Kshared[16+16*i]);
sha2_step2_host(h,a,b,c,d,e,f,g,in,1,Kshared[17+16*i]);
sha2_step2_host(g,h,a,b,c,d,e,f,in,2,Kshared[18+16*i]);
@@ -303,7 +303,6 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
sha2_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
r[0] = r[0] + a;
@@ -316,27 +315,12 @@ static __forceinline__ void sha2_round_body_host(uint32_t* in, uint32_t* r,const
r[7] = r[7] + h;
}
__global__ void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
__global__
void m7_sha256_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
/*
__shared__ uint32_t Kshared[64];
if (threadIdx.x < 64) {
Kshared[threadIdx.x]=K[threadIdx.x];
}
__syncthreads();
*/
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
//uint32_t buf[8];
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread ; // original implementation
uint32_t buf[8];
@@ -344,56 +328,36 @@ uint64_t h8[8];
uint32_t in3[16]={0};
#pragma unroll 13
for (int i=0;i<13;i++) {in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);}
for (int i=0; i<13; i++)
in2[i]= cuda_swab32(c_PaddedMessage80[i+16]);
in2[13]=cuda_swab32(nounce);
in2[14]=cuda_swab32(c_PaddedMessage80[30]);
in3[15]=0x3d0;
#pragma unroll 8
for (int i=0;i<8;i++) {buf[i]= pbuf[i];}
for (int i=0; i<8; i++)
buf[i] = pbuf[i];
sha2_round_body(in2,buf,K);
sha2_round_body(in3,buf,K);
//#pragma unroll 8
//for (int i=0;i<8;i++) {hash.h4[i]=cuda_swab32(buf[i]);}
#pragma unroll 4
for (int i=0;i<4;i++) {outputHash[i*threads+thread]=cuda_swab32ll(((uint64_t*)buf)[i]);}
//////////////////////////////////////////////////////////////////////////////////////////////////
} // threads
for (int i=0; i<4; i++) {
outputHash[i*threads+thread] = cuda_swab32ll(((uint64_t*)buf)[i]);
}
} // thread
}
__global__ void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce)
__global__
void m7_sha256_gpu_hash_300(int threads, uint32_t startNounce, uint64_t *g_hash1, uint64_t *g_nonceVector, uint32_t *resNounce)
{
/*
__shared__ uint32_t Kshared[64];
if (threadIdx.x < 64) {
Kshared[threadIdx.x]=K[threadIdx.x];
}
__syncthreads();
*/
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
union {
uint8_t h1[304];
uint32_t h4[76];
uint64_t h8[38];
} hash;
uint32_t in[16],buf[8];
#pragma unroll 8
for (int i=0;i<8;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*i+thread]);}
#pragma unroll 8
@@ -415,20 +379,22 @@ uint64_t h8[38];
#pragma unroll 5
for (int i=0;i<5;i++) {((uint64_t*)in)[i]= cuda_swab32ll(g_hash1[threads*(i+32)+thread]);}
((uint64_t*)in)[5] = g_hash1[threads*(5+32)+thread];
in[11]=0;
in[12]=0;
in[13]=0;
in[14]=0;
in[15]=0x968;
int it=0;
do {
in[15]-=8;
it++;
} while (((uint8_t*)in)[44-it]==0);
((uint8_t*)in)[44-it+1]=0x80;
((uint64_t*)in)[5]= cuda_swab32ll(((uint64_t*)in)[5]);
@@ -438,73 +404,62 @@ uint64_t h8[38];
uint32_t nounce = startNounce +thread;
bool rc = false;
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
if (cuda_swab32ll(((uint64_t*)buf)[i]) != ((uint64_t*)pTarget)[i]) {
if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i]) {rc = true;} else {rc = false;}
if (cuda_swab32ll(((uint64_t*)buf)[i]) < ((uint64_t*)pTarget)[i])
rc = true;
else
rc = false;
//if cuda_swab32(((uint64_t*)buf)[3]) < ((uint64_t*)pTarget)[3]) {rc = true;}
}
}
if(rc == true)
{
if(resNounce[0] > nounce)
if (rc && resNounce[0] > nounce)
resNounce[0] = nounce;
}
////
} // threads
} // thread
}
__host__ void m7_sha256_cpu_init(int thr_id, int threads)
__host__
void m7_sha256_cpu_init(int thr_id, int threads)
{
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( H256,cpu_H256,sizeof(cpu_H256),0, cudaMemcpyHostToDevice );
cudaMemcpyToSymbol( K,cpu_K,sizeof(cpu_K),0, cudaMemcpyHostToDevice );
cudaMalloc(&d_MNonce[thr_id], sizeof(uint32_t));
cudaMallocHost(&d_mnounce[thr_id], 1*sizeof(uint32_t));
}
__host__ uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order)
__host__
uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector,uint64_t *d_hash, int order)
{
const int threadsperblock = 384;
uint32_t result = 0xffffffff;
cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 384; // Alignment mit mixtob Grösse. NICHT ÄNDERN
cudaMemset(d_MNonce[thr_id], 0xff, sizeof(uint32_t));
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
m7_sha256_gpu_hash_300<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector, d_MNonce[thr_id]);
cudaMemcpy(d_mnounce[thr_id], d_MNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
MyStreamSynchronize(NULL, order, thr_id);
result = *d_mnounce[thr_id];
return result;
}
__host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
__host__
void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
const int threadsperblock = 512;
const int threadsperblock = 512; // Alignment mit mixtob Grösse. NICHT ÄNDERN
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// dim3 grid(1);
// dim3 block(1);
size_t shared_size = 0;
m7_sha256_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
@@ -512,7 +467,8 @@ __host__ void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNoun
MyStreamSynchronize(NULL, order, thr_id);
}
__host__ void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful
__host__
void m7_sha256_setBlock_120(void *pdata,const void *ptarget) //not useful
{
unsigned char PaddedMessage[128];
uint8_t ending =0x80;
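
A note on the launch pattern used by the host wrappers in this file: m7_sha256_cpu_hash_120/300 pick a fixed threadsperblock, size the grid by ceiling division, and rely on the kernel's if (thread < threads) guard for the last partial block. Below is a minimal, self-contained sketch of that recipe; the kernel and buffer names are placeholders for illustration, not part of this commit.

    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    // Placeholder kernel: each thread writes its global index, guarded the same
    // way as the m7 kernels so a partial last block cannot run past `threads`.
    __global__
    void fill_indices(int threads, uint32_t *out)
    {
        int thread = blockDim.x * blockIdx.x + threadIdx.x;
        if (thread < threads)
            out[thread] = thread;
    }

    int main()
    {
        const int threads = 1000;
        const int threadsperblock = 256;
        uint32_t *d_out;
        cudaMalloc(&d_out, threads * sizeof(uint32_t));

        // Ceiling-division grid sizing, as in the m7 host wrappers.
        dim3 grid((threads + threadsperblock - 1) / threadsperblock);
        dim3 block(threadsperblock);
        fill_indices<<<grid, block, 0>>>(threads, d_out);
        cudaDeviceSynchronize();

        uint32_t last = 0;
        cudaMemcpy(&last, d_out + threads - 1, sizeof(uint32_t), cudaMemcpyDeviceToHost);
        printf("last index = %u\n", last); // expect 999
        cudaFree(d_out);
        return 0;
    }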

m7/cuda_mul2.cu (83 changed lines)

@@ -85,7 +85,8 @@ ulonglong2 umul64wide (unsigned long long int a,
}
__device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n)
__device__ __forceinline__
void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n)
{
asm ("{\n\t"
".reg .u32 o0, o1, o2, o3, o4; \n\t"
@@ -187,7 +188,8 @@ __device__ __forceinline__ void umul_ppmmT4(t4_t *h, t4_t *l, t4_t m, t4_t n){
#endif
__device__ __forceinline__ t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){
__device__ __forceinline__
t4_t T4(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g){
t4_t ret;
ret.high = g[(idx*2 + 1)*threads + thread];
ret.low = g[(idx*2)*threads + thread];
@@ -199,7 +201,8 @@ __device__ __forceinline__ t4_t T4(uint32_t thread, uint32_t threads, uint32_t i
return ret;
}
__device__ __forceinline__ void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){
__device__ __forceinline__
void T4_store(uint32_t thread, uint32_t threads, uint32_t idx, uint64_t *g, t4_t val){
g[(idx*2 + 1)*threads + thread]=val.high;
g[(idx*2)*threads + thread]=val.low;
@@ -209,12 +212,14 @@ __device__ __forceinline__ void T4_store(uint32_t thread, uint32_t threads, uint
}
__device__ __forceinline__ void T4_set(t4_t *d, uint64_t v){
__device__ __forceinline__
void T4_set(t4_t *d, uint64_t v){
d->high = 0;
d->low = v;
}
__device__ __forceinline__ t4_t T4_add(t4_t a, t4_t b){
__device__ __forceinline__
t4_t T4_add(t4_t a, t4_t b){
t4_t ret;
uint32_t c=0;
ret.low = a.low + b.low;
@@ -224,7 +229,8 @@ __device__ __forceinline__ t4_t T4_add(t4_t a, t4_t b){
return ret;
}
__device__ __forceinline__ t4_t T4_add(uint64_t a, t4_t b){
__device__ __forceinline__
t4_t T4_add(uint64_t a, t4_t b){
t4_t ret;
uint32_t c=0;
ret.low = a + b.low;
@@ -234,8 +240,8 @@ __device__ __forceinline__ t4_t T4_add(uint64_t a, t4_t b){
return ret;
}
__device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){
__device__ __forceinline__
uint32_t T4_lt(t4_t a, t4_t b){
if(a.high < b.high)
return 1;
if(a.high == b.high && a.low < b.low)
@@ -243,7 +249,8 @@ __device__ __forceinline__ uint32_t T4_lt(t4_t a, t4_t b){
return 0;
}
__device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){
__device__ __forceinline__
uint32_t T4_gt(t4_t a, uint64_t b){
if(a.high)
return 1;
if(a.low > b)
@@ -252,7 +259,8 @@ __device__ __forceinline__ uint32_t T4_gt(t4_t a, uint64_t b){
}
__device__ void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){
__device__
void mulScalarT4(uint32_t thread, uint32_t threads, uint32_t len, uint64_t* g_p, uint64_t* g_v, t4_t sml, uint32_t *size){
t4_t ul, cl, hpl, lpl;
uint32_t i;
T4_set(&cl,0);
@@ -289,13 +297,14 @@ __device__ void mulScalar(uint32_t thread, uint32_t threads, uint32_t len, uint6
*size = len + (cl != 0);
}
uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a){
uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, uint64_t a)
{
uint64_t carry=0;
uint32_t i;
uint64_t ul,lpl,hpl,rl;
for(i=0; i < xsz; i++){
for(i=0; i < xsz; i++)
{
ul = x[i*threads + thread];
umul_ppmm (hpl, lpl, ul, a);
@@ -311,13 +320,15 @@ uint64_t __device__ addmul_1g (uint32_t thread, uint32_t threads, uint64_t *sum,
return carry;
}
t4_t __device__ addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a){
__device__
t4_t addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, uint32_t sofst, uint64_t *x, uint64_t xsz, t4_t a)
{
t4_t carry;
uint32_t i;
t4_t ul,lpl,hpl,rl;
T4_set(&carry,0);
for(i=0; i < xsz; i++){
for(i=0; i < xsz; i++)
{
ul = T4(thread,threads,i,x);
umul_ppmmT4 (&hpl, &lpl, ul, a);
@@ -333,9 +344,8 @@ t4_t __device__ addmul_1gT4 (uint32_t thread, uint32_t threads, uint64_t *sum, u
return carry;
}
__global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
__global__
void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@@ -354,7 +364,6 @@ __global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g
mulScalar(thread,threads,ulegs,g_p,g_u,g_v[thread],&psize);
#if 1
while (vofst < vlegs) {
//clear high word //TODO: right
// printf("Size: %d\n", rp->size[tid]);
@@ -368,19 +377,17 @@ __global__ void gpu_mul(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g
// if(D_REF(rp->d,up->size[tid] + vp->size[tid] - 1,tid) != (uint64_t)0)
// rp->size[tid]++;
#endif
}
}
__global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
__global__
void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t *g_u, uint64_t *g_v, uint64_t *g_p)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
if(ulegs < vlegs){ ///everything written the other way around... are you kidding me ?!
if(ulegs < vlegs) { // everything written the other way around... are you kidding me ?!
uint64_t t1=ulegs;
ulegs = vlegs;
vlegs = t1;
@@ -396,8 +403,6 @@ __global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t
// cuPrintf("U: %d V: %d\n", ulegs, vlegs);
}
uint32_t vofst=1,rofst=1,psize=0;
mulScalarT4(thread,threads,ulegs,g_p,g_u,T4(thread,threads,0,g_v),&psize);
@@ -405,21 +410,17 @@ __global__ void gpu_mulT4(int threads, uint32_t ulegs, uint32_t vlegs, uint64_t
t4_t zero;
T4_set(&zero,0);
// while (vofst < vlegs) {
#pragma unroll
for (vofst=1;vofst<vlegs;vofst++) {
for (vofst=1;vofst<vlegs;vofst++)
{
T4_store(thread,threads,psize,g_p,zero);
T4_store(thread,threads,ulegs+rofst,g_p, addmul_1gT4(thread, threads, g_p, rofst, g_u, ulegs, T4(thread,threads,vofst,g_v)));
// vofst++;
rofst++;
psize++;
}
#endif
}
}
@@ -434,10 +435,9 @@ inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
}
}
__host__ void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p,int order)
__host__
void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p,int order)
{
const int threadsperblock = 512; // Alignment mit mixtab Grösse. NICHT ÄNDERN
// berechne wie viele Thread Blocks wir brauchen
@@ -445,13 +445,13 @@ __host__ void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, u
dim3 block(threadsperblock);
size_t shared_size = 0;
gpu_mul<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
gpu_mul<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
}
__host__ void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order)
__host__
void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order)
{
const int threadsperblock = 256; // better occupancy (for both 780 and 750 ti's)
// berechne wie viele Thread Blocks wir brauchen
@@ -459,10 +459,13 @@ __host__ void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs,
dim3 block(threadsperblock);
size_t shared_size = 0;
//gpu_mulT4<<<grid, block, shared_size>>>(threads, alegs, blegs, g_a, g_b, g_p) ;
gpu_mulT4<<<grid, block, shared_size>>>(threads, blegs, alegs, g_b, g_a, g_p) ;
}
__host__ void mul_init(){
__host__
void mul_init()
{
}
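
The T4_add overloads above build a 128-bit add from two 64-bit halves, taking the carry out of the low word from the fact that an unsigned sum wrapped exactly when it comes out smaller than one of its operands. Here is a host-side sketch of that carry rule, checked against the unsigned __int128 extension of GCC/Clang; the struct and function names are illustrative, not taken from the file.

    #include <cstdint>
    #include <cstdio>

    struct u128 { uint64_t high, low; };

    // Same carry rule as the device-side T4_add: the low-word sum wrapped
    // iff it came out smaller than one of its operands.
    static u128 add128(u128 a, u128 b)
    {
        u128 r;
        r.low  = a.low + b.low;
        uint64_t carry = (r.low < a.low) ? 1 : 0;
        r.high = a.high + b.high + carry;
        return r;
    }

    int main()
    {
        u128 a = { 1, ~0ULL };   // 2^64 + (2^64 - 1): forces a carry
        u128 b = { 0, 5 };
        u128 r = add128(a, b);

        unsigned __int128 ref = (((unsigned __int128)a.high << 64) | a.low)
                              + (((unsigned __int128)b.high << 64) | b.low);
        int ok = (r.high == (uint64_t)(ref >> 64)) && (r.low == (uint64_t)ref);
        printf("match: %d\n", ok);   // expect 1
        return 0;
    }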

m7/cuda_ripemd160.cu (74 changed lines)

@@ -282,14 +282,12 @@ static const uint32_t IV[5] = {
(h)[0] = tmp; \
}
__global__ void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
__global__
void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
union {
uint8_t h1[64];
@@ -308,51 +306,52 @@ uint64_t h8[8];
#define F3(x, y, z) xornot64(x,y,z)
#define F4(x, y, z) xandx(z,x,y)
#define F5(x, y, z) xornt64(x,y,z)
uint32_t in2[16],in3[16];
uint32_t in[16],buf[5];
// #pragma unroll 16
// for (int i=0;i<16;i++) {in[i]= c_PaddedMessage80[i];}
#pragma unroll 16
for (int i=0;i<16;i++) {if ((i+16)<29) {in2[i]= c_PaddedMessage80[i+16];}
else if ((i+16)==29) {in2[i]= nounce;}
else if ((i+16)==30) {in2[i]= c_PaddedMessage80[i+16];}
else {in2[i]= 0;}}
for (int i=0;i<16;i++) {
if ((i+16) < 29)
in2[i] = c_PaddedMessage80[i+16];
else if ((i+16)==29)
in2[i] = nounce;
else if ((i+16)==30)
in2[i] = c_PaddedMessage80[i+16];
else
in2[i] = 0;
}
#pragma unroll 16
for (int i=0;i<16;i++) {in3[i]=0;}
for (int i=0;i<16;i++)
in3[i]=0;
in3[14]=0x3d0;
// #pragma unroll 5
// for (int i=0;i<5;i++) {buf[i]=gpu_IV[i];}
#pragma unroll 5
for (int i=0;i<5;i++) {buf[i]=bufo[i];}
// RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved)
for (int i=0;i<5;i++)
buf[i]=bufo[i];
RIPEMD160_ROUND_BODY(in2, buf);
RIPEMD160_ROUND_BODY(in3, buf);
hash.h4[5]=0;
#pragma unroll 5
for (int i=0; i<5; i++)
{hash.h4[i]=buf[i];
}
//uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
//#pragma unroll 3
//for (int i=0;i<3;i++) {outHash[i]=hash.h8[i];}
hash.h4[i]=buf[i];
#pragma unroll 3
for (int i=0;i<3;i++) {outputHash[i*threads+thread]=hash.h8[i];}
//#pragma unroll 8
//for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=hash.h8[i];} else {outputHash[i*threads+thread]=0;}}
for (int i=0;i<3;i++) {
outputHash[i*threads+thread] = hash.h8[i];
}
}
}
void ripemd160_cpu_init(int thr_id, int threads)
{
cudaMemcpyToSymbol(gpu_IV,IV,sizeof(IV),0, cudaMemcpyHostToDevice);
}
__host__ void ripemd160_setBlock_120(void *pdata)
__host__
void ripemd160_setBlock_120(void *pdata)
{
unsigned char PaddedMessage[128];
uint8_t ending =0x80;
@@ -371,29 +370,28 @@ __host__ void ripemd160_setBlock_120(void *pdata)
#define F3(x, y, z) (((x) | ~(y)) ^ (z))
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
uint32_t* alt_data =(uint32_t*)pdata;
uint32_t in[16],buf[5];
for (int i=0;i<16;i++)
in[i]= alt_data[i];
for (int i=0;i<16;i++) {in[i]= alt_data[i];}
for (int i=0;i<5;i++) {buf[i]=IV[i];}
for (int i=0;i<5;i++)
buf[i]=IV[i];
RIPEMD160_ROUND_BODY(in, buf); //no need to calculate it several time (need to moved)
cudaMemcpyToSymbol(bufo, buf, 5*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
}
__host__ void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
__host__
void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
const int threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
const int threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
//dim3 grid(1);
//dim3 block(1);
size_t shared_size =0;
m7_ripemd160_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
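
The 0x3d0 written into in3[14] here (and into in3[15] in the SHA-256 kernel, in3[7] in the Tiger kernel) is the bit length that Merkle-Damgård padding appends: the setBlock functions copy 122 bytes of input (memcpy(PaddedMessage, pdata, 122)), and 122 * 8 = 976 = 0x3d0. RIPEMD-160 is little-endian, so the low word of the 64-bit length lands in word 14; SHA-256 is big-endian, so after the cuda_swab32 swaps it lands in word 15. A one-line host check, just to make the arithmetic explicit:

    #include <cstdio>

    int main()
    {
        const unsigned msg_bytes = 122;   // memcpy(PaddedMessage, pdata, 122) in the setBlock functions
        printf("0x%x\n", msg_bytes * 8);  // 976 bits -> prints 0x3d0
        return 0;
    }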

m7/cuda_tiger192.cu (85 changed lines)

@@ -50,11 +50,13 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
__constant__ uint64_t bufo[3];
static __constant__ uint64_t gpu_III[3];
static __constant__ uint64_t T1[256];
static __constant__ uint64_t T2[256];
static __constant__ uint64_t T3[256];
static __constant__ uint64_t T4[256];
static const uint64_t III[3] = {
SPH_C64(0x0123456789ABCDEF),SPH_C64(0xFEDCBA9876543210),SPH_C64(0xF096A5B4C3B2E187)
};
@@ -649,29 +651,24 @@ static const uint64_t cpu_T4[256] = {
(r)[2] = SPH_T64(C + (r)[2]); \
}
__global__ void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
__global__
void m7_tiger192_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
__shared__ uint64_t sharedMem[1024];
if(threadIdx.x < 256)
{
if(threadIdx.x < 256) {
sharedMem[threadIdx.x] = T1[threadIdx.x];
sharedMem[threadIdx.x+256] = T2[threadIdx.x];
sharedMem[threadIdx.x+512] = T3[threadIdx.x];
sharedMem[threadIdx.x+768] = T4[threadIdx.x];
}
__syncthreads();
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
union {
uint8_t h1[64];
uint32_t h4[16];
uint64_t h8[8];
} hash;
/*
#undef MUL5
#undef MUL7
@@ -691,8 +688,6 @@ uint64_t h8[8];
ROUND(b, c, a, X7, mul); \
}
#define ROUND(a, b, c, x, mul) { \
c ^= x; \
a = SPH_T64(a - (sharedMem[c & 0xFF] ^ sharedMem[((c >> 16) & 0xFF)+256] \
@@ -702,64 +697,72 @@ uint64_t h8[8];
b = mul(b); \
}
uint64_t in[8],buf[3];
uint64_t in2[8],in3[8];
uint64_t in2[8];
#pragma unroll 8
for (int i=0;i<8;i++) {in2[i]= c_PaddedMessage80[i+8];}
for (int i=0; i<8; i++)
in2[i] = c_PaddedMessage80[i+8];
uint32_t* Mess = (uint32_t*)in2;
Mess[13] = nounce;
uint64_t in3[8];
#pragma unroll 8
for (int i=0;i<8;i++) {in3[i]=0;}
for (int i=0; i<8; i++)
in3[i]=0;
in3[7]=0x3d0;
#pragma unroll 3
for (int i=0;i<3;i++) {buf[i]=bufo[i];}
uint64_t buf[3];
#pragma unroll 3
for (int i=0; i<3; i++)
buf[i]=bufo[i];
TIGER_ROUND_BODY(in2, buf);
TIGER_ROUND_BODY(in3, buf);
#pragma unroll 8
for (int i=0;i<8;i++) { if (i<3) {outputHash[i*threads+thread]=buf[i];} else {outputHash[i*threads+thread]=0;}}
} //// threads
for (int i=0;i<8;i++) {
if (i<3) {
outputHash[i*threads+thread] = buf[i];
} else {
outputHash[i*threads+thread] = 0;
}
}
} // thread
}
__host__
void tiger192_cpu_init(int thr_id, int threads)
{
cudaMemcpyToSymbol(gpu_III,III,sizeof(III),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T1,cpu_T1,sizeof(cpu_T1),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T2,cpu_T2,sizeof(cpu_T2),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T3,cpu_T3,sizeof(cpu_T3),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(T4,cpu_T4,sizeof(cpu_T4),0, cudaMemcpyHostToDevice);
}
__host__ void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
__host__
void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
const int threadsperblock = 640; // Alignment mit mixtab Grösse. NICHT ÄNDERN
// const int threadsperblock = 256;
const int threadsperblock = 640; // 256
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
//dim3 grid(1);
//dim3 block(1);
size_t shared_size = 0;
m7_tiger192_gpu_hash_120<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
MyStreamSynchronize(NULL, order, thr_id);
}
__host__ void tiger192_setBlock_120(void *pdata)
__host__
void tiger192_setBlock_120(void *pdata)
{
unsigned char PaddedMessage[128];
uint8_t ending =0x01;
memcpy(PaddedMessage, pdata, 122);
memset(PaddedMessage+122,ending,1);
memset(PaddedMessage+123, 0, 5); //useless
@@ -782,14 +785,16 @@ __host__ void tiger192_setBlock_120(void *pdata)
b = mul(b); \
}
uint64_t* alt_data = (uint64_t*) pdata;
uint64_t in[8],buf[3];
for (int i=0;i<8;i++) {in[i]= alt_data[i];}
for (int i=0;i<3;i++) {buf[i]=III[i];}
TIGER_ROUND_BODY(in, buf)
cudaMemcpyToSymbol( bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
for (int i=0;i<8;i++)
in[i] = alt_data[i];
for (int i=0;i<3;i++)
buf[i] = III[i];
TIGER_ROUND_BODY(in, buf)
cudaMemcpyToSymbol(bufo, buf, 3*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
}
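
m7_tiger192_gpu_hash_120 stages its four 256-entry S-box tables from constant memory into a single 1024-word shared array before any thread uses them, which is why the copy is guarded by threadIdx.x < 256 and followed by __syncthreads(). A stripped-down sketch of that staging pattern follows; the table contents and the kernel body are dummies, only the copy/barrier structure mirrors the code above.

    #include <cstdint>
    #include <cuda_runtime.h>

    __constant__ uint64_t cT1[256], cT2[256], cT3[256], cT4[256];

    __global__
    void use_tables(uint64_t *out)
    {
        __shared__ uint64_t sharedMem[1024];

        // Only the first 256 threads of the block fill the shared copy;
        // every thread waits at the barrier before reading it.
        if (threadIdx.x < 256) {
            sharedMem[threadIdx.x      ] = cT1[threadIdx.x];
            sharedMem[threadIdx.x + 256] = cT2[threadIdx.x];
            sharedMem[threadIdx.x + 512] = cT3[threadIdx.x];
            sharedMem[threadIdx.x + 768] = cT4[threadIdx.x];
        }
        __syncthreads();

        int t = blockDim.x * blockIdx.x + threadIdx.x;
        out[t] = sharedMem[t & 1023];   // dummy use of the staged tables
    }

    int main()
    {
        uint64_t host[256];
        for (int i = 0; i < 256; i++) host[i] = i;
        cudaMemcpyToSymbol(cT1, host, sizeof(host));
        cudaMemcpyToSymbol(cT2, host, sizeof(host));
        cudaMemcpyToSymbol(cT3, host, sizeof(host));
        cudaMemcpyToSymbol(cT4, host, sizeof(host));

        uint64_t *d_out;
        cudaMalloc(&d_out, 640 * sizeof(uint64_t));
        use_tables<<<1, 640>>>(d_out);  // 640 threads per block, as in the host wrapper
        cudaDeviceSynchronize();
        cudaFree(d_out);
        return 0;
    }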

m7/m7.cu (6 changed lines)

@@ -22,7 +22,7 @@ extern "C"
extern int device_map[8];
extern bool opt_benchmark;
static uint64_t *d_hash[8];
//static uint64_t *d_hash[8];
static uint64_t *FinalHash[8];
static uint64_t *KeccakH[8];
static uint64_t *WhirlpoolH[8];
@@ -112,11 +112,9 @@ extern "C" void m7_hash(void *state, const void *input,uint32_t TheNonce, int de
{
// sha256(sha256*sha512*keccak512*ripemd160*haval*tiger1*whirlpool)
char data_str[245], hash_str[65], target_str[65];
uint8_t *bdata = 0;
mpz_t bns[7];
mpz_t product;
int rc = 0;
for(int i=0; i < 7; i++) {
mpz_init(bns[i]);
@@ -292,7 +290,6 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata,
tiger192_setBlock_120((void*)pdata);
cuda_check_cpu_setTarget(ptarget);
uint32_t TheNonce = pdata[29];
do {
int order = 0;
@@ -328,7 +325,6 @@ extern "C" int scanhash_m7(int thr_id, uint32_t *pdata,
MyStreamSynchronize(0,order++,thr_id);
m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order);
MyStreamSynchronize(0,order++,thr_id);
foundNonce = m7_sha256_cpu_hash_300(thr_id, throughput, pdata[29], NULL, d_prod1[thr_id], order);

m7/m7_keccak512.cu (40 changed lines)

@@ -5,6 +5,8 @@
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
static __constant__ uint64_t stateo[25];
static __constant__ uint64_t RC[24];
static const uint64_t cpu_RC[24] = {
@@ -22,7 +24,9 @@ static const uint64_t cpu_RC[24] = {
0x0000000080000001ull, 0x8000000080008008ull
};
static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) {
__device__ __forceinline__
static void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint64_t t[5], u[5], v, w;
@@ -136,8 +140,9 @@ static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint64_t
}
}
static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants) {
__host__ __forceinline__
static void keccak_block_host(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint64_t t[5], u[5], v, w;
@@ -204,19 +209,12 @@ static __forceinline__ void keccak_block_host(uint64_t *s, const uint64_t *kecca
}
}
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
__global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
__global__ /* __launch_bounds__(256, 2) */
void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
uint64_t state[25];
@@ -237,38 +235,36 @@ __global__ void m7_keccak512_gpu_hash_120(int threads, uint32_t startNounce, uin
keccak_block(state,RC);
#pragma unroll 8
for (int i=0;i<8;i++) {outputHash[i*threads+thread]=state[i];}
for (int i=0;i<8;i++) {
outputHash[i*threads+thread] = state[i];
}
} //thread
}
void m7_keccak512_cpu_init(int thr_id, int threads)
{
cudaMemcpyToSymbol( RC,cpu_RC,sizeof(cpu_RC),0,cudaMemcpyHostToDevice);
}
__host__ void m7_keccak512_setBlock_120(void *pdata)
{
unsigned char PaddedMessage[128];
uint8_t ending =0x01;
memcpy(PaddedMessage, pdata, 122);
memset(PaddedMessage+122,ending,1);
memset(PaddedMessage+123, 0, 5);
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
uint64_t* alt_data = (uint64_t*) pdata;
uint64_t state[25];
for(int i=0;i<25;i++) {state[i]=0;}
for (int i=0;i<9;i++) {state[i] ^= alt_data[i];}
for(int i=0;i<9;i++)
state[i] = alt_data[i];
for(int i=10;i<25;i++)
state[i] = 0;
keccak_block_host(state,cpu_RC);
cudaMemcpyToSymbol(stateo, state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
}
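
m7_keccak512_setBlock_120 uses the midstate trick seen throughout this miner: the nonce-free first block is absorbed once on the CPU (keccak_block_host) and the resulting 25-word state is copied to the stateo constant, so each GPU thread only runs the permutation over the block that actually contains its nonce. Below is a schematic sketch of that host-precompute / device-resume split, with a placeholder permutation standing in for Keccak-f; the real round function and message layout are not reproduced here.

    #include <cstdint>
    #include <cuda_runtime.h>

    __constant__ uint64_t d_midstate[25];          // analogous to `stateo`

    // Placeholder permutation standing in for keccak_block / keccak_block_host.
    __host__ __device__ static void toy_permute(uint64_t *s)
    {
        for (int i = 0; i < 25; i++)
            s[i] = s[i] * 0x9E3779B97F4A7C15ull + i;
    }

    __global__
    void hash_from_midstate(int threads, uint32_t startNounce, uint64_t *out)
    {
        int thread = blockDim.x * blockIdx.x + threadIdx.x;
        if (thread < threads) {
            uint64_t state[25];
            // Resume from the host-precomputed midstate instead of
            // re-absorbing the constant part of the message per thread.
            for (int i = 0; i < 25; i++)
                state[i] = d_midstate[i];
            state[1] ^= (uint64_t)(startNounce + thread);   // inject the per-thread nonce
            toy_permute(state);
            out[thread] = state[0];
        }
    }

    void set_block(const uint64_t *pdata)
    {
        uint64_t state[25] = {0};
        for (int i = 0; i < 9; i++)                 // absorb the nonce-free words once, on the host
            state[i] ^= pdata[i];
        toy_permute(state);
        cudaMemcpyToSymbol(d_midstate, state, sizeof(state));
    }

    int main()
    {
        uint64_t pdata[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };   // dummy header words
        set_block(pdata);

        uint64_t *d_out;
        const int threads = 1024;
        cudaMalloc(&d_out, threads * sizeof(uint64_t));
        hash_from_midstate<<<(threads + 255) / 256, 256>>>(threads, 0x1000, d_out);
        cudaDeviceSynchronize();
        cudaFree(d_out);
        return 0;
    }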
