Browse Source

x11: adapt some blake256 optimizations to the blake512 one

blake512: for the moment 6.2 ms vs 7.12 ms before (+10%)
2upstream
Tanguy Pruvot 10 years ago
parent
commit
95ac1d0f19
  1. 9
      cuda_nist5.cu
  2. 156
      quark/cuda_quark_blake512.cu
  3. 31
      quark/quarkcoin.cu
  4. 18
      x11/x11.cu
  5. 4
      x15/x14.cu
  6. 31
      x15/x15.cu
  7. 18
      x17/x17.cu

9
cuda_nist5.cu

@ -5,9 +5,11 @@ extern "C"
#include "sph/sph_skein.h" #include "sph/sph_skein.h"
#include "sph/sph_jh.h" #include "sph/sph_jh.h"
#include "sph/sph_keccak.h" #include "sph/sph_keccak.h"
}
#include "miner.h" #include "miner.h"
#include "cuda_helper.h" #include "cuda_helper.h"
}
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -74,9 +76,6 @@ extern "C" void nist5hash(void *state, const void *input)
memcpy(state, hash, 32); memcpy(state, hash, 32);
} }
extern bool opt_benchmark;
extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done) unsigned long *hashes_done)
@ -84,7 +83,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x00FF;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];

156
quark/cuda_quark_blake512.cu

@ -50,59 +50,60 @@ const uint64_t c_u512[16] =
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
}; };
#define G(a,b,c,d,e) \ #define G(a,b,c,d,x) { \
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ uint32_t idx1 = sigma[i][x]; \
v[d] = ROTR( v[d] ^ v[a],32); \ uint32_t idx2 = sigma[i][x+1]; \
v[c] += v[d]; \ v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
v[b] = ROTR( v[b] ^ v[c],25); \ v[d] = ROTR( v[d] ^ v[a], 32); \
v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ v[c] += v[d]; \
v[d] = ROTR( v[d] ^ v[a],16); \ v[b] = ROTR( v[b] ^ v[c], 25); \
v[c] += v[d]; \ v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \
v[b] = ROTR( v[b] ^ v[c],11); v[d] = ROTR( v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = ROTR( v[b] ^ v[c], 11); \
}
__device__ static __device__ static
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits ) void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
{ {
uint64_t v[16], m[16], i; uint64_t v[16], m[16], i;
#pragma unroll 16 #pragma unroll 16
for( i = 0; i < 16; ++i ) { for( i = 0; i < 16; i++) {
m[i] = cuda_swab64(block[i]); m[i] = cuda_swab64(block[i]);
} }
#pragma unroll 8 #pragma unroll 8
for( i = 0; i < 8; ++i ) v[i] = h[i]; for (i = 0; i < 8; i++)
v[i] = h[i];
v[ 8] = u512[0];
v[ 9] = u512[1]; v[ 8] = u512[0];
v[10] = u512[2]; v[ 9] = u512[1];
v[11] = u512[3]; v[10] = u512[2];
v[12] = u512[4]; v[11] = u512[3];
v[13] = u512[5]; v[12] = u512[4] ^ T0;
v[14] = u512[6]; v[13] = u512[5] ^ T0;
v[15] = u512[7]; v[14] = u512[6];
v[15] = u512[7];
v[12] ^= bits;
v[13] ^= bits; //#pragma unroll 16
for( i = 0; i < 16; ++i )
//#pragma unroll 16 {
for( i = 0; i < 16; ++i ) /* column step */
{ G( 0, 4, 8, 12, 0 );
/* column step */ G( 1, 5, 9, 13, 2 );
G( 0, 4, 8, 12, 0 ); G( 2, 6, 10, 14, 4 );
G( 1, 5, 9, 13, 2 ); G( 3, 7, 11, 15, 6 );
G( 2, 6, 10, 14, 4 ); /* diagonal step */
G( 3, 7, 11, 15, 6 ); G( 0, 5, 10, 15, 8 );
/* diagonal step */ G( 1, 6, 11, 12, 10 );
G( 0, 5, 10, 15, 8 ); G( 2, 7, 8, 13, 12 );
G( 1, 6, 11, 12, 10 ); G( 3, 4, 9, 14, 14 );
G( 2, 7, 8, 13, 12 ); }
G( 3, 4, 9, 14, 14 );
} #pragma unroll 16
for( i = 0; i < 16; ++i )
#pragma unroll 16 h[i % 8] ^= v[i];
for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
} }
__device__ __constant__ __device__ __constant__
@ -114,7 +115,8 @@ static const uint64_t d_constMem[8] = {
0x510e527fade682d1ULL, 0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL }; 0x5be0cd19137e2179ULL
};
// Hash-Padding // Hash-Padding
__device__ __constant__ __device__ __constant__
@ -126,7 +128,8 @@ static const uint64_t d_constHashPadding[8] = {
0, 0,
0x0100000000000000ull, 0x0100000000000000ull,
0, 0,
0x0002000000000000ull }; 0x0002000000000000ull
};
__global__ __launch_bounds__(256, 4) __global__ __launch_bounds__(256, 4)
void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
@ -145,48 +148,42 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
if (thread < threads) if (thread < threads)
#endif #endif
{ {
uint8_t i;
// bestimme den aktuellen Zähler
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce; int hashPosition = nounce - startNounce;
uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8 uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
// 128 Byte für die Message // 128 Bytes
uint64_t buf[16]; uint64_t buf[16];
// State vorbereiten // State
uint64_t h[8]; uint64_t h[8];
#pragma unroll 8 #pragma unroll 8
for (i=0;i<8;i++) for (int i=0;i<8;i++)
h[i] = d_constMem[i]; h[i] = d_constMem[i];
// Message für die erste Runde in Register holen // Message for first round
#pragma unroll 8 #pragma unroll 8
for (i=0; i < 8; ++i) for (int i=0; i < 8; ++i)
buf[i] = inpHash[i]; buf[i] = inpHash[i];
#pragma unroll 8 #pragma unroll 8
for (i=0; i < 8; i++) for (int i=0; i < 8; i++)
buf[i+8] = d_constHashPadding[i]; buf[i+8] = d_constHashPadding[i];
// die einzige Hashing-Runde // Ending round
quark_blake512_compress( h, buf, c_sigma, c_u512, 512 ); quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ <= 350
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition]; uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
#pragma unroll 8 #pragma unroll 8
for (i=0; i < 8; ++i) { for (int i=0; i < 8; i++) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) ); outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
} }
#else #else
// in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = &g_hash[8 * hashPosition]; uint64_t *outHash = &g_hash[8 * hashPosition];
#pragma unroll 8 for (int i=0; i < 8; i++) {
for (i=0; i < 8; ++i)
{
outHash[i] = cuda_swab64(h[i]); outHash[i] = cuda_swab64(h[i]);
} }
#endif #endif
@ -198,45 +195,38 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// State vorbereiten
uint64_t h[8]; uint64_t h[8];
// 128 Byte für die Message
uint64_t buf[16]; uint64_t buf[16];
uint8_t i;
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) for(int i=0; i<8; i++)
h[i] = d_constMem[i]; h[i] = d_constMem[i];
// Message für die erste Runde in Register holen // Message für die erste Runde in Register holen
#pragma unroll 16 #pragma unroll 16
for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i]; for (int i=0; i < 16; ++i)
buf[i] = c_PaddedMessage80[i];
// die Nounce durch die thread-spezifische ersetzen // The test Nonce
buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce)); ((uint32_t*)buf)[19] = cuda_swab32(nounce);
// die einzige Hashing-Runde
quark_blake512_compress( h, buf, c_sigma, c_u512, 640 ); quark_blake512_compress( h, buf, c_sigma, c_u512, 640 );
// Hash rauslassen #if __CUDA_ARCH__ <= 350
#if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
#pragma unroll 8 #pragma unroll 8
for (i=0; i < 8; ++i) { for (uint32_t i=0; i < 8; i++) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) ); outHash[2*i] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
} }
#else #else
// in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
#pragma unroll 8 for (uint32_t i=0; i < 8; i++) {
for (i=0; i < 8; ++i) {
outHash[i] = cuda_swab64( h[i] ); outHash[i] = cuda_swab64( h[i] );
} }
#endif #endif
} }
} }

31
quark/quarkcoin.cu

@ -6,12 +6,12 @@ extern "C"
#include "sph/sph_skein.h" #include "sph/sph_skein.h"
#include "sph/sph_jh.h" #include "sph/sph_jh.h"
#include "sph/sph_keccak.h" #include "sph/sph_keccak.h"
}
#include "miner.h" #include "miner.h"
#include "cuda_helper.h" #include "cuda_helper.h"
}
// aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen // Speicher für Input/Output der verketteten Hashfunktionen
@ -70,76 +70,64 @@ extern "C" void quarkhash(void *state, const void *input)
unsigned char hash[64]; unsigned char hash[64];
sph_blake512_init(&ctx_blake); sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, input, 80); sph_blake512 (&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, (void*) hash); sph_blake512_close(&ctx_blake, (void*) hash);
sph_bmw512_init(&ctx_bmw); sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64); sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash); sph_bmw512_close(&ctx_bmw, (void*) hash);
if (hash[0] & 0x8) if (hash[0] & 0x8)
{ {
sph_groestl512_init(&ctx_groestl); sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64); sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash); sph_groestl512_close(&ctx_groestl, (void*) hash);
} }
else else
{ {
sph_skein512_init(&ctx_skein); sph_skein512_init(&ctx_skein);
// ZSKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64); sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash); sph_skein512_close(&ctx_skein, (void*) hash);
} }
sph_groestl512_init(&ctx_groestl); sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64); sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash); sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_jh512_init(&ctx_jh); sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64); sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash); sph_jh512_close(&ctx_jh, (void*) hash);
if (hash[0] & 0x8) if (hash[0] & 0x8)
{ {
sph_blake512_init(&ctx_blake); sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, (const void*) hash, 64); sph_blake512 (&ctx_blake, (const void*) hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash); sph_blake512_close(&ctx_blake, (void*) hash);
} }
else else
{ {
sph_bmw512_init(&ctx_bmw); sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64); sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash); sph_bmw512_close(&ctx_bmw, (void*) hash);
} }
sph_keccak512_init(&ctx_keccak); sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64); sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash); sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_skein512_init(&ctx_skein); sph_skein512_init(&ctx_skein);
// SKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64); sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash); sph_skein512_close(&ctx_skein, (void*) hash);
if (hash[0] & 0x8) if (hash[0] & 0x8)
{ {
sph_keccak512_init(&ctx_keccak); sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64); sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash); sph_keccak512_close(&ctx_keccak, (void*) hash);
} }
else else
{ {
sph_jh512_init(&ctx_jh); sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64); sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash); sph_jh512_close(&ctx_jh, (void*) hash);
} }
@ -147,23 +135,17 @@ extern "C" void quarkhash(void *state, const void *input)
memcpy(state, hash, 32); memcpy(state, hash, 32);
} }
extern bool opt_benchmark;
extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done) unsigned long *hashes_done)
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
const int throughput = 256*4096; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x00FF;
const uint32_t Htarg = ptarget[7];
const int throughput = 256*4096; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id]) if (!init[thr_id])
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
@ -252,11 +234,12 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {
const uint32_t Htarg = ptarget[7];
uint32_t vhash64[8]; uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce); be32enc(&endiandata[19], foundNonce);
quarkhash(vhash64, endiandata); quarkhash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce; pdata[19] = foundNonce;
*hashes_done = (foundNonce - first_nonce + 1)/2; *hashes_done = (foundNonce - first_nonce + 1)/2;

18
x11/x11.cu

@ -21,10 +21,9 @@ extern "C"
#include <memory.h> #include <memory.h>
} }
// aus cpu-miner.c // in cpu-miner.c
extern int device_map[8]; extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8]; static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, int threads);
@ -140,22 +139,17 @@ extern "C" void x11hash(void *output, const void *input)
} }
extern bool opt_benchmark;
extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done) unsigned long *hashes_done)
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
const int throughput = 256*256*8;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x0000ff;
const uint32_t Htarg = ptarget[7];
const int throughput = 256*256*8;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id]) if (!init[thr_id])
{ {
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
@ -186,8 +180,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
cuda_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
uint32_t foundNonce; const uint32_t Htarg = ptarget[7];
int order = 0; int order = 0;
uint32_t foundNonce;
// Hash with CUDA // Hash with CUDA
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -204,7 +200,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
// Scan nach Gewinner Hashes auf der GPU // Scan nach Gewinner Hashes auf der GPU
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {
uint32_t vhash64[8]; uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce); be32enc(&endiandata[19], foundNonce);

4
x15/x14.cu

@ -20,11 +20,11 @@ extern "C" {
#include "sph/sph_hamsi.h" #include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h" #include "sph/sph_fugue.h"
#include "sph/sph_shabal.h" #include "sph/sph_shabal.h"
}
#include "miner.h" #include "miner.h"
#include "cuda_helper.h" #include "cuda_helper.h"
}
// from cpu-miner.c // from cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -167,8 +167,6 @@ extern "C" void x14hash(void *output, const void *input)
} }
extern bool opt_benchmark;
extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done) unsigned long *hashes_done)

31
x15/x15.cu

@ -21,14 +21,11 @@ extern "C" {
#include "sph/sph_fugue.h" #include "sph/sph_fugue.h"
#include "sph/sph_shabal.h" #include "sph/sph_shabal.h"
#include "sph/sph_whirlpool.h" #include "sph/sph_whirlpool.h"
}
#include "miner.h" #include "miner.h"
#include "cuda_helper.h" #include "cuda_helper.h"
}
// to test gpu hash on a null buffer
#define NULLTEST 0
// from cpu-miner.c // from cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -92,8 +89,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// X15 CPU Hash function // X15 CPU Hash function
extern "C" void x15hash(void *output, const void *input) extern "C" void x15hash(void *output, const void *input)
{ {
@ -181,17 +176,6 @@ extern "C" void x15hash(void *output, const void *input)
memcpy(output, hash, 32); memcpy(output, hash, 32);
} }
#if NULLTEST
static void print_hash(unsigned char *hash)
{
for (int i=0; i < 32; i += 4) {
printf("%02x%02x%02x%02x ", hash[i], hash[i+1], hash[i+2], hash[i+3]);
}
}
#endif
extern bool opt_benchmark;
extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done) unsigned long *hashes_done)
@ -203,12 +187,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
uint32_t Htarg = ptarget[7]; uint32_t Htarg = ptarget[7];
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = Htarg = 0x0000ff; ((uint32_t*)ptarget)[7] = Htarg = 0x00FF;
#if NULLTEST
for (int k=0; k < 20; k++)
pdata[k] = 0;
#endif
if (!init[thr_id]) if (!init[thr_id])
{ {
@ -259,12 +238,6 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
#if NULLTEST
uint32_t buf[8]; memset(buf, 0, sizeof buf);
CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaThreadSynchronize());
print_hash((unsigned char*)buf); printf("\n");
#endif
/* Scan with GPU */ /* Scan with GPU */
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

18
x17/x17.cu

@ -26,17 +26,15 @@ extern "C"
#include "sph/sph_sha2.h" #include "sph/sph_sha2.h"
#include "sph/sph_haval.h" #include "sph/sph_haval.h"
}
#include "miner.h" #include "miner.h"
} #include "cuda_helper.h"
static uint32_t *d_hash[8]; static uint32_t *d_hash[8];
// in cpu-miner.c
// cpu-miner.c
extern int device_map[8]; extern int device_map[8];
extern bool opt_benchmark;
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_setBlock_80(void *pdata);
@ -204,20 +202,12 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
unsigned long *hashes_done) unsigned long *hashes_done)
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
const int throughput = 256*256*8; const int throughput = 256*256*8;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
static bool init[8] = {0,0,0,0,0,0,0,0}; static bool init[8] = {0,0,0,0,0,0,0,0};
uint32_t Htarg = ptarget[7]; uint32_t Htarg = ptarget[7];
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = Htarg = 0x0000ff; ((uint32_t*)ptarget)[7] = Htarg = 0x00FF;
if (!init[thr_id]) if (!init[thr_id])
{ {

Loading…
Cancel
Save