blake: final cleanup (225MH/s)

This commit is contained in:
Tanguy Pruvot 2014-09-11 18:15:33 +02:00
parent 347d4e4928
commit 8925a7551f

View File

@ -50,16 +50,19 @@ static uint32_t __align__(32) c_Target[8];
__constant__
static uint32_t __align__(32) c_data[20];
/* 8 adapters max (-t threads) */
static uint32_t *d_resNounce[8];
static uint32_t *h_resNounce[8];
static uint32_t extra_results[2] = { MAXU, MAXU };
/* max count of found nounces in one call */
#define NBN 2
static uint32_t extra_results[NBN-1] = { MAXU };
#define USE_CACHE 1
/* midstate hash cache, this algo is run on 2 parts */
#if USE_CACHE
__device__
static uint32_t cache[8];
__device__
static uint32_t prevsum = 0;
__device__ static uint32_t cache[8];
__device__ static uint32_t prevsum = 0;
#endif
/* prefer uint32_t to prevent size conversions = speed +5/10 % */
@ -104,30 +107,6 @@ static const uint32_t __align__(32) c_u256[16] = {
SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
};
#if 0
#define GS(m0, m1, c0, c1, a, b, c, d) do { \
a = SPH_T32(a + b + (m0 ^ c1)); \
d = SPH_ROTR32(d ^ a, 16); \
c = SPH_T32(c + d); \
b = SPH_ROTR32(b ^ c, 12); \
a = SPH_T32(a + b + (m1 ^ c0)); \
d = SPH_ROTR32(d ^ a, 8); \
c = SPH_T32(c + d); \
b = SPH_ROTR32(b ^ c, 7); \
} while (0)
#define ROUND_S(r) do { \
GS(Mx(r, 0x0), Mx(r, 0x1), CSx(r, 0x0), CSx(r, 0x1), v[0], v[4], v[0x8], v[0xC]); \
GS(Mx(r, 0x2), Mx(r, 0x3), CSx(r, 0x2), CSx(r, 0x3), v[1], v[5], v[0x9], v[0xD]); \
GS(Mx(r, 0x4), Mx(r, 0x5), CSx(r, 0x4), CSx(r, 0x5), v[2], v[6], v[0xA], v[0xE]); \
GS(Mx(r, 0x6), Mx(r, 0x7), CSx(r, 0x6), CSx(r, 0x7), v[3], v[7], v[0xB], v[0xF]); \
GS(Mx(r, 0x8), Mx(r, 0x9), CSx(r, 0x8), CSx(r, 0x9), v[0], v[5], v[0xA], v[0xF]); \
GS(Mx(r, 0xA), Mx(r, 0xB), CSx(r, 0xA), CSx(r, 0xB), v[1], v[6], v[0xB], v[0xC]); \
GS(Mx(r, 0xC), Mx(r, 0xD), CSx(r, 0xC), CSx(r, 0xD), v[2], v[7], v[0x8], v[0xD]); \
GS(Mx(r, 0xE), Mx(r, 0xF), CSx(r, 0xE), CSx(r, 0xF), v[3], v[4], v[0x9], v[0xE]); \
} while (0)
#endif
#define GS(a,b,c,d,x) { \
const uint32_t idx1 = c_sigma[i][x]; \
const uint32_t idx2 = c_sigma[i][x+1]; \
@ -240,22 +219,21 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
ending[3] = nounce; /* our tested value */
blake256_compress(h, ending, 640, blakerounds);
int pos = -1;
#pragma unroll 8
#if 0
for (int i = 7; i >= 0; i--) {
uint32_t hash = cuda_swab32(h[i]);
if (hash > c_Target[i]) {
return;
}
if (hash < c_Target[i]) {
/* dont ask me why, we lose 8MH/s in perfs
without the int variable */
if (pos < i) pos = i;
//break;
break;
}
}
#else
/* do not test all parts, fulltest() will do it */
if (cuda_swab32(h[7]) <= c_Target[7])
#endif
#if NBN == 2
/* keep the smallest nounce, + extra one if found */
if (resNounce[0] > nounce) {
resNounce[1] = resNounce[0];
@ -263,6 +241,9 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
}
else
resNounce[1] = nounce;
#else
resNounce[0] = nounce;
#endif
}
}
@ -277,15 +258,16 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
size_t shared_size = 0;
/* Check error on Ctrl+C or kill to prevent segfaults on exit */
if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
if (cudaMemset(d_resNounce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
return result;
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id], blakerounds, crcsum);
cudaDeviceSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
cudaThreadSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize(); /* seems no more required */
result = h_resNounce[thr_id][0];
extra_results[0] = h_resNounce[thr_id][1];
for (int n=0; n < (NBN-1); n++)
extra_results[n] = h_resNounce[thr_id][n+1];
}
return result;
}
@ -309,6 +291,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
uint32_t crcsum = MAXU;
int rc = 0;
#if NBN > 1
if (extra_results[0] != MAXU) {
// possible extra result found in previous call
if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
@ -319,6 +302,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
goto exit_scan;
}
}
#endif
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00000f;
@ -327,8 +311,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
if (opt_n_threads > 1) {
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
}
CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], NBN * sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], NBN * sizeof(uint32_t)));
init[thr_id] = true;
}
@ -375,14 +359,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
goto exit_scan;
}
else if (vhashcpu[7] > Htarg) {
applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
}
else if (vhashcpu[6] > ptarget[6]) {
applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]);
}
else {
applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
else if (opt_debug) {
applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
}
}
@ -397,15 +375,5 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
exit_scan:
*hashes_done = pdata[19] - first_nonce + 1;
#if 0
/* reset the device to allow multiple instances
* could be made in cpu-miner... check later if required */
if (opt_n_threads == 1) {
CUDA_SAFE_CALL(cudaDeviceReset());
init[thr_id] = false;
}
#endif
// wait proper end of all threads
//cudaDeviceSynchronize();
return rc;
}